这是个复杂的问题,求大佬指点

喵喵在开发一个新式的社交网络,现在她想要写一个模块来处理文本并计算其中不同标签的数量。
文本由小写英文字母,数字,符号 # 以及空格四种字符组成。我们把若干个连续的非空格字符定义成一个词,词的左右两边要么是空格要么就是文本的开头或者结尾。若一个词的长度大于 1 ,第一个字符是 # 而其它的字符都不是 #,那我们称这个词是一个标签。

现在喵喵想计算文本中出现的不同标签的数量,并将其打印出来。

输入格式:
在一行中输入一个长度不超过 10^5
的字符串 str 表示要处理的文本

输出格式:
先在一行中输出一个整数 n,代表文本中出现的不同标签的个数

接下来 n 行,每行打印一个标签和这个标签出现的次数
标签可以按任何顺序打印

输入样例:

i have brought #peace #freedom #justice and #security to my new empire

输出样例:

4
#freedom 1
#justice 1
#peace 1
#security 1

注意前面有 #
最后一个没有换行

阅读 2.2k
2 个回答

讲真用 c 写就很烦, hashmap 这部分我简单的糊弄了,如果标签数量大于 capacity 会有问题,你可以替换一个完整的

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct Lexer Lexer_;
Lexer_ lexer;

typedef enum {
  TOKEN_NUMBER_SIGN = 1,
  TOKEN_TAG,
  TOKEN_EOF,
  TOKEN_UNKNOWN
} TokenType;

struct Lexer {
  const char *start;
  const char *current;
};

typedef struct {
  TokenType type;
  const char *start;
  int length;

} Token;

void init(const char *source) {
  lexer.start = source;
  lexer.current = source;
}

bool is_at_end() { return *lexer.current == '\0'; }
bool is_alpha_numeric(char c) {
  return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
}

char peek() { return *lexer.current; }

char get_next_char() {
  lexer.current++;
  return lexer.current[-1];
}

void eat_whitespace() {
  while (1) {
    char c = peek();
    switch (c) {
    case ' ':
    case '\r':
    case '\t':
      get_next_char();
      break;
    default:
      return;
    }
  }
}

Token error_token(const char *message) {
  Token token;
  token.type = TOKEN_NUMBER_SIGN;
  token.start = message;
  token.length = (int)strlen(message);
  return token;
}

Token make_token(TokenType type) {
  Token token;
  token.type = type;
  token.start = lexer.start;
  token.length = (int)(lexer.current - lexer.start);
  return token;
}

Token tag() {
  bool hasNumberSign = false;
  while (is_alpha_numeric(peek()) || peek() == '#') {
    char c = get_next_char();
    if (c == '#')
      hasNumberSign = true;
  }
  return make_token(hasNumberSign ? TOKEN_UNKNOWN : TOKEN_TAG);
}

Token get_token() {
  eat_whitespace();
  lexer.start = lexer.current;
  if (is_at_end())
    return make_token(TOKEN_EOF);
  char c = get_next_char();
  if (is_alpha_numeric(c))
    return tag();
  switch (c) {
  case '#':
    return make_token(TOKEN_NUMBER_SIGN);
  }
  return error_token("Unexpected character.");
}

uint32_t hash_string(const char *key, int length) {
  uint32_t hash = 2166136261u;
  for (int i = 0; i < length; i++) {
    hash ^= (uint8_t)key[i];
    hash *= 16777619;
  }
  return hash;
}

int total = 0;
int capacity = 10;
typedef struct {
  char *key;
  int count;
  uint32_t hash;
} TagEntry;

TagEntry *entries;

TagEntry *find_entry(uint32_t hash) {
  uint32_t index = hash % capacity;
  for (;;) {
    TagEntry *entry = &entries[index];
    if (entry->hash == hash || entry->key == NULL)
      return entry;
    index = (index + 1) % capacity;
  }
}

Token current;
Token previouse;

void get_next_token() {
  previouse = current;
  current = get_token();
}

void parse(const char *source) {
  init(source);
  entries = (TagEntry *)realloc(NULL, capacity * sizeof(TagEntry));
  while (1) {
    get_next_token();
    if (previouse.type == TOKEN_EOF)
      break;
    if (previouse.type == TOKEN_NUMBER_SIGN) {
      get_next_token();
      if (previouse.type == TOKEN_TAG) {
        uint32_t hash = hash_string(previouse.start, previouse.length);
        TagEntry *entry = find_entry(hash);
        if (entry->key == NULL) {
          entry->hash = hash;
          entry->key = (char *)realloc(NULL, previouse.length + 1);
          memcpy(entry->key, previouse.start, previouse.length);
          entry->key[previouse.length] = '\0';
          entry->count = 1;
        } else {
          entry->count++;
        }
        total++;
      }
    }
  }

  printf("%d\n", total);
  for (int i = 0; i < capacity; i++) {
    TagEntry *entry = &entries[i];
    if (entry->count > 0) {
      printf("#%s %d\n", entry->key, entry->count);
    }
  }
}

int main() {
  const char *source = "i #have #brou#ght #peace #peace #freedom #justice and "
                       "#security to my new empire";

  parse(source);
  return 0;
}

c++ 修改版


#include <string.h>

#include <iostream>
#include <map>

typedef enum {
  TOKEN_NUMBER_SIGN = 1,
  TOKEN_TAG,
  TOKEN_EOF,
  TOKEN_UNKNOWN
} TokenType;

typedef struct {
  TokenType type;
  const char *start;
  int length;
} Token;

class Lexer {
 public:
  Lexer(const char *source)
      : source_(source), start_(source), current_(source) {}
  Token get_next_token() {
    eat_whitespace();
    start_ = current_;
    if (is_at_end()) return make_token(TOKEN_EOF);
    char c = get_next_char();
    if (is_alpha_numeric(c)) return tag();
    switch (c) {
      case '#':
        return make_token(TOKEN_NUMBER_SIGN);
    }
    return error_token("Unexpected character.");
  }

 private:
  bool is_at_end() { return *current_ == '\0'; }
  bool is_alpha_numeric(char c) {
    return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
  }
  char peek() { return *current_; }
  char get_next_char() {
    current_++;
    return current_[-1];
  }
  void eat_whitespace() {
    while (1) {
      char c = peek();
      switch (c) {
        case ' ':
        case '\r':
        case '\t':
          get_next_char();
          break;
        default:
          return;
      }
    }
  }

  Token error_token(const char *message) {
    Token token;
    token.type = TOKEN_NUMBER_SIGN;
    token.start = message;
    token.length = (int)strlen(message);
    return token;
  }
  Token make_token(TokenType type) {
    Token token;
    token.type = type;
    token.start = start_;
    token.length = (int)(current_ - start_);
    return token;
  }
  Token tag() {
    bool hasNumberSign = false;
    while (is_alpha_numeric(peek()) || peek() == '#') {
      char c = get_next_char();
      if (c == '#') hasNumberSign = true;
    }
    return make_token(hasNumberSign ? TOKEN_UNKNOWN : TOKEN_TAG);
  }

 private:
  const char *start_;
  const char *current_;
  const char *source_;
};

void parse(const char *source) {
  Lexer lexer(source);
  Token current;
  Token previouse;
  auto get_next_token = [&]() {
    previouse = current;
    current = lexer.get_next_token();
  };
  std::map<std::string, int> tags;
  while (1) {
    get_next_token();
    if (previouse.type == TOKEN_EOF) break;
    if (previouse.type == TOKEN_NUMBER_SIGN) {
      get_next_token();
      if (previouse.type == TOKEN_TAG && previouse.length > 1) {
        std::string key = std::string(previouse.start, previouse.length);
        if (tags.find(key) == tags.end())
          tags[key] = 1;
        else
          tags[key]++;
      }
    }
  }
  std::cout << tags.size() << std::endl;
  for (auto &tag : tags) {
    std::cout << "#" << tag.first << " " << tag.second << std::endl;
  }
}

int main() {
  const char *source =
      "i have brought #peace #freedom #justice and #security to my new empire";
  parse(source);
  return 0;
}

建议使用状态机实现,这题考查的是编译原理里的分词部分

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题