Question

给定一个无限的字符流和一个字符串列表 L，请实现一个函数：在处理字符流的过程中，每当识别出 L 中的某个单词时，就调用一次外部 API。

示例：L = ["ok", "test", "one", "try", "trying"]

stream = a，b，c，o，k，d，e，f，t，r，y，i，n，g .............

读到 'k' 时会调用一次外部 API（匹配 "ok"），读到 'y' 时再调用一次（匹配 "try"），读到 'g' 时又调用一次（匹配 "trying"）。

我的想法：先用列表构建一棵 trie（字典树），然后在读取字符流的同时沿着 trie 的节点前进，从而以线性时间完成处理。但如果只做简单的 trie 搜索，就会出现漏匹配的问题。

假设字典中有单词“abxyz”和“xyw”，而输入是“abxyw”。简单的 trie 搜索会沿着“abxyz”的分支一路匹配到“abxy”，在读到 'w' 时失败；此时“xy”已经被消耗掉，因此无法再识别出“xyw”。

所以搜索应该修改如下：

以上面的用例“abxyw”为例。开始搜索后，我们发现直到 'x' 为止的所有字符都能在 trie 中匹配。读到 'x' 的那一刻，有两件事要做：

1. 检查当前字符是否是 trie 根节点下某个单词的首字符；如果是，则从根节点出发递归地发起一次新的搜索。
2. 沿当前单词继续匹配，直到单词结尾。对于上面给定的输入，这条路径会返回 false，但第 1 点中发起的递归搜索会返回 true。

以下是我修改后的搜索，但我认为它有错误，可以改进。有什么建议吗？

/* Number of child slots per trie node; insert() and found() index a slot
 * with (letter - 'a'). */
#define SIZE 26
/* One node of the dictionary trie. */
struct tri{
    /* Set to 1 by insert() on the node where an inserted word ends. */
    int complete;
    /* Outgoing edges, one slot per letter; NULL where no edge exists. */
    struct tri *child[SIZE];
};

void insert(char *c, struct tri **t)
{
    struct tri *current = *t;
    while(*c != '\0')
    {
        int i;
        int letter = *c - 'a';
        if(current->child[letter] == NULL) {
            current->child[letter] = malloc(sizeof(*current));
            memset(current->child[letter], 0, sizeof(struct tri));
        }
        current = current->child[letter];
        c++;
    }
    current->complete = 1;
}

struct tri *t;
int flag = 0;
int found(char *c, struct tri *tt)
{
    struct tri *current = tt;

    if (current == NULL)
        return 0;
    while(*c != '\0')
    {
        int i;
        int letter = *c - 'a';
        /* if this is the first char then recurse from begining*/
        if (t->child[letter] != NULL)
            flag = found(c+1, t->child[letter]);
        if (flag == 1)
            return 1;
        if(!flag && current->child[letter] == NULL) {
            return 0;
        }
        current = current->child[letter];
        c++;
    }
    return current->complete;
}

/*
 * Demo driver: builds a three-word trie in the global `t`, then prints
 * "found" or "not found" depending on what found() reports for "weather".
 * Returns 1 if the root node cannot be allocated, 0 otherwise.
 */
int main()
{
    /* calloc zero-fills the root: complete == 0 and all children NULL. */
    t = (struct tri *)calloc(1, sizeof(*t));
    if (t == NULL)
        return 1;

    insert("weathez", &t);
    insert("eather", &t);
    insert("weather", &t);

    if (found("weather", t) == 1)
        printf("found\n");
    else
        printf("not found\n");
    return 0;
}

Answer 1

你想做的事情正是 Aho-Corasick 算法所做的。

你可以看看我的 Aho-Corasick 实现。它是为编程竞赛写的，所以可能不太注重可读性，但我认为还算清晰：

// List of pattern ids.
typedef vector<int> VI;

// One node of the pattern trie that the matcher walks.
struct Node {
  // Depth of the node: add() stores the number of pattern characters
  // consumed to reach it (0 for the root).
  int size;
  // fail:   node search() falls back to when no edge matches the next
  //         character; computed by fill_fail_output().
  // output: nearest node along the fail chain at which some pattern ends,
  //         or NULL if there is none; computed by fill_fail_output().
  Node *fail, *output;
  // Ids of the patterns that end exactly at this node.
  VI id;
  // Outgoing edges, keyed by the next character.
  map<char, Node*> next;
};

// (parent, child) pair of trie nodes; fill_fail_output() queues these.
typedef pair<Node*, Node*> P;
// Type of a node's edge map: character -> child node.
typedef map<char, Node*> MCP;

// Root of the pattern trie; allocated by init().
Node* root;

inline void init() {
  root = new Node;
  root->size = 0;
  root->output = root->fail = NULL;
}

// Inserts the tail of pattern `s` (from index `c` onward) below node `p`
// and records pattern id `u` on the node where the pattern ends.
//
//   s: the pattern text
//   u: id to store on the pattern's final node
//   c: index of the first character still to be inserted; also the depth of `p`
//   p: node to descend from; when NULL a new node of depth `c` is created
//
// Returns `p` (or the node created in its place).
Node* add(string& s, int u, int c = 0, Node* p = root) {
  if (p == NULL) {
    p = new Node;
    p->size = c;
    p->fail = NULL;
    p->output = NULL;
  }

  Node* cursor = p;
  for (int depth = c; depth != s.size(); ++depth) {
    // operator[] creates the edge with a NULL child if it is not there yet.
    Node*& slot = cursor->next[s[depth]];
    if (slot == NULL) {
      slot = new Node;
      slot->size = depth + 1;
      slot->fail = NULL;
      slot->output = NULL;
    }
    cursor = slot;
  }
  cursor->id.push_back(u);
  return p;
}

void fill_fail_output() {
  queue<pair<char, P> > Q;
  for (MCP::iterator it=root->next.begin();
       it!=root->next.end();++it)
    Q.push(pair<char, P> (it->first, P(root, it->second)));
  while (not Q.empty()) {
    Node *pare = Q.front().second.first;
    Node *fill = Q.front().second.second;
    char c = Q.front().first; Q.pop();
    while (pare != root && !pare->fail->next.count(c))
      pare=pare->fail;
    if (pare == root) fill->fail = root;
    else fill->fail = pare->fail->next[c];
    if (fill->fail->id.size() != 0) 
      fill->output = fill->fail;
    else fill->output = fill->fail->output;
    for (MCP::iterator it=fill->next.begin();
         it!=fill->next.end();++it)
        Q.push(pair<char,P>(it->first,P(fill,it->second)));
  }
}

void match(int c, VI& id) {
  for (int i = 0; i < id.size(); ++i) {
    cout << "Matching of pattern " << id[i];
    cout << " ended at " << c << endl;
  }
}

void search(string& s) {
  int i = 0, j = 0;
  Node *p = root, *q;
  while (j < s.size()) {
    while (p->next.count(s[j])) {
      p = p->next[s[j++]];
      if (p->id.size() != 0) match(j - 1, p->id);
      q = p->output;
      while (q != NULL) {
        match(j - 1, q->id);
        q = q->output;
      }
    }
    if (p != root) {
      p = p->fail;
      i = j - p->size;
    }
    else i = ++j;
  }
}

// Deletes the subtree rooted at `p` (by default the whole trie).
//
// A NULL `p` is a no-op.  When the node being deleted is the global `root`,
// `root` is reset to NULL so that a later erase() or search() does not
// touch freed memory.
void erase(Node* p = root) {
  if (p == NULL) return;
  for (MCP::iterator it = p->next.begin();
       it != p->next.end(); ++it)
    erase(it->second);
  if (p == root) root = NULL;
  delete p;
}

// Reads a pattern count, that many patterns and then one text word from
// standard input; builds the automaton, prints every match found in the
// text, and frees the trie.  Pattern ids are their 0-based input order.
int main() {
  init();

  int pattern_count;
  cin >> pattern_count;

  int next_id = 0;
  while (next_id < pattern_count) {
    string pattern;
    cin >> pattern;
    add(pattern, next_id);
    ++next_id;
  }

  fill_fail_output();

  string haystack;
  cin >> haystack;
  search(haystack);

  erase(root);
  return 0;
}

在流中找到这个词？

1 个答案: