使用Aho-Corasick算法检查一组文本S,以查看它们是否至少包含模式集T的一个元素

时间:2019-07-12 17:49:58

标签: c++ design-patterns matching trie aho-corasick

我有一道作业题:通过控制台读入一组字符串模式T(每个模式最大长度为80)和一组文本S(每个文本最大长度为250)。目的是使用Aho-Corasick算法检查哪些文本至少包含一个给定的模式,然后按照输入的顺序输出所有这样的文本。所有使用的字符串(模式和文本)仅包含可打印的ASCII字符,因此字母表大小为95。

用户首先输入一个整数n(最多1000),表示模式的数量,然后输入n个模式,最后输入任意数量的文本字符串。

课程辅导员为我们提供了一个Trie实现,其中包含一些已经写好的方法(例如,在Aho-Corasick DFA的节点之间进行转移的方法,以及计算节点v的失败链接的方法)。我添加了一个construct_slinks()方法:在所有模式都添加到Trie之后,它遍历整个Trie,并借助上述函数计算失败链接。我还添加了一个search_string()函数,该函数使用上述DFA转移方法沿着一个文本(S中的一个元素)逐字符移动,如果在文本中的某个位置匹配到了某个模式,就应该返回true。

到目前为止,一切顺利。我检查过的所有测试用例在我的代码下都能通过,但是我提交代码的自动检查系统仍然显示“WRONG ANSWER”,所以我一定是哪里做错了。问题是我对Aho-Corasick还不是很熟悉,也看不出自己哪里出了问题。如果有人可以查看我的代码,或者甚至能找到一个让它无法返回正确结果的测试用例,我将非常感激。我已尽可能添加了注释。

如果能用伪代码讲解一下整个过程,也会大有帮助!

代码

#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
#include <cstring>
#include <queue>
using namespace std;

// Problem limits, one constant per line.
constexpr int MAXN = 1000;       // maximum number of patterns
constexpr int NMAX = 80 * MAXN;  // maximum number of trie states: 80 characters per pattern, 1000 patterns
constexpr int K = 95;            // alphabet size: the printable ASCII characters

// Number of patterns. main() declares its own local `n`, which shadows this one.
int n;

// Queue of node indices: add_string() pushes every node it creates and
// construct_slinks() drains it.
queue<int> vq;

// One node of the pattern trie. Besides the trie structure it caches the
// automaton data (suffix link and transitions) that get_link() and go()
// compute on demand. Both arrays are indexed by (character - 32); the value -1
// means "no child" in next[] and "not computed yet" in go[] and link.
struct vertex {
  int next[K]; // trie children: index of the child node for each symbol, -1 if absent
  bool leaf;   // true if a pattern ends here; construct_slinks() also sets it when the suffix-link target is a leaf
  int p;       // index of the parent node (-1 for the root)
  char pch;    // symbol (already reduced by 32) on the edge from the parent: t[p].next[pch] is this node
  int link;    // suffix (failure) link, -1 until get_link() has computed it
  int go[K];   // cached automaton transition for each symbol, -1 until go() has computed it
};

// Statically allocated node pool; t[0] is the root node set up by init().
vertex t[NMAX+1]; // room for NMAX pattern nodes plus the root
int sz;           // number of nodes in use, i.e. the index of the next free slot in t

void init() {                               //initialize root node (0)
  t[0].p = t[0].link = -1;
  memset(t[0].next, 255, sizeof t[0].next);
  memset(t[0].go, 255, sizeof t[0].go);
  sz = 1;
}

int get_link(int v);

void add_string(const string &s) {          //Add string to trie
  int v = 0;
  for (int i = 0; i < s.length(); i++) {
    char c = s[i]-32;                               //-32 to skip first 31 unprintable ascii chars -> So space (ASCII 32) is 0 in array
    if (t[v].next[c] == -1) {
      memset(t[sz].next, 255, sizeof t[sz].next);
      memset(t[sz].go, 255, sizeof t[sz].go);
      t[sz].link = -1;
      t[sz].p = v;
      t[sz].pch = c;
      vq.push(sz);                                  //Add child to queue for suffix link construction
      t[v].next[c] = sz++;
    }
    v = t[v].next[c];                               //Pick newly created child as current node
  }
  t[v].leaf = true;                                 //For the last node: Pattern ends here, so leaf = true.
}

int go(int v, char c);

// Returns the suffix (failure) link of node v, computing and caching it in
// t[v].link on first use.
//  - The root and the root's direct children link to the root.
//  - Any other node links to the state reached by taking its incoming edge
//    symbol from its parent's suffix link.
int get_link(int v) {
  int &cached = t[v].link;
  if (cached != -1) {
    return cached;  // already computed
  }
  if (v == 0 || t[v].p == 0) {
    cached = 0;
  } else {
    const int parent_link = get_link(t[v].p);
    // pch is stored reduced by 32, while go() expects the raw character.
    cached = go(parent_link, static_cast<char>(t[v].pch + 32));
  }
  return cached;
}

int go(int v, char c) {                     //Go to next node in DFA
    c-=32;                                  //-32 to skip the first 31 unprintable ASCII letters
    if (t[v].go[c] == -1) {                 //Hasnt been computed yet - Compute it
            if (t[v].next[c] != -1) {       //Direct child with corresponding char -> Next node is the child node
                t[v].go[c] = t[v].next[c];
        }
            else {                          //No direct child with corresponding char -> Follow failure link
                t[v].go[c] = (v == 0) ? 0 : go(get_link(v), c+32);  //c+=32 because we already subtract 32 from it at method start (balancing it out)
        }
    }
    return t[v].go[c];
}

bool search_string(const string &s) {       //Follow along the "go"-links
    int v = 0;
    for (char c : s) {
        v = go(v,c);
        if (t[v].leaf) { //Pattern ends at current node
            return true;
        }
    }
    return false;
}

void construct_slinks() {   //Construct failure (suffix) links with a top-down approach using the queue we stacked
    while (!vq.empty()) {   //Run DFS on the trie, recursively calculate links
        int curr = vq.front();
        vq.pop();
        int link = get_link(curr);
        if (t[get_link(curr)].leaf)
            t[curr].leaf = true; //If failure link of current node points to a leaf node (where a pattern ends), make current node a leaf too
    }
}

int main() {
    int n; cin >> n; cin.ignore();  //Read number of patterns
    vector<string> trueones;        //For output
    init();
    vq.push(0);
    for (int i = 0; i < n; i++){
        string s;
        getline(cin, s);
        add_string(s);              //Build tree out of patterns
    }

    construct_slinks();             //Construct suffix links
    string temp;
    getline(cin, temp);
    while ( temp != "") {           //Read unknown number of strings and run search on them
        if (search_string(temp))
            trueones.push_back(temp);   //if a pattern is in string "temp", push temp to trueones
        getline(cin, temp);
    }

    for (auto x: trueones) {        //Output all strings in trueones
        cout << x << endl;
    }

}

0 个答案:

没有答案