给定无限字符流和字符串列表L,创建一个函数,在处理流时识别L中的单词时调用外部API。
实施例: L = [“ok”,“test”,“one”,“try”,“trying”]
stream = a,b,c,o,k,d,e,f,t,r,y,i,n,g .............
当遇到'k'时,会发生对外部API的调用,遇到'y'时再次调用,再次出现'g'。
我的想法: 在线性时间内从流中读取时,从列表中创建trie并导航节点。但是如果你只是做一个简单的特里search就会有一个错误。
假设你有单词“abxyz”和“xyw”,你的输入是“abxyw”。在这种情况下,你无法用trie识别“xyw”。
所以搜索应该修改如下:
让我们以上用例“abxyw”。我们开始搜索,我们发现我们拥有直到'x'的所有元素。你得到'x'的时刻你有两个选择:
检查当前元素是否等于trie的头部,如果它等于trie的头部,则调用递归搜索。
继续直到当前单词的结尾。在这种情况下,对于您的给定输入,它将返回false,但对于我们从第1点开始的递归搜索,它将返回true。
以下是我修改后的搜索,但我认为它有错误,可以改进。有什么建议吗?
#define SIZE 26
struct tri{
int complete;
struct tri *child[SIZE];
};
void insert(char *c, struct tri **t)
{
struct tri *current = *t;
while(*c != '\0')
{
int i;
int letter = *c - 'a';
if(current->child[letter] == NULL) {
current->child[letter] = malloc(sizeof(*current));
memset(current->child[letter], 0, sizeof(struct tri));
}
current = current->child[letter];
c++;
}
current->complete = 1;
}
struct tri *t;
int flag = 0;
int found(char *c, struct tri *tt)
{
struct tri *current = tt;
if (current == NULL)
return 0;
while(*c != '\0')
{
int i;
int letter = *c - 'a';
/* if this is the first char then recurse from begining*/
if (t->child[letter] != NULL)
flag = found(c+1, t->child[letter]);
if (flag == 1)
return 1;
if(!flag && current->child[letter] == NULL) {
return 0;
}
current = current->child[letter];
c++;
}
return current->complete;
}
int main()
{
int i;
t = malloc(sizeof(*t));
t->complete = 0;
memset(t, 0, sizeof(struct tri));
insert("weathez", &t);
insert("eather", &t);
insert("weather", &t);
(1 ==found("weather", t))?printf("found\n"):printf("not found\n");
return 0;
}
答案 0 :(得分:1)
您想要做的事情正是Aho-Corasick algorithm所做的。
你可以看看我的Aho-Corasick实现。它是以比赛为导向的,所以可能没有专注于可读性,但我认为很清楚:
typedef vector<int> VI;
struct Node {
int size;
Node *fail, *output;
VI id;
map<char, Node*> next;
};
typedef pair<Node*, Node*> P;
typedef map<char, Node*> MCP;
Node* root;
inline void init() {
root = new Node;
root->size = 0;
root->output = root->fail = NULL;
}
Node* add(string& s, int u, int c = 0, Node* p = root) {
if (p == NULL) {
p = new Node;
p->size = c;
p->fail = p->output = NULL;
}
if (c == s.size()) p->id.push_back(u);
else {
if (not p->next.count(s[c])) p->next[s[c]] = NULL;
p->next[s[c]] = add(s, u, c + 1, p->next[s[c]]);
}
return p;
}
void fill_fail_output() {
queue<pair<char, P> > Q;
for (MCP::iterator it=root->next.begin();
it!=root->next.end();++it)
Q.push(pair<char, P> (it->first, P(root, it->second)));
while (not Q.empty()) {
Node *pare = Q.front().second.first;
Node *fill = Q.front().second.second;
char c = Q.front().first; Q.pop();
while (pare != root && !pare->fail->next.count(c))
pare=pare->fail;
if (pare == root) fill->fail = root;
else fill->fail = pare->fail->next[c];
if (fill->fail->id.size() != 0)
fill->output = fill->fail;
else fill->output = fill->fail->output;
for (MCP::iterator it=fill->next.begin();
it!=fill->next.end();++it)
Q.push(pair<char,P>(it->first,P(fill,it->second)));
}
}
void match(int c, VI& id) {
for (int i = 0; i < id.size(); ++i) {
cout << "Matching of pattern " << id[i];
cout << " ended at " << c << endl;
}
}
void search(string& s) {
int i = 0, j = 0;
Node *p = root, *q;
while (j < s.size()) {
while (p->next.count(s[j])) {
p = p->next[s[j++]];
if (p->id.size() != 0) match(j - 1, p->id);
q = p->output;
while (q != NULL) {
match(j - 1, q->id);
q = q->output;
}
}
if (p != root) {
p = p->fail;
i = j - p->size;
}
else i = ++j;
}
}
void erase(Node* p = root) {
for (MCP::iterator it = p->next.begin();
it != p->next.end(); ++it)
erase(it->second);
delete p;
}
int main() {
init();
int n;
cin >> n;
for (int i = 0; i < n; ++i) {
string s;
cin >> s;
add(s, i);
}
fill_fail_output();
string text;
cin >> text;
search(text);
erase(root);
}