我有一个很大的子串列表(超过 100000 个),需要统计其中每个子串在一个大字符串(几百 KB)中出现的次数。我在网上找到的最常见的 C++ 实现是:
// Counts the non-overlapping occurrences of `sub` inside `str`, scanning from
// left to right. An empty `sub` is reported as zero occurrences.
size_t countSubstring(const std::string& str, const std::string& sub) {
    const size_t step = sub.length();
    if (step == 0) {
        return 0;
    }

    size_t hits = 0;
    size_t pos = str.find(sub);
    while (pos != std::string::npos) {
        ++hits;
        // Resume just past the current match so overlapping hits are skipped.
        pos = str.find(sub, pos + step);
    }
    return hits;
}
但对我的使用场景来说,这太慢了。有没有更快的方法?
P.S. 我也试过 KMP 算法,但它甚至更慢:
// Computes the KMP prefix function (failure table) of `s`.
//
// Returns a vector `pi` with s.length() entries, where pi[i] is the length of
// the longest proper prefix of s[0..i] that is also a suffix of s[0..i].
// An empty `s` yields an empty vector.
std::vector<size_t> prefix_function(const std::string& s) {
    const size_t n = s.length();
    // The vector is value-initialised to zeros, so pi[0] is already correct and
    // an empty input never touches an out-of-range element.
    std::vector<size_t> pi(n);
    for (size_t i = 1; i < n; ++i) {
        size_t j = pi[i - 1];
        // Fall back through ever shorter borders until s[i] extends one of them.
        while (j > 0 && s[i] != s[j]) {
            j = pi[j - 1];
        }
        if (s[i] == s[j]) {
            ++j;
        }
        pi[i] = j;
    }
    return pi;
}
// Counts the non-overlapping occurrences of `pattern` in `S` using KMP
// matching, looking only at the characters of `S` from index `start` onwards.
//
// Returns 0 when `pattern` is empty, or when fewer than pattern.length()
// characters remain in `S` from `start`.
size_t count_using_KMP(const std::string& S, const std::string& pattern, size_t start) {
    const size_t l = pattern.length();
    const size_t textLen = S.length();
    // An empty pattern has no usable failure table and would let the matcher
    // index past the end of the pattern, so treat it as "no occurrences".
    if (l == 0) {
        return 0;
    }
    // Nothing can match if the remaining text is shorter than the pattern;
    // this also avoids building the failure table for no reason.
    if (start >= textLen || textLen - start < l) {
        return 0;
    }

    const std::vector<size_t> prefix = prefix_function(pattern);
    size_t counter = 0;
    size_t k = 0;  // number of pattern characters currently matched
    for (size_t i = start; i < textLen; ++i) {
        // On a mismatch, fall back to the longest border of the matched part.
        while (k > 0 && pattern[k] != S[i]) {
            k = prefix[k - 1];
        }
        if (pattern[k] == S[i]) {
            ++k;
        }
        if (k == l) {
            ++counter;
            // Restart from scratch so that overlapping matches are not counted.
            k = 0;
        }
    }
    return counter;
}