C++ - 在大文本中搜索重复的字符串

时间:2014-04-17 10:27:51

标签: c++ string search text

我的目标是找到文本中任何长度的所有重复字符串(匹配的字符串不应该相交)。为此,我使用以下代码

   #include <string>
   using namespace std;

   // Brute-force scan: for every start position and every width that can still
   // fit twice in the remaining text, compare the reference substring with each
   // later, non-overlapping substring of the same width.
   int main()
   {
      std::string text = "j73vd6hdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j";
      const int total = text.length();

      for (int start = 0; start < total - 1; ++start)
      {
         // Longest width that leaves room for a second, non-overlapping copy.
         const int longest = (total - start) / 2;

         for (int width = 1; width <= longest; ++width)
         {
            const std::string reference = text.substr(start, width);

            // Candidates begin right after the reference and must fit in the text.
            for (int other = start + width; other + width <= total; ++other)
            {
               const std::string candidate = text.substr(other, width);
               if (reference == candidate) { /* do something */ }
            }
         }
      }

      return 0;
   }

这段代码可以工作,但是随着文本变大,执行时间会急剧增加,程序太慢了。我怎样才能让程序运行得更快?你能给我一些改进代码的建议吗?也许有更好的算法可以做到这一点。

5 个答案:

答案 0 :(得分:2)

不幸的是,我的感觉是没有优化的方式来做这么广泛的搜索类型。您的搜索空间很大,搜索次数也很大。

你基本上都在寻找每个pos / length排列的重复。现有的搜索算法非常适合在大空间内进行单个搜索,因此最多可以帮助您完成算法的一部分。换句话说,您正在进行许多字符串搜索,因此您可以尝试优化每个单字符串搜索。

您仍然可以尝试优化现有算法。例如,您可能会发现使用char*代替string可能有所帮助,因为您可以更好地控制状态。这将消除substr创建不必要的字符串对象的需要。

*编辑:提及如何合并现有的字符串搜索算法。

答案 1 :(得分:2)

您可以将搜索与Boyer-Moore-Search结合使用,或者跟踪常见的前缀序列。

[1] Boyer-Moore

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <vector>

/// Per-byte shift table used by the skip search in find().
///
/// After set(pattern, len), get(c) is `len - i`, where `i` is the index of the
/// last occurrence of byte `c` in the pattern, or `len` when `c` does not occur
/// in the pattern at all.
///
/// The table is a plain member array (one slot per unsigned char value), so
/// construction cannot fail and the object is freely copyable.
class SkipTable
{
    public:
    typedef std::size_t size_type;

    private:
    // One slot for every possible unsigned char value.
    static constexpr size_type TableSize =
        size_type(std::numeric_limits<unsigned char>::max()) + 1;

    public:
    /// Builds a table in which every shift is 1.
    SkipTable()
    :   m_len(1)
    {
        std::fill_n(m_table, TableSize, size_type(1));
    }

    /// Rebuilds the table for the pattern `s[0, len)`.
    ///
    /// @param s    Pattern bytes; only read when `len` is non-zero.
    /// @param len  Pattern length. A length of 0 yields a table of all 1s.
    void set(const char* s, size_type len) {
        m_len = (len == 0) ? size_type(1) : len;
        std::fill_n(m_table, TableSize, m_len);
        // Later occurrences overwrite earlier ones, so each byte keeps the
        // smallest shift.
        for(size_type remaining = len; remaining != 0; --remaining, ++s) {
            // Index through unsigned char: a plain (possibly signed) char with
            // the high bit set would otherwise turn into a huge index.
            m_table[static_cast<unsigned char>(*s)] = remaining;
        }
    }

    /// Returns the shift stored for byte value `pos`, or 1 when `pos` is not a
    /// valid byte value.
    size_type get(size_type pos) const {
        return (pos < TableSize) ? m_table[pos] : size_type(1);
    }

    /// Always true: the table storage is part of the object.
    operator bool() const { return true; }

    private:
    size_type m_table[TableSize];
    size_type m_len;  // Length passed to the most recent set() (1 initially).
};


/// Finds the first occurrence of `substr[0, substrlen)` in `str[0, strlen)`.
///
/// @param skip       Shift table; it is overwritten when the skip search is used.
/// @param str        Start of the text to search.
/// @param strlen     Number of bytes of `str` to search.
/// @param substr     Start of the pattern.
/// @param substrlen  Pattern length.
/// @return Pointer to the first match inside `str`, or a null pointer when the
///         pattern is empty, longer than the text, or not present.
const char* find(
    SkipTable& skip,
    const char* str, const std::size_t strlen,
    const char* substr, const std::size_t substrlen)
{
    typedef std::char_traits<char> traits_type;

    if (substrlen == 0 || strlen < substrlen) return nullptr;

    // Last position at which a full-length window still fits.
    const char* end = str + strlen - substrlen;

    if(skip && 4 < substrlen && substrlen < strlen - substrlen) {

        // Skip search (labelled "Boyer-Moore-Search" by the author)
        //=========================================================
        // On a mismatch the byte just past the current window selects how far
        // the window may advance.

        skip.set(substr, substrlen);
        while(str <= end) {
            if(traits_type::compare(str, substr, substrlen) == 0)
                return str;
            // At the last window there is no following byte inside
            // [str, str + strlen); stop instead of reading past the range.
            if(str == end) break;
            // Look the byte up as unsigned char so bytes >= 0x80 hit their
            // table slot instead of becoming an out-of-range index.
            str += skip.get(static_cast<unsigned char>(*(str + substrlen)));
        }
    }
    else {
        // Brute Search
        //=============

        while(str <= end) {
            if (traits_type::compare(str, substr, substrlen) == 0)
                return str;
            ++str;
        }
    }
    return nullptr;
}


#include <map>

/// Prints the substrings of `s[0, size)` that occur again later without
/// overlapping, one per line as `[count] "substring"`.
///
/// For every length `n` and start `i`, the text after `s[i, i + n)` is searched
/// for another copy; each start that has such a later copy adds one to the
/// substring's counter, and every counter is incremented once more before
/// printing.
///
/// @param s     Text to analyse.
/// @param size  Number of bytes of `s` to analyse.
void find_duplicates(const char* s, const std::size_t size) {
    std::map<std::string, unsigned> duplicates;
    SkipTable skip;
    // A substring of length size/2 can still repeat once (e.g. "ab" in "abab"),
    // so the length bound is inclusive.
    for(std::size_t n = 1; n <= size / 2; ++n) {
        // i + 2*n <= size: room for the reference and one non-overlapping copy.
        for(std::size_t i = 0; i + 2*n <= size; ++i) {
            const char* p = find(skip, s + i + n, size - i - n, s + i, n);
            if(p) {
                ++duplicates[std::string(p, n)];
            }
        }
    }
    // Increment the counts
    for(auto& d : duplicates) {
        ++d.second;
        std::cout << '[' << d.second << "] \"" << d.first << "\"\n";
    }
}

// Demo: report the repeated substrings of a sample sentence.
int main() {
    const std::string text("Some text for matching sub-strings in the text.");
    find_duplicates(text.data(), text.length());
}

注意:可以把已找到的结果保存到映射中,并且仅在相应的前缀序列存在时才执行搜索,从而改进搜索(请参阅下面的代码)。

[2]通用前缀序列

更快的方法:使用公共前缀序列存储指针并匹配前缀序列后面的后缀字符:

/// Prints the repeated substrings of `str[0, size)`, one per line as
/// `[count] "substring"`, in the order of the map used to collect them.
///
/// The search works length by length. `prefix` holds groups of start positions
/// (each group terminated by a null pointer) whose substrings of the current
/// length are equal. A group is extended to the next length by comparing only
/// the one additional character, so longer candidates are only examined where
/// the shorter ones already repeat.
///
/// @param str   Text to analyse.
/// @param size  Number of bytes of `str` to analyse; nothing is printed for
///              sizes 0 and 1.
void find_duplications(const char* str, const std::size_t size) {

    if(size <= 1) return;

    // Non-owning (pointer, length) view into `str`, used as the map key.
    struct Pointer {
        const char* p;
        std::size_t n;

        Pointer(const char* p, std::size_t n)
        :   p(p), n(n)
        {}

        // Lexicographic order; when one view is a prefix of the other, the
        // shorter one sorts first.
        bool operator < (const Pointer& other) const {
            typedef std::char_traits<char> traits_type;
            int result = traits_type::compare(p, other.p, std::min(n, other.n));
            if(result == 0) return (n < other.n);
            else return result < 0;
        }
    };

    typedef std::map<Pointer, unsigned> duplicate_container;
    duplicate_container duplicates;

    // Prefix: one group per character that occurs at least twice, listing all
    // of its positions in ascending order.
    std::vector<const char*> prefix;
    for(std::size_t i = 0; i < size - 1; ++i) {
        duplicate_container::iterator pos = duplicates.find(Pointer(str+i, 1));
        if(pos == duplicates.end()) {
            for(std::size_t j = i + 1; j < size ; ++j) {
                if(str[i] == str[j]) {
                    prefix.push_back(str+i);
                    prefix.push_back(str+j);
                    pos = duplicates.insert(duplicate_container::value_type(
                        Pointer(str+i, 1), 2)).first;
                    for(std::size_t k = j + 1; k < size ; ++k) {
                        if(str[i] == str[k]) {
                            prefix.push_back(str+k);
                            ++pos->second;
                        }
                    }
                    // Delimiter
                    prefix.push_back(nullptr);
                    break;
                }
            }
        }
    }

    // Suffix: grow every group by one character per round until no group
    // survives.
    std::vector<const char*> suffix;
    const char* limit = str + size;
    std::size_t len = 1;
    while( ! prefix.empty()) {
        ++len;
        // Positions at or beyond `limit` cannot start a substring of `len`
        // bytes inside the text.
        --limit;
        suffix.clear();
        for(std::size_t i = 0; i < prefix.size(); ++i) {
            const char* p = prefix[i];
            if( ! p) continue;
            // Skip only this position. Leaving the loop here would also drop
            // every group stored after it (e.g. "ab" in "tabcabt").
            if(limit <= p) continue;
            duplicate_container::iterator pos = duplicates.find(Pointer(p, len));
            if(pos == duplicates.end()) {
                for(std::size_t j = i + 1; j < prefix.size(); ++j) {
                    const char* q = prefix[j];
                    // End of this group, or q too close to the end of the text.
                    if( ! q || limit <= q) break;
                    // The first partner must not overlap p; the shared prefix
                    // is already known to match, so only the new last
                    // character is compared.
                    if(p + len <= q && p[len-1] == q[len-1]) {
                        suffix.push_back(p);
                        suffix.push_back(q);
                        pos = duplicates.insert(duplicate_container::value_type(
                            Pointer(p, len), 2)).first;
                        for(std::size_t k = j + 1; k < prefix.size(); ++k) {
                            q = prefix[k];
                            if( ! q || limit <= q) break;
                            if(p[len-1] == q[len-1]) {
                                suffix.push_back(q);
                                ++pos->second;
                            }
                        }
                        // Delimiter
                        suffix.push_back(nullptr);
                        break;
                    }
                }
            }
        }
        prefix.swap(suffix);
    }

    for(duplicate_container::iterator pos = duplicates.begin(); pos != duplicates.end(); ++pos) {
        std::cout
            << '[' << pos->second << "] \""
            << std::string(pos->first.p, pos->first.n) << "\"\n";
    }
}

// Demo: report the repeated substrings of a sample sentence.
int main() {
    const std::string text("Some text for matching sub-strings in the text.");
    find_duplications(text.data(), text.length());
}

结果:两种算法会生成相同的结果集,但第二种算法的性能优于第一种。

[7] " "
[3] " t"
[2] " te"
[2] " tex"
[2] " text"
[4] "e"
[2] "e "
[2] "e t"
[2] "e te"
[2] "e tex"
[2] "e text"
[2] "ex"
[2] "ext"
[2] "g"
[2] "h"
[3] "i"
[3] "in"
[2] "ing"
[2] "m"
[3] "n"
[2] "ng"
[2] "o"
[2] "r"
[3] "s"
[7] "t"
[2] "te"
[2] "tex"
[2] "text"
[2] "x"
[2] "xt"

答案 2 :(得分:1)

我同意tenfour的观点,可能没有一种算法可以让你更快地解决这个问题。下面的代码是把你的代码转换成C的版本;据我所见(AFAICS),用Linux的 `time` 命令粗略测量,它比C++版本快约7倍:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * C port of the brute-force scan: for every start m and every width i+1 that
 * still fits twice in the rest of the text, compare text[m .. m+i] with each
 * later non-overlapping substring of the same width and count the matches.
 *
 * The substrings are compared in place with memcmp over the full width, so a
 * match means the whole substrings are equal (not just their first character)
 * and no scratch buffers have to be allocated.
 */
int main()
{
        int count = 0;
        char text[] = "j73vd66hmmsg73flw94nncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kcjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94nk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j";
        const int len = (int)strlen(text);

        for(int m=0;m<len-1;m++)
        {
                /* Longest width that leaves room for a second copy. */
                const int h_len=(len-m)/2;

                for(int i=0;i<h_len;i++)
                {
                        const size_t width = (size_t)i + 1;

                        /* k enumerates every later start at which a
                           non-overlapping copy of that width still fits. */
                        for(int k=0;k<len-2*i-1-m;k++)
                        {
                                if(memcmp(&text[m], &text[i+1+k+m], width) == 0) {
                                        count++;
                                }
                        }

                }
        }
        printf("Count is: %d\n", count);

        return 0;
}

请注意,示例中的文本较长,可以获得一些有意义的运行时间,添加变量计数以便能够打印匹配数,并且在退出之前不进行清理。在我的Ubuntu PC上,C代码需要大约2秒才能完成,其中C ++需要大约14秒。

答案 3 :(得分:1)

您可以使用基于二分查找的数据结构来完成插入和检索,从而加快算法速度。

您可以通过使用std :: set或std :: map来实现。 您要么将每个子字符串存储到集合中(如果/ *执行某些操作* /只需要字符串),要么将子字符串存储在映射中作为键。

这样复杂度将是 $O(N^2 \ln N)$,而不是 $O(N^3)$。

例如考虑这个:

#include <string>
#include <map>
// Records the first start index of every substring in an ordered map; when a
// substring is met again, the stored index tells whether the earlier
// occurrence ends before the current one starts (a non-overlapping repeat).
// argc and argv are not used.
int main(int argc, char **argv)
{
  (void)argc;
  (void)argv;

  const std::string text = "j73vd6hdk9382haswm03hs84mmsg73vdflw94ncjd93k9dj3ndi5jf95j";
  const std::size_t len = text.length();

  // substring -> start index of its first occurrence
  std::map<std::string,std::size_t> table;

  // Every start position is visited, including the last one: a repeat is
  // only noticed at its later occurrence, which may be the final character.
  for(std::size_t m=0;m<len;m++)
  {
    const std::size_t max_len=len-m;
    for(std::size_t i=0;i<max_len;i++)
    {
       const std::string a1 = text.substr(m,i+1);
       const std::map<std::string,std::size_t>::const_iterator myEntry = table.find(a1);
       if (myEntry == table.end()) {
          table.insert(std::pair<std::string,std::size_t>(a1,m));
       }
       else if ((myEntry->second + i ) < m) {
          // The first occurrence ends at index (start + i), before m.
          /* do something, e.g. std::cout << a1 << " "; */
       }
    }
  }
  return 0;
}

答案 4 :(得分:1)

我在这里填充一个记录重复项的映射(map)并返回最大值。我会检查哪些内容已经搜索过,这一点非常重要,因为如果已经搜索过 "a",就不必再搜索一次。例如:

fabfcdfefghijklmnopf
^  ^  ^ ^          ^

我的算法会在第一次遇到 "f" 时统计所有重复的 "f":

fabfcdfefghijklmnopf
^ // count all "f"

然后在索引4处再次找到它时跳过它:

fabfcdfefghijklmnopf
   ^ // skip as "f" is present on map

但随后它会继续评估 "fc",因为它与 "fa" 不同。只有当 "f" 已被发现是重复的时,算法才会搜索 "fc",因为如果 "ab" 不重复,"abc" 就不可能重复。

它不会计算重叠的字符串,例如 "paltoaltoalm" 中的 "altoal" 不算匹配。

#include <iostream>
#include <string>
#include <map>

/// Counts later, non-overlapping repetitions of substrings of `in`.
///
/// For each start position the candidate substring grows one character at a
/// time. A candidate that is not yet a key of `m` is searched for in the text
/// after it; every hit adds one to `m[candidate]` and the search resumes behind
/// that hit. Growing stops as soon as a candidate has no hit. Candidates that
/// are already keys of `m` are not searched again, only extended.
///
/// @param in  Text to analyse; it is not modified.
/// @param m   Receives the counters; existing entries are kept and treated as
///            already searched.
/// @return -1 when `in` is empty, otherwise the largest positive counter in
///         `m` (0 when there is none).
int findDuplicates( const std::string& in, std::map<std::string, int>& m) {
    const std::size_t in_s = in.size();
    if ( in_s == 0) return (-1);

    for ( std::size_t pos_beg = 0; pos_beg < in_s; ++pos_beg) {
        std::size_t pos_end = pos_beg + 1;
        while ( pos_end < in_s) {
            const std::string searched = in.substr( pos_beg, pos_end - pos_beg);
            if ( m.find( searched) != m.end()) {
                // Already counted from an earlier start; try a longer one.
                ++pos_end;
                continue;
            }
            bool present = false;
            // Search the original string from an offset instead of copying
            // its tail for every probe.
            std::size_t found = in.find( searched, pos_end);
            while ( found != std::string::npos) {
                present = true;
                ++m[ searched];
                pos_end = found + searched.size();
                found = in.find( searched, pos_end);
            }
            if ( !present)
                break;
            // Continue with the candidate that is one character longer.
            pos_end = pos_beg + searched.size() + 1;
        }
    }

    int max = 0;
    for ( const auto& entry : m) {
        if ( entry.second > max) {
            max = entry.second;
        }
    }
    return max;
}

用法:

/*
 * Demo driver: runs findDuplicates on a sample string and prints every
 * collected entry as "substring,count", one per line.
 */
int main(int argc, char** argv) {

    std::string in( "j73vd6hdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j");
    std::map<std::string, int> duplicates;
    int rc = findDuplicates( in, duplicates);

    for ( std::map<std::string, int>::iterator entry = duplicates.begin();
          entry != duplicates.end(); ++entry) {
        std::cout << entry->first << "," << entry->second << std::endl;
    }
    return 0;
}

输出:

3,5

4,1

5,1

5j,1

7,1

73,1

8,1

9,4

93,1

d,4

f,1

h,2

j,4

k,1

k9,1

m,2