我的目标是找到文本中任何长度的所有重复字符串(匹配的字符串不应该相交)。为此,我使用以下代码
#include <string>
using namespace std;
// Brute-force scan: for every start position and every length that can still
// fit twice in the remaining text, compare the reference substring against
// each later, non-overlapping substring of the same length.
int main()
{
    const std::string text = "j73vd6hdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j";
    const int total = static_cast<int>(text.length());

    for (int start = 0; start < total - 1; ++start)
    {
        // A substring can only repeat without overlap if it fits twice
        // between `start` and the end of the text.
        const int longest = (total - start) / 2;

        for (int width = 1; width <= longest; ++width)
        {
            const std::string reference = text.substr(start, width);

            // Candidates begin right behind the reference and must still
            // fit completely inside the text.
            const int last_candidate = total - width;
            for (int candidate = start + width; candidate <= last_candidate; ++candidate)
            {
                const std::string other = text.substr(candidate, width);
                if (reference == other)
                {
                    /* do something */
                }
            }
        }
    }
    return 0;
}
这段代码可以工作,但是当文本大小增加时,执行时间也会大大增加。程序太慢了。我怎样才能加快我的程序?你能给我一些改进代码的建议吗?也许有更好的算法来做到这一点。
答案 0 :(得分:2)
不幸的是,我的感觉是没有优化的方式来做这么广泛的搜索类型。您的搜索空间很大,搜索次数也很大。
你基本上都在寻找每个pos / length排列的重复。现有的搜索算法非常适合在大空间内进行单个搜索,因此最多可以帮助您完成算法的一部分。换句话说,您正在进行许多字符串搜索,因此您可以尝试优化每个单字符串搜索。
您仍然可以尝试优化现有算法。例如,您可能会发现使用 char* 代替 string 会有所帮助,因为您可以更好地控制状态。这样就不再需要通过 substr 创建不必要的字符串对象。
*编辑:提及如何合并现有的字符串搜索算法。
答案 1 :(得分:2)
您可以将搜索与Boyer-Moore-Search结合使用,或者跟踪常见的前缀序列。
[1] Boyer-Moore
#include <algorithm>
#include <iostream>
#include <limits>
#include <memory>
#include <tuple>
/// Per-character shift table used by the substring search.
///
/// After `set(pattern, len)`, `get(c)` returns how far a search window may be
/// advanced when `c` is the character that follows the window:
///   - `len - i` for the rightmost index `i` at which `c` occurs in the pattern,
///   - `len` for characters that do not occur in the pattern.
/// Before the first `set`, every entry is 1.
///
/// The table lives inside the object (one entry per `unsigned char` value), so
/// construction cannot fail and `operator bool` always reports `true`; the
/// conversion is kept so existing `if (skip && ...)` callers keep working.
class SkipTable
{
    public:
    typedef std::size_t size_type;

    private:
    enum { TableSize = unsigned(std::numeric_limits<unsigned char>::max()) + 1 };

    public:
    SkipTable()
    :   m_len(1)
    {
        std::fill_n(m_table, static_cast<size_type>(TableSize), size_type(1));
    }

    /// Rebuilds the table for the pattern [s, s + len).
    /// A zero-length pattern yields a table of all ones and does not read `s`.
    void set(const char* s, size_type len) {
        m_len = (len < 1) ? size_type(1) : len;
        std::fill_n(m_table, static_cast<size_type>(TableSize), m_len);
        for(size_type i = 0; i < len; ++i) {
            // Index through unsigned char: a plain (possibly signed) char with
            // the high bit set would otherwise become a huge index and write
            // outside the table.
            // Later occurrences overwrite earlier ones, so the rightmost
            // occurrence (the smallest, safe shift) wins.
            m_table[static_cast<unsigned char>(s[i])] = len - i;
        }
    }

    /// Returns the shift for character code `pos`; codes outside the table
    /// fall back to the always-safe shift of 1.
    size_type get(size_type pos) const {
        return (pos < TableSize) ? m_table[pos] : size_type(1);
    }

    /// Always true: the table storage is part of the object.
    operator bool() const { return true; }

    private:
    size_type m_table[TableSize];
    size_type m_len;
};
/// Finds the first occurrence of the pattern [substr, substr + substrlen)
/// inside the text [str, str + strlen).
///
/// @param skip       Shift table; it is rebuilt for the pattern when the
///                   table-driven search is used.
/// @param str        Start of the text to search.
/// @param strlen     Number of characters in the text.
/// @param substr     Start of the pattern.
/// @param substrlen  Number of characters in the pattern.
/// @return Pointer to the first match inside the text, or null when the
///         pattern is empty, longer than the text, or not present.
const char* find(
    SkipTable& skip,
    const char* str, const std::size_t strlen,
    const char* substr, const std::size_t substrlen)
{
    typedef std::char_traits<char> traits_type;
    typedef std::size_t size_type;

    if (substrlen == 0 || strlen < substrlen) return 0;

    // Last position at which the pattern still fits into the text.
    const char* const end = str + strlen - substrlen;

    if(skip && 4 < substrlen && substrlen < strlen - substrlen) {
        // Table-driven search: after a mismatch, the character that follows
        // the current window decides how far the window may be advanced.
        skip.set(substr, substrlen);
        for(;;) {
            if(traits_type::compare(str, substr, substrlen) == 0)
                return str;
            // At the last window there is no following character inside the
            // text; reading it would step outside the searched range.
            if(str == end)
                return 0;
            const size_type shift =
                skip.get(static_cast<unsigned char>(str[substrlen]));
            // Never move the pointer beyond the last valid window.
            if(static_cast<size_type>(end - str) < shift)
                return 0;
            str += shift;
        }
    }

    // Brute-force search for short patterns or short texts.
    for(; str <= end; ++str) {
        if (traits_type::compare(str, substr, substrlen) == 0)
            return str;
    }
    return 0;
}
#include <map>
/// Prints every substring of [s, s + size) that occurs again later in the
/// text without overlapping itself, together with an occurrence count.
///
/// For each length `n` and each start `i`, the substring [i, i + n) is
/// searched for in the text behind it (from i + n on) using `find`. Every
/// start position that has such a later match adds one to the substring's
/// counter; the final pass adds one more for the last occurrence and prints
/// the lines as `[count] "substring"` in the map's sorted key order.
void find_duplicates(const char* s, const std::size_t size) {
    std::map<std::string, unsigned> duplicates;
    SkipTable skip;

    // A substring of length n can repeat without overlap only if 2 * n <= size,
    // so n runs up to and including size / 2 (a strict '<' would miss repeats
    // such as "ab" in "abab").
    for(std::size_t n = 1; n <= size / 2; ++n) {
        // 2 * n <= size holds here, so size - 2 * n cannot wrap around.
        for(std::size_t i = 0; i <= size - 2*n; ++i) {
            const char* p = find(skip, s + i + n, size - i - n, s + i, n);
            if(p) {
                ++duplicates[std::string(p, n)];
            }
        }
    }

    // Each counter holds the number of start positions with a later match;
    // add one for the final occurrence before printing.
    for(auto& d : duplicates) {
        ++d.second;
        std::cout << '[' << d.second << "] \"" << d.first << "\"\n";
    }
}
// Demo driver: reports the repeated substrings of a sample sentence.
int main() {
    const std::string sample("Some text for matching sub-strings in the text.");
    const char* const first = sample.c_str();
    const std::size_t count = sample.size();
    find_duplicates(first, count);
    return 0;
}
注意:如果存在公共前缀序列,则可以通过记录(映射)已有结果并在其基础上继续搜索来改进搜索(请参阅下面的代码)。
[2]通用前缀序列
更快的方法:使用公共前缀序列存储指针并匹配前缀序列后面的后缀字符:
/// Prints every substring of [str, str + size) that is repeated in the text,
/// as lines of the form `[count] "substring"` in sorted order.
///
/// The search grows repeated strings one character at a time:
///   1. Pass 1 collects, for every character that occurs at least twice,
///      the positions of all its occurrences as one group in `prefix`.
///      Groups are separated by a null pointer.
///   2. Each following round extends the strings of the previous round by one
///      character: inside a group (all members share the first `len - 1`
///      characters) members that also agree on character `len - 1` form a new
///      group in `suffix`. The first pair of a new group must not overlap
///      (`p + len <= q`); further members are only compared on the new
///      character.
/// The rounds stop as soon as no group survives.
///
/// @param str   Start of the text; nothing is copied, map keys point into it.
/// @param size  Number of characters in the text; texts shorter than two
///              characters produce no output.
void find_duplications(const char* str, const std::size_t size) {
    if(size <= 1) return;

    // Non-owning (pointer, length) view used as map key; ordered
    // lexicographically, a proper prefix sorting before the longer string.
    struct Pointer {
        const char* p;
        std::size_t n;
        Pointer(const char* p, std::size_t n)
        :   p(p), n(n)
        {}
        bool operator < (const Pointer& other) const {
            typedef std::char_traits<char> traits_type;
            int result = traits_type::compare(p, other.p, std::min(n, other.n));
            if(result == 0) return (n < other.n);
            else return result < 0;
        }
    };
    typedef std::map<Pointer, unsigned> duplicate_container;
    duplicate_container duplicates;

    // Pass 1: single characters that occur more than once.
    std::vector<const char*> prefix;
    for(std::size_t i = 0; i < size - 1; ++i) {
        duplicate_container::iterator pos = duplicates.find(Pointer(str+i, 1));
        if(pos == duplicates.end()) {
            for(std::size_t j = i + 1; j < size ; ++j) {
                if(str[i] == str[j]) {
                    // First repeat found: open a group with both positions...
                    prefix.push_back(str+i);
                    prefix.push_back(str+j);
                    pos = duplicates.insert(duplicate_container::value_type(
                        Pointer(str+i, 1), 2)).first;
                    // ...and append every further occurrence of the character.
                    for(std::size_t k = j + 1; k < size ; ++k) {
                        if(str[i] == str[k]) {
                            prefix.push_back(str+k);
                            ++pos->second;
                        }
                    }
                    // Delimiter closing the group.
                    prefix.push_back(0);
                    break;
                }
            }
        }
    }

    // Following rounds: extend the surviving groups by one character each.
    std::vector<const char*> suffix;
    // Positions at or behind `limit` are too close to the end of the text to
    // start a string of the current length.
    const char* limit = str + size;
    std::size_t len = 1;
    while( ! prefix.empty()) {
        ++len;
        --limit;
        suffix.clear();
        for(std::size_t i = 0; i < prefix.size(); ++i) {
            const char* p = prefix[i];
            if( ! p) continue;
            // Skip only this position: later groups may still hold positions
            // further to the front, so the scan must not stop here.
            if(limit <= p) continue;
            duplicate_container::iterator pos = duplicates.find(Pointer(p, len));
            if(pos == duplicates.end()) {
                for(std::size_t j = i + 1; j < prefix.size(); ++j) {
                    const char* q = prefix[j];
                    // End of the group, or the remaining members are too close
                    // to the end of the text (members are stored in ascending
                    // order).
                    if( ! q || limit <= q) break;
                    if(p + len <= q && p[len-1] == q[len-1]) {
                        suffix.push_back(p);
                        suffix.push_back(q);
                        pos = duplicates.insert(duplicate_container::value_type(
                            Pointer(p, len), 2)).first;
                        for(std::size_t k = j + 1; k < prefix.size(); ++k) {
                            q = prefix[k];
                            if( ! q || limit <= q) break;
                            if(p[len-1] == q[len-1]) {
                                suffix.push_back(q);
                                ++pos->second;
                            }
                        }
                        // Delimiter closing the group.
                        suffix.push_back(0);
                        break;
                    }
                }
            }
        }
        prefix.swap(suffix);
    }

    for(duplicate_container::iterator pos = duplicates.begin(); pos != duplicates.end(); ++pos) {
        std::cout
            << '[' << pos->second << "] \""
            << std::string(pos->first.p, pos->first.n) << "\"\n";
    }
}
// Demo driver: reports the repeated substrings of a sample sentence.
int main() {
    const std::string sample("Some text for matching sub-strings in the text.");
    const char* const first = sample.c_str();
    const std::size_t count = sample.size();
    find_duplications(first, count);
    return 0;
}
结果:两种算法都会生成相同的结果集,但第二种算法的性能会优于第一种。
[7] " "
[3] " t"
[2] " te"
[2] " tex"
[2] " text"
[4] "e"
[2] "e "
[2] "e t"
[2] "e te"
[2] "e tex"
[2] "e text"
[2] "ex"
[2] "ext"
[2] "g"
[2] "h"
[3] "i"
[3] "in"
[2] "ing"
[2] "m"
[3] "n"
[2] "ng"
[2] "o"
[2] "r"
[3] "s"
[7] "t"
[2] "te"
[2] "tex"
[2] "text"
[2] "x"
[2] "xt"
答案 2 :(得分:1)
我同意tenfour的观点,可能没有一种算法可以让你更快地解决这个问题。下面的代码是把你的代码转换为C的版本;据我所见(AFAICS),按照我用linux的"time"命令所做的粗略测量,它比C++版本快约7倍:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Benchmark driver: counts, for every start position m and every length
 * i + 1 that fits twice into the rest of the text, how many later,
 * non-overlapping substrings are equal to the substring starting at m.
 * The substrings are copied into two scratch buffers before being compared.
 */
int main()
{
    int count = 0;
    char text[] = "j73vd66hmmsg73flw94nncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93kcjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ndk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94nk9382haswm03hs84mmsg73flw94ncjd93k9djhdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j";
    int len = (int)strlen(text);

    /*
     * Scratch buffers for the two substrings. The longest copy is len / 2
     * characters plus the terminator, so len bytes are sufficient.
     * They are intentionally not freed: the process exits right after the
     * loops and this keeps the measured work limited to the search itself.
     */
    char* a1 = (char*)malloc(len);
    char* a2 = (char*)malloc(len);
    if (a1 == NULL || a2 == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    for(int m=0;m<len-1;m++)
    {
        /* Longest length that can still occur twice without overlapping. */
        int h_len=(len-m)/2;
        for(int i=0;i<h_len;i++)
        {
            memcpy(a1, &text[m], i+1);
            a1[i+1] = '\0';
            for(int k=0;k<len-2*i-1-m;k++)
            {
                memcpy(a2, &text[i+1+k+m], i+1);
                a2[i+1] = '\0';
                /* Compare the whole substrings, not just their first
                   characters, so that only real repeats are counted. */
                if(memcmp(a1, a2, (size_t)(i+1)) == 0) {
                    count++;
                }
            }
        }
    }
    printf("Count is: %d\n", count);
    return 0;
}
请注意,示例中的文本较长,可以获得一些有意义的运行时间,添加变量计数以便能够打印匹配数,并且在退出之前不进行清理。在我的Ubuntu PC上,C代码需要大约2秒才能完成,其中C ++需要大约14秒。
答案 3 :(得分:1)
您可以通过使用二分查找算法进行插入和检索来加快算法速度。
您可以通过使用std :: set或std :: map来实现。 您要么将每个子字符串存储到集合中(如果/ *执行某些操作* /只需要字符串),要么将子字符串存储在映射中作为键。
那么复杂度将是 O(N^2 · ln N)(而不是 O(N^3))。
例如考虑这个:
#include <string>
#include <map>
// Finds repeated, non-overlapping substrings with the help of an ordered map.
//
// Every substring is looked up in `table`, which maps a substring to the
// start index of its first occurrence. A hit whose first occurrence ends
// before the current start position is a non-overlapping repeat.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    const std::string text = "j73vd6hdk9382haswm03hs84mmsg73vdflw94ncjd93k9dj3ndi5jf95j";
    const std::size_t len = text.length();
    std::map<std::string, std::size_t> table;
    std::map<std::string, std::size_t>::const_iterator myEntry;

    // m + 1 < len instead of m < len - 1: no wrap-around for an empty text.
    for(std::size_t m = 0; m + 1 < len; m++)
    {
        const std::size_t max_len = len - m;
        for(std::size_t i = 0; i < max_len; i++)
        {
            const std::string a1 = text.substr(m, i + 1);
            myEntry = table.find(a1);
            if (myEntry != table.end()) {
                // First occurrence covers [second, second + i]; it must end
                // before m for the two occurrences not to overlap.
                if ((myEntry->second + i) < m) {
                    /* do something with the repeat, e.g. std::cout << a1 << " "; */
                }
            }
            else {
                table.insert(std::pair<std::string, std::size_t>(a1, m));
            }
        }
    }
    return 0;
}
答案 4 :(得分:1)
我在这里用重复项填充一个 map 并返回最大值。我会检查哪些内容已经搜索过。这非常重要,因为如果已经搜索过 "a",就不应该再次搜索它。例如:
fabfcdfefghijklmnopf
^ ^ ^ ^ ^
我的算法会在第一次遇到 "f" 时统计所有重复的 "f":
fabfcdfefghijklmnopf
^ // count all "f"
然后在索引4处再次找到它时跳过它:
fabfcdfefghijklmnopf
^ // skip as "f" is present on map
但随后它会评估 "fc",因为它与 "fa" 不同。只有当 "f" 被发现是重复的时候,算法才会搜索 "fc",因为如果 "ab" 不重复,"abc" 就不可能重复。
它不会计算重叠的字符串,即:"paltoaltoalm" 中的 "altoal" 不算匹配。
#include <iostream>
#include <string>
#include <map>
// Collects repeated substrings of `in` into `m` and returns the highest
// repeat count, or -1 when `in` is empty.
//
// For every start position the candidate substring is grown one character at
// a time. A candidate that is already a key of `m` is not searched again; a
// new candidate is searched for behind its own end, each hit incrementing
// m[candidate] and moving the search position behind that hit. Growing stops
// for a start position as soon as a candidate has no further occurrence.
int findDuplicates( std::string& in, std::map<std::string, int>& m) {
    typedef std::string::size_type index_t;

    const index_t length = in.size();
    if (length == 0) {
        return -1;
    }

    for (index_t start = 0; start < length; ++start) {
        index_t cursor = start + 1;

        while (cursor < length) {
            const std::string candidate = in.substr(start, cursor - start);

            // Already counted earlier: just try the next longer candidate.
            if (m.count(candidate) != 0) {
                ++cursor;
                continue;
            }

            // Count every occurrence behind the candidate, hopping over
            // each hit so that hits never overlap one another.
            bool repeated = false;
            for (;;) {
                const index_t hit = in.find(candidate, cursor);
                if (hit == std::string::npos) {
                    break;
                }
                repeated = true;
                m[candidate]++;
                cursor = hit + candidate.size();
            }

            if (!repeated) {
                break;
            }
            // Continue with the candidate that is one character longer.
            cursor = start + candidate.size() + 1;
        }
    }

    int best = 0;
    std::map<std::string, int>::const_iterator entry = m.begin();
    while (entry != m.end()) {
        if (best < entry->second) {
            best = entry->second;
        }
        ++entry;
    }
    return best;
}
用法:
/*
*
*/
int main(int argc, char** argv) {
std::string in( "j73vd6hdk9382haswm03hs84mmsg73flw94ncjd93k9dj3ndi5jf95j");
std::map<std::string, int> duplicates;
int rc = findDuplicates( in, duplicates);
std::map<std::string, int>::iterator it = duplicates.begin();
while( it != duplicates.end()) {
std::cout << (*it).first << "," << it->second << std::endl;
++it;
}
return 0;
}
输出:
3,5
4,1
5,1
5j,1
7,1
73,1
8,1
9,4
93,1
d,4
f,1
h,2
j,4
k,1
k9,1
m,2