我一直在尝试编写一个搜索程序,用来在一个包含 100000 个单词的 .txt 文件中查找某个模式,并希望找到并打印出完全相同的匹配项。我从文件中读取内容,并将其存入一个大小为 125000 的 char 数组(我知道这种做法很糟糕),BoyerMooreSearch 函数会给出索引(即模式在这个 125000 字符数组中匹配位置的下标),但我不知道该如何利用这些索引做进一步的处理。
我在想,是否可以创建一个链表,只把 BoyerMooreSearch 函数找到的匹配项存进去?但是,即使我设法得到了 .txt 文件中的完全匹配项,又该如何用 Boyer-Moore 搜索找到相似的匹配项呢?任何帮助都将不胜感激。
在阅读代码之前先说明一下,以免浪费您的时间:前三个函数只是 Boyer-Moore 的预处理,主要的搜索函数是 BoyerMooreSearch。
#include <algorithm>
#include <chrono> //For time
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;
// Number of entries in the bad-character table: one per possible byte value.
#define NO_OF_CHARS 256

/// Builds the bad-character table used by the Boyer-Moore search.
///
/// After the call, badchar[c] holds the index of the last occurrence of the
/// byte value c within the first `size` characters of `str`, or -1 when c
/// does not occur there.
///
/// @param str      pattern characters to scan
/// @param size     number of characters of `str` to scan
/// @param badchar  output table; must have room for NO_OF_CHARS entries
void badCharHeuristic(char *str, int size, int badchar[NO_OF_CHARS])
{
    for (int c = 0; c < NO_OF_CHARS; c++)
        badchar[c] = -1;

    for (int i = 0; i < size; i++) {
        // Index by the unsigned byte value: plain char may be signed, in
        // which case bytes >= 0x80 would yield a negative subscript and
        // write outside the table.
        badchar[static_cast<unsigned char>(str[i])] = i;
    }
}
/// Boyer-Moore good-suffix preprocessing, part 1.
///
/// With m = strlen(pat), on return:
///  - f[i] (0 <= i <= m) is the start index of the widest border of the
///    suffix pat[i..m-1]; f[m] is m + 1.
///  - s[j] is the shift distance j - i recorded the first time the border
///    starting at j could not be extended to the left; every entry this pass
///    does not determine is left at 0.
///
/// @param pat  null-terminated pattern
/// @param f    output array with at least m + 1 entries
/// @param s    output array with at least m + 1 entries (cleared here first)
void bmPreprocess1(char *pat, int *f, int *s)
{
    const int m = static_cast<int>(std::strlen(pat));
    for (int k = 0; k <= m; k++)
        s[k] = 0;

    int i = m;
    int j = m + 1;
    f[i] = j;
    while (i > 0) {
        // While the border starting at j cannot be extended to the left by
        // pat[i-1], record the shift for j (only the first, i.e. smallest,
        // one) and fall back to the next narrower border. The fallback must
        // happen on every iteration, whether or not s[j] was already set;
        // j can reach m + 1 here, so s[j] is only touched while j <= m.
        while (j <= m && pat[i - 1] != pat[j - 1]) {
            if (s[j] == 0)
                s[j] = j - i;
            j = f[j];
        }
        i--;
        j--;
        f[i] = j;
    }
}
/// Boyer-Moore good-suffix preprocessing, part 2.
///
/// Fills every entry of s[0..m] (m = strlen(pat)) that is still 0 with the
/// start index of the widest border of the pattern that is applicable at that
/// position, taken from the border table f. Entries that are already nonzero
/// are kept.
///
/// @param pat  null-terminated pattern (only its length is used)
/// @param f    border table with m + 1 entries, as filled by bmPreprocess1
/// @param s    shift table with m + 1 entries; must already be initialised,
///             with 0 marking the entries still to be filled
void bmPreprocess2(char *pat, int *f, int *s)
{
    const int m = static_cast<int>(std::strlen(pat));

    // s is deliberately not cleared here: its nonzero entries are shifts that
    // were already determined, and wiping them would discard that work.
    int j = f[0];
    for (int i = 0; i <= m; i++) {
        if (s[i] == 0)
            s[i] = j;
        // Past position j the current border no longer applies; move on to
        // the next narrower one.
        if (i == j)
            j = f[j];
    }
}
/// Searches `txt` for every occurrence of `pat` and prints the position of
/// each one to standard output as "pattern occurs at shift = <index>", where
/// <index> is the 0-based offset in `txt` at which the match starts.
///
/// Only the bad-character rule drives the shifts. The good-suffix tables are
/// not consulted by this search, so they are not built here (building them
/// with new[] and never releasing them only leaked memory).
///
/// @param txt  null-terminated text to search
/// @param pat  null-terminated pattern to look for
void BoyerMooreSearch(char *txt, char *pat)
{
    const int m = static_cast<int>(std::strlen(pat));
    const int n = static_cast<int>(std::strlen(txt));

    int badchar[NO_OF_CHARS];
    badCharHeuristic(pat, m, badchar);

    int k = 0; // current alignment: pat[0] is lined up with txt[k]
    while (k <= n - m) {
        // Compare right to left; j ends at -1 when the whole pattern matched.
        int j = m - 1;
        while (j >= 0 && pat[j] == txt[k + j])
            j--;

        if (j < 0) {
            // Report the match position BEFORE moving on; printing after the
            // shift would report the next alignment instead of the match.
            std::cout << "pattern occurs at shift = " << k << "\n";
            // Align the character just past the match with its last
            // occurrence in the pattern (or step by one at the end of text).
            k += (k + m < n)
                     ? m - badchar[static_cast<unsigned char>(txt[k + m])]
                     : 1;
        } else {
            // Align the mismatching text character with its last occurrence
            // in the pattern, but always advance by at least one. The table
            // is indexed by unsigned byte value so bytes >= 0x80 stay in
            // bounds even where plain char is signed.
            k += std::max(1, j - badchar[static_cast<unsigned char>(txt[k + j])]);
        }
    }
}
/// Reads a search pattern from standard input, loads the word list from
/// "englishlinebyline.txt", runs BoyerMooreSearch over it and prints how long
/// the search took.
///
/// As before, whitespace in the file is skipped, so the searched text is the
/// concatenation of all words and reported positions index into that
/// concatenation.
///
/// @return 0 on success, 1 when no pattern was given or the file cannot be
///         opened
int main()
{
    // std::string grows as needed; a fixed char[10] overflowed on any
    // pattern of ten or more characters.
    std::string pattern;
    if (!(std::cin >> pattern)) {
        std::cerr << "no search pattern given\n";
        return 1;
    }

    std::ifstream readfile("englishlinebyline.txt");
    if (!readfile) {
        // Without this check a missing file never reaches EOF and the read
        // loop would never terminate.
        std::cerr << "cannot open englishlinebyline.txt\n";
        return 1;
    }

    // Extraction with >> skips whitespace, so appending word by word yields
    // the same character sequence the per-character read produced, without
    // the fixed 125000-byte cap, the off-by-one write, or the missing
    // terminator.
    std::string text;
    std::string word;
    while (readfile >> word)
        text += word;

    const auto started = std::chrono::steady_clock::now();
    // BoyerMooreSearch takes mutable char*; &s[0] is a null-terminated buffer.
    BoyerMooreSearch(&text[0], &pattern[0]);
    const std::chrono::duration<double> elapsed =
        std::chrono::steady_clock::now() - started;

    // The value is in seconds (the old label claimed nanoseconds).
    std::cout << "\nfound in " << elapsed.count() << " seconds.\n";

    std::system("pause");
    return 0;
}