我正在使用 n-gram(bigram,即 2-gram)。我需要从文件中读取并切分单词(超过 5000 个文本文件),但我的算法太慢了,跑完花了 1 个小时。大家有什么建议或改进方法吗?非常感谢。以下是我的代码:
void CKhung::duyet_mot_file(string ten_file2) // process one input file
{
    // Folder that holds the corpus; ten_file2 is the bare file name inside it.
    string tien_to = "H:\\\\Visual Studio 2010\\\\Projects\\\\Bai_tap_lon_bai_1\\\\Bai_tap_lon_bai_1\\\\TechCrunch_Files\\\\";
    string ten_file = tien_to + ten_file2; // full path of the file to scan

    CTu n_gram;          // scratch 2-gram object handed to the hash table
    CDs_noi_don ds_tu;   // list of 2-gram keys found in this file

    // We only ever read, so open an ifstream (the old code opened a
    // read/write fstream for no reason).
    ifstream file(ten_file);

    string temp;
    for (int j = 0; j < 3; j++) // skip the 3 header lines
        getline(file, temp, '\n');

    // Tokenizer character class: ASCII letters and digits only
    // (byte-identical to the original range checks).
    auto la_ky_tu_tu = [](char c) {
        return (c >= '0' && c <= '9') ||
               (c >= 'A' && c <= 'Z') ||
               (c >= 'a' && c <= 'z');
    };

    string buffer; // sliding window holding the last one or two words

    // One "sentence" per '.' delimiter.
    while (getline(file, temp, '.'))
    {
        if (temp.empty()) continue;

        // PERF: the original repeatedly called temp.erase(0, ...), which
        // shifts the whole remaining sentence on every character/token and
        // makes the scan O(n^2).  Walking an index over the string is O(n)
        // and extracts each token with a single substr.
        size_t pos = 0;
        const size_t len = temp.length();
        int sp = -1; // length of the first word in buffer; -1 = chain broken

        while (pos < len)
        {
            // Blanks separate words but do NOT break the 2-gram chain.
            while (pos < len && temp[pos] == ' ')
                pos++;

            // Any other non-alphanumeric character breaks the chain
            // (the original reset sp = -1 for each such character).
            bool gap = false;
            while (pos < len && !la_ky_tu_tu(temp[pos]))
            {
                pos++;
                gap = true;
            }
            if (gap) sp = -1;
            if (pos >= len) break;

            // Collect one maximal alphanumeric run — a word.  substr also
            // removes the old char str[100] buffer, which overflowed on any
            // token of 100+ characters (str[i] = '\0' wrote out of bounds).
            const size_t dau = pos;
            while (pos < len && la_ky_tu_tu(temp[pos]))
                pos++;
            string tu = temp.substr(dau, pos - dau);

            if (sp <= 0)
            {
                // First word after a break: just prime the window.
                buffer = tu;
                sp = (int)tu.length();
            }
            else
            {
                buffer = buffer + " " + tu; // two-word window "w1 w2"
                n_gram.set_tu(buffer);
                long key = bang_tu->them_phan_tu(n_gram); // add 2-gram to the hash table
                // NOTE(review): on key == -1 the window is deliberately NOT
                // advanced — identical to the original code.  Confirm that
                // them_phan_tu's -1 really means "skip and keep the window".
                if (key == -1) continue;
                ds_tu.them_node(key);
                buffer.erase(0, sp + 1); // drop "w1 " — keep only the last word
                sp = (int)buffer.length();
            }
        }
    }

    /*==========================================================*/
    /* After the whole document: bump the document-frequency counter
       of every 2-gram seen in this file. */
    for (int j = 0; j < ds_tu.get_so_pt(); j++)
        bang_tu->tang_so_vb_xh(ds_tu.truy_xuat_node(j));

    /*========================================================================*/
    /* Record which 2-grams this file contains in the per-file table.
       (tt_file is a local; them_file_vao_bang must copy what it needs.) */
    CThong_tin_file tt_file(ten_file2, ds_tu);
    dsach_file.them_file_vao_bang(tt_file);
}