Question

我正在使用 n-gram（2-gram，即二元语法）。我需要从文件中读取文本并切分单词（文本文件超过 5000 个），但我的算法太慢了，花了 1 个小时才跑完。你有什么建议或者改进的办法吗？非常感谢。这是我的代码：

/* True for ASCII letters and digits only (explicit ranges, no locale lookup). */
static bool la_ky_tu_cua_tu(char c)
{
    return (c >= '0' && c <= '9') ||
           (c >= 'A' && c <= 'Z') ||
           (c >= 'a' && c <= 'z');
}

/*
 * Process one text file: skip its first three lines, split the rest into
 * '.'-terminated sentences, and build 2-grams from adjacent words.
 *
 * A word is a maximal run of ASCII letters/digits. Plain spaces between two
 * words keep them adjacent; any other non-alphanumeric character (including
 * a newline) breaks the chain, so no 2-gram spans it. Each 2-gram is handed to
 * bang_tu->them_phan_tu(); every key other than -1 is appended to this file's
 * list. Afterwards the per-document counters are bumped for every recorded key
 * and the file is registered in dsach_file.
 *
 * ten_file2: file name relative to the hard-coded TechCrunch_Files directory.
 *
 * If the file cannot be opened, nothing is read and the file is still
 * registered with an empty 2-gram list.
 */
void CKhung::duyet_mot_file(string ten_file2) //work with a file
{
    string tien_to = "H:\\\\Visual Studio 2010\\\\Projects\\\\Bai_tap_lon_bai_1\\\\Bai_tap_lon_bai_1\\\\TechCrunch_Files\\\\";
    string ten_file = tien_to + ten_file2;   // full path of the file

    string temp;        // current sentence (text up to the next '.')
    string temp1;       // scratch for the skipped header lines
    string tu;          // current word
    string tu_truoc;    // previous word of the current chain
    string buffer;      // "previous current" text of the 2-gram being added
    CTu n_gram;
    CDs_noi_don ds_tu;  // list of 2-gram keys found in this file

    fstream file;
    file.open(ten_file);

    for (int j = 0; j < 3; j++)    /* skip the first 3 lines */
        getline(file, temp1, '\n');

    while (getline(file, temp, '.'))
    {
        /* Walk the sentence with an index instead of erasing from the front:
           front erases shift the whole remainder and make the scan quadratic. */
        const string::size_type do_dai = temp.length();
        string::size_type vi_tri = 0;
        bool co_tu_truoc = false;   // a sentence never continues the previous chain

        while (vi_tri < do_dai)
        {
            /* Plain spaces separate words without breaking the chain. */
            while (vi_tri < do_dai && temp[vi_tri] == ' ')
                ++vi_tri;

            /* Anything else that is not part of a word breaks the chain
               (this also consumes spaces that follow such a character). */
            while (vi_tri < do_dai && !la_ky_tu_cua_tu(temp[vi_tri]))
            {
                ++vi_tri;
                co_tu_truoc = false;
            }
            if (vi_tri >= do_dai) break;

            /* Take the maximal alphanumeric run as the word. No fixed-size
               buffer is involved, so arbitrarily long runs are safe. */
            const string::size_type bat_dau = vi_tri;
            while (vi_tri < do_dai && la_ky_tu_cua_tu(temp[vi_tri]))
                ++vi_tri;
            tu.assign(temp, bat_dau, vi_tri - bat_dau);

            if (co_tu_truoc)
            {
                buffer = tu_truoc;
                buffer += ' ';
                buffer += tu;
                n_gram.set_tu(buffer);
                long key = bang_tu->them_phan_tu(n_gram);  // add 2-gram to the hash table
                /* -1 means the 2-gram is not recorded for this file. The window
                   still slides below, so the next 2-gram is always two words. */
                if (key != -1)
                    ds_tu.them_node(key);
            }

            /* The current word becomes the left half of the next 2-gram. */
            tu_truoc.swap(tu);
            co_tu_truoc = true;
        }
    }

    /*==========================================================*/
    /* Document finished: update the document-occurrence count of every
       2-gram key recorded for it. */
    /* NOTE(review): if truy_xuat_node(j) walks a linked list from its head,
       this loop is quadratic in the number of keys - confirm, and iterate
       the list directly if so. */
    for (int j = 0; j < ds_tu.get_so_pt(); j++)
        bang_tu->tang_so_vb_xh(ds_tu.truy_xuat_node(j));

    /*========================================================================*/
    /* Update the table of processed files. */
    CThong_tin_file tt_file(ten_file2, ds_tu); // tt_file is destroyed when this function returns
    dsach_file.them_file_vao_bang(tt_file);
}

分词算法，n-gram

0 个答案: