Question

我已经制作了一个程序，可以输出txt文件中最常用的单词。有没有人知道如何优化它，它会更大文件更快。

输出图像：http://i.stack.imgur.com/fVBh0.png

这是代码。

#include <iostream>
#include <string>
#include <fstream>
#include <cstdlib>
#include <vector>
#include <algorithm>
#include <math.h>

using namespace std;

int main()
{
    ifstream in("file.txt");

    if(!in){
        cerr << "Could not open file.txt.";
        return EXIT_FAILURE;
    }

    string str, str2, strn, tab[10000], tab2[10000];
    int i, k, j, n, l, tabl;
    char c = 179;
    vector<int> tabs;
    vector<string> stringi;

    while(getline(in, str2)){
        str += str2;
        str += ' ';
    }
    k = 0;
    for(i = 0; i < str.length(); i++){
        if(str[i] != ' ' && str[i] != '.' && str[i] != '\t' && str[i] != ','
           && str[i] != ';' && str[i] != ':' && str[i] != '}' && str[i] != '{'){
            tab[k] += tolower(str[i]);
        }else{
            k++;
        }
        if(str[i] == '.' || str[i] == '\t' || str[i] == ',' || str[i] == ';'
        || str[i] == ':' || str[i] == '}' || str[i] == '{') {
            k--;
        }
    }
    tabl = k;

    k = 0;
    for(i = 0; i < tabl; i++){
        for(j = 0; j < tabl; j++){
            if(tab[i] == tab[j]){
                k++;
            }
        }
        tabs.push_back(k);
        k = 0;
    }
    for(i = 0; i < tabl; i++){
        for(j = 0; j < tabl-1; j++){
            if(tab[j] < tab[j+1]){
                n = tabs.at(j);
                tabs.at(j) = tabs.at(j+1);
                tabs.at(j+1) = n;
                strn = tab[j];
                tab[j] = tab[j+1];
                tab[j+1] = strn;
            }
        }
    }
    for(i = 0; i < tabl; i++){
        for(j = 0; j < tabl-1; j++){
            if(tabs.at(j) < tabs.at(j+1)){
                n = tabs.at(j);
                tabs.at(j) = tabs.at(j+1);
                tabs.at(j+1) = n;
                strn = tab[j];
                tab[j] = tab[j+1];
                tab[j+1] = strn;
            }
        }
    }
    tab2[0] = tab[0];
    for(i = 0; i < tabl; i++){
        if(tab[i] != tab[i+1]){
            tab2[i] = tab[i+1];
        }
    }
    k = 1;
    l++;
    for(i = 0; i < tabl; i++){
        if(!tab2[i].empty()){
            l++;
        }
    }
    cout << "------------------------------------" << endl;
    cout << "|--->TABLE OF MOST COMMON WORDS<---|" << endl;
    cout << "------------------------------------" << endl;
    for(i = 0; i < tabl; i++){
        if(!tab2[i].empty() && k <= 20 ){
            cout << c << k++ << "." << '\t' << c << tab2[i] << '\t' << c << "*" <<
            tabs.at(i+1)
            << '\t'  << c << roundf(((float)tabs.at(i+1)*100/l)*100)/100 << "%" <<
            endl;
        }
    }
    cout << "------------------------------------" << endl ;
    cout << "|----->Dif. strings: " << '\t' << l << "<-------|" << endl ;
    cout << "------------------------------------" << endl;

    return 0;
}

Answer 1

由于这并不是一次尝试读取整个文件，因此文件大小的唯一上限是您愿意等待输出的时间。

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <map>
#include <algorithm>

char convert(char arg)
{
    if (arg == '.' || arg == '\t' || arg == ',' || arg == ';'
            || arg == ':' || arg == '}' || arg == '{')
    {
        return ' ';     // convert delimiters to spaces
    }
    return(tolower(arg));  // convert everything else to lower case
}

int main()
{
    std::ifstream in("c:\\etc\\foo.txt");

    // This can be replaced with std::unordered_map if you are willing to sacrifice
    // lexical sorting of the output for speed
    std::map<std::string, int> counts;

    while (!in.eof())
    {
        std::string str;
        // Get the next space delimited word from the file
        in >> str;

        if (str == "")
        {
            // Reject empty strings, which can occur as we read the file
            continue;
        }

        // Convert to lower case, and convert our delimiter set to spaces
        std::transform(str.begin(), str.end(), str.begin(), convert);
        std::stringstream in1(str);

        while (!in1.eof())
        {
            std::string word;
            in1 >> word;
            if (word == "")
            {
                // reject empty words, which are also possible at this point
                continue;
            }
            // Use the map to count occurrences of the word
            auto it = counts.find(word);
            if (it == counts.end())
            {
                counts[word] = 1;
            }
            else
            {
                it->second++;
            }
        }
    }

    // Output the results here

    return 0;
}

txt文件中最常见的单词

1 个答案: