Question

我需要用C语言编写读取文本文件的代码，并输出单词的数量、不同单词的数量以及最常用的单词。

我已经完成了输出单词数量的代码，但我不知道如何找到不同单词的数量或最常用的单词。我知道我应该使用strcmp，但我不知道该怎么做。任何帮助将不胜感激。这是我到目前为止所写的代码。

/*
 * Counts the whitespace-separated words in COEN12_LAB1.txt and prints the
 * total.
 *
 * Returns 0 in every case (including a missing file, which is only reported
 * on stdout).
 */
int main(int argc, char *argv[])
{
    /* One word at a time is enough for counting; the former 25000x50 tables
     * (about 12.5 MB of automatic storage, none of it used) could overflow
     * the stack before the program did anything. */
    char word[50];
    int wordCount = 0;
    FILE *fp;

    (void)argc;
    (void)argv;

    //reads file!
    fp = fopen("COEN12_LAB1.txt", "r");

    if (fp == NULL)
    {
        printf("File Missing!\n");
        return 0;
    }

    //counts words in file!
    // The field width keeps fscanf inside word[]; a word longer than
    // 49 characters is read (and counted) in several pieces.
    while (fscanf(fp, "%49s", word) == 1)
        wordCount++;

    printf("Total number of words: %d\n", wordCount);
    fclose(fp);
    return 0;
}

Answer 1

首先，您可能需要实现允许您有效保留不同单词的结构。 Hash table是可能的（可能是最好的）之一。

以下是C语言中哈希表的实现和使用示例：

此外，您可以查看此问题：Porting std::map to C?

Answer 2

我为你编写了程序，请参阅此处的来源：http://olegh.cc.st/src/words.c.txt 当然，它没有检查特殊情况，例如单行上有很多单词、不同单词的数量 > 16,000 等等。但是，您可以从中获得基本代码：

运行示例：

$ cat aaa.txt
aaa
bbb
ccc
aaa
xxx
aaa
cc

$ cc words.c ; ./a.out aaa.txt
   1    xxx
   1    ccc
   1    bbb
   1    cc
   3    aaa

Answer 3

[编辑]
1. 用calloc替换malloc（将内存初始化为0）
2. 替换了qsort中的第二个参数
3. 程序现在适用于更广泛的文件（更多单词，更多分隔符）

这不是很好，可能需要一些小的调试，但它会让你开始计算，不同和最常用词的数量：

/*
 * Word statistics for a text file: total number of words, number of
 * distinct words, and the most frequent word.
 *
 * The file is read twice: longestWord() sizes the storage, main() then
 * fills it. Both passes tokenise with the same rules (fgets + strtok on
 * DELIM + isCountedToken), so the sizes computed in the first pass are
 * exact for the second.
 */
/* Standard headers replace <ansi_c.h>, which is not part of standard C
 * (NOTE(review): it appears to be a toolchain-specific umbrella header). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FILENAME "c:\\dev\\play\\test3.txt" /* put your own path here */
#define DELIM "- .,:;//_*&\n"

/* Size of the line buffer used by both passes over the file. */
enum { LINE_LEN = 260 };

int longestWord(char *file, int *cnt);
void allocMemory(int numStrings, int max);
void freeMemory(int numStrings);
static int sortstring(const void *str1, const void *str2);

/* Word table: allocated by allocMemory(), released by freeMemory(). */
char **strings;

/* Returns non-zero when a strtok() token is stored as a word: it must be
 * non-empty and must not start with a tab, a newline, or a character whose
 * value is not positive. */
static int isCountedToken(const char *tok)
{
    return tok[0] != '\0' && tok[0] != '\t' && tok[0] != '\n' && tok[0] > 0;
}

/*
 * Loads every word of FILENAME, sorts the words and prints the statistics.
 * Returns 0 on success, 1 if the file cannot be opened or memory runs out.
 */
int main()
{
    int wc = 0;
    int cnt = 0;
    int distinct = 0;
    int mostFreqKeep = 0;
    const char *wordKeep = "";
    char line[LINE_LEN];
    FILE *fp;

    int longest = longestWord(FILENAME, &wc);
    if (longest < 0) {
        fprintf(stderr, "cannot open %s\n", FILENAME);
        return 1;
    }

    allocMemory(wc, longest);
    if (strings == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }

    /* read file into string arrays */
    fp = fopen(FILENAME, "r");
    if (fp == NULL) {
        fprintf(stderr, "cannot open %s\n", FILENAME);
        freeMemory(wc);
        return 1;
    }

    /* The cnt < wc guard and the bounded copy only matter if the file
     * changed between the two passes; they keep every write in bounds. */
    while (cnt < wc && fgets(line, sizeof line, fp)) {
        for (char *tok = strtok(line, DELIM);
             tok != NULL && cnt < wc;
             tok = strtok(NULL, DELIM)) {
            if (isCountedToken(tok)) {
                snprintf(strings[cnt], (size_t)longest + 1, "%s", tok);
                cnt++; /* accurate count of stored words */
            }
        }
    }
    fclose(fp);

    /* Sorting puts equal words next to each other, so each run of equal
     * entries is one distinct word and the longest run is the most
     * frequent one (the alphabetically first word wins a tie). */
    qsort(strings, (size_t)cnt, sizeof *strings, sortstring);

    for (int i = 0; i < cnt; ) {
        int j = i + 1;
        while (j < cnt && strcmp(strings[i], strings[j]) == 0) {
            j++;
        }
        distinct++;
        if (j - i > mostFreqKeep) {
            mostFreqKeep = j - i;
            wordKeep = strings[i]; /* stays valid until freeMemory() */
        }
        i = j;
    }

    printf("number of words: %d\nNumber of distinct words:%d\nmost frequent word: %s - %d\n",
           cnt, distinct, wordKeep, mostFreqKeep);

    /* Free wc entries (everything allocated), not just the cnt in use. */
    freeMemory(wc);
    getchar();
    return 0;
}

/*
 * First pass over the file.
 *
 * file   - path of the text file to scan
 * nWords - out: number of words main() will store (0 on failure)
 *
 * Returns the length of the longest word plus one, or -1 if the file
 * cannot be opened.
 */
int longestWord(char *file, int *nWords)
{
    char line[LINE_LEN];
    int longest = 0;
    int numWords = 0;
    FILE *fp = fopen(file, "r");

    *nWords = 0;
    if (fp == NULL) {
        return -1;
    }

    while (fgets(line, sizeof line, fp)) {
        for (char *tok = strtok(line, DELIM); tok != NULL; tok = strtok(NULL, DELIM)) {
            if (isCountedToken(tok)) {
                int len = (int)strlen(tok); /* < LINE_LEN, fits in int */
                if (len > longest) {
                    longest = len;
                }
                numWords++;
            }
        }
    }
    fclose(fp);

    *nWords = numWords;
    return longest + 1;
}

/*
 * Allocates the global word table: numStrings + 1 pointer slots, the first
 * numStrings of which point at zero-filled buffers of max + 1 bytes.
 * On failure everything is released again and `strings` is left NULL.
 */
void allocMemory(int numStrings, int max)
{
    strings = NULL;
    if (numStrings < 0 || max < 0) {
        return;
    }

    strings = calloc((size_t)numStrings + 1, sizeof *strings);
    if (strings == NULL) {
        return;
    }

    for (int i = 0; i < numStrings; i++) {
        strings[i] = calloc((size_t)max + 1, sizeof **strings);
        if (strings[i] == NULL) {
            freeMemory(i);
            return;
        }
    }
}

/* Releases the first numStrings word buffers and the table itself, then
 * resets `strings` to NULL. Safe to call when the table is not allocated. */
void freeMemory(int numStrings)
{
    if (strings == NULL) {
        return;
    }
    for (int i = 0; i < numStrings; i++) {
        free(strings[i]);
    }
    free(strings);
    strings = NULL;
}

/* qsort() comparator for an array of char*: orders by strcmp(). */
static int sortstring(const void *str1, const void *str2)
{
    const char *rec1 = *(const char **)str1;
    const char *rec2 = *(const char **)str2;
    return strcmp(rec1, rec2);
}

Answer 4

您可以使用简单的数据库来计算输入文本中的不同字数。为简单起见，我建议使用SQLite。下面我添加了一些示例代码（为了简洁起见，我省略了错误处理）。

为了阅读单词，我采用了一种方法，使用fgets将一行读入缓冲区。我注意到这种方法可以很好地工作，只要你能保证缓冲区总是足够大以容纳输入文件中的实际行。否则，单词将在缓冲区的末尾分割，需要以某种方式处理。

为了解析我使用strtok的文本。在实施过程中，我了解到很难让单词分隔符正确。除此之外，这种方法完全忽略了可能的拼写差异（例如，大写）和其他相同单词的inflections，因此可能会对结果产生负面影响。

一旦数据在数据库中，查询语言就非常适合于制定查询以获得最大（不同）字数或字频率。因此，当您想要从输入文本计算多个统计信息时，我认为这种灵活的方法具有优势，因为您不必在C中实现每个特殊情况。为了进行测试，我将part of the Wikipedia article on SQLite复制到文件`words.txt`中。

以下是示例：

words.txt

这是我的输出：

#include <sqlite3.h>
#include <stdio.h>
#include <string.h>

#define DELIM " \r\n\t,.-;:_#+*\\=)(/&%$§\"“”!1234567890}][{'"
#define BUFSIZE 4096
#define SQLSIZE 256

/*
 * Row callback for sqlite3_exec(): prints every column of one result row
 * as "| name : value", closing the row with "|" and a newline.
 *
 * p      - user pointer passed through sqlite3_exec(); unused
 * ncols  - number of columns in the row
 * values - column values as text; an entry is a null pointer when the
 *          column holds SQL NULL
 * names  - column names
 *
 * Always returns 0.
 */
int print_row(void* p, int ncols, char **values, char **names) {
    (void)p;
    for (int i = 0; i < ncols; i++) {
        /* Passing a null pointer to %s is undefined, so substitute text. */
        const char *value = values[i] ? values[i] : "NULL";
        printf("| %15s : %15s %s", names[i], value, i<ncols-1?"":"|\n");
    }
    return 0;
}

/*
 * Loads every word of words.txt into an in-memory SQLite table and prints
 * the total word count, the distinct word count and the five most frequent
 * words via print_row.
 *
 * Returns 0 on success, 1 if the input file or the database cannot be
 * opened.
 */
int main(int argc, char * argv[]) {
    (void)argc;
    (void)argv;
    /* open infile */
    FILE * infile = fopen("words.txt", "r");
    if (infile == 0) {
        fprintf(stderr, "cannot open words.txt\n");
        return 1;
    }
    /* initialize database */
    sqlite3 *db_handle = 0;
    if (sqlite3_open(":memory:", &db_handle)) {
        fprintf(stderr, "cannot open in-memory database\n");
        sqlite3_close(db_handle);
        fclose(infile);
        return 1;
    }
    sqlite3_exec(db_handle, "CREATE TABLE word (word TEXT);", 0, 0, 0);
    /* parse file, populate db */
    char buf[BUFSIZE], sql[SQLSIZE], *word;
    while(fgets(buf, BUFSIZE, infile))
        for (word = strtok(buf, DELIM); word != 0; word = strtok(0, DELIM)) {
            /* Building the statement as text is only safe because DELIM
             * contains the single quote, so a word can never close the
             * SQL string literal. */
            int len = snprintf(sql, SQLSIZE, "INSERT INTO word VALUES ('%s');", word);
            if (len < 0 || len >= SQLSIZE) {
                /* A truncated statement would be malformed SQL; skip the
                 * word instead of executing it. */
                fprintf(stderr, "skipping over-long word: %s\n", word);
                continue;
            }
            sqlite3_exec(db_handle, sql, 0, 0, 0);
        }
    fclose(infile);
    /* count of words */
    sqlite3_exec(db_handle,
                 "SELECT COUNT(word) AS total_words FROM word;",
                 print_row, 0, 0);
    /* count of distinct words */
    sqlite3_exec(db_handle,
                 "SELECT COUNT(DISTINCT word) AS distinct_words FROM word;",
                 print_row, 0, 0);
    /* top five most frequent words */
    sqlite3_exec(db_handle,
                 "SELECT word, COUNT(*) AS count FROM word "
                 "GROUP BY word ORDER BY count DESC LIMIT 5;",
                 print_row, 0, 0);
    sqlite3_close(db_handle);
    return 0;
}

供参考：

在C代码中创建，输出单词数，不同单词数和最常用单词

4 个答案: