Question

我想找出文本中出现次数最多的单词。在我的程序中，评测系统（PC^2）会依次输入若干单词、一个分隔符以及一段文本。单词和文本之间用 "-----" 分隔。（我需要在程序中找出这些单词里在文本中出现次数最多的单词。）

输入

apple
an
banana
-----
i like an apple.
i also like apple-pie.
i like banana
i like banana+ice.

输出

apple 2
banana 2

除数字、字母和 '_' 以外的字符都应被视为空格
每行的最长长度为1024
输出顺序应为strcmp顺序

然而，虽然该程序在我自己测试时运行良好，但提交到 PC^2 后却得到了超时（TLE）。因此，我想问一下如何才能降低程序的时间复杂度？我甚至在输出最常用的单词及其频率之前就已经超时了。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Scratch buffer that receives one word of the text at a time (main reads at most 2047 characters into it).
char tmp[2048];
// Dictionary words read before the "-----" divider; each entry is a separately allocated string.
char **dicWord;
// Occurrence counters, indexed in parallel with dicWord; allocated in main.
int *dicWcount;
// dic_assume_num: current capacity of dicWord (doubled in main when it fills up).
// dic_actual_num: number of dictionary words stored so far.
int dic_assume_num = 1000, dic_actual_num = 0;

// Reads the dictionary words (one per line) up to the "-----" divider, then
// counts how often each dictionary word occurs in the remaining text, where
// every character other than a letter, a digit or '_' acts as a separator.
// Returns EXIT_SUCCESS, or EXIT_FAILURE when an allocation fails.
// Note: this listing only counts; it does not print the result.
int main()
{
    const char divider[6] = "-----";
    int status = EXIT_FAILURE;
    size_t nbytes = 2048;   // must match the size actually allocated for getline()
    char *my_string = malloc(nbytes * sizeof(*my_string));

    // first, get the input of words
    dicWord = malloc( dic_assume_num * sizeof( *dicWord ));
    if (dicWord == NULL || my_string == NULL) {
        goto cleanup;
    }

    while (getline (&my_string, &nbytes, stdin) != -1) {
        my_string[strcspn(my_string, "\n")] = '\0';

        if (strncmp(my_string, divider, 5) == 0) {
            break;
        }

        // grow the table before storing, keeping the old pointer valid
        // (and freeable) if realloc fails
        if (dic_actual_num >= dic_assume_num) {
            char **grown = realloc( dicWord, 2 * (size_t)dic_assume_num * sizeof( *dicWord ));
            if (grown == NULL) {
                goto cleanup;
            }
            dicWord = grown;
            dic_assume_num *= 2;
        }

        char *copy = strdup(my_string);
        if (copy == NULL) {
            goto cleanup;
        }
        dicWord[dic_actual_num++] = copy;
    }

    // allocated even when the divider never appears, so the counters always exist
    dicWcount = calloc((size_t)dic_actual_num + 1, sizeof(*dicWcount));
    if (dicWcount == NULL) {
        goto cleanup;
    }

    // second, get the input of text
    for (;;) {
        // skip separators; yields 0 (not EOF) when the next character
        // already belongs to a word
        if (scanf("%*[^a-zA-Z0-9_]") == EOF) {
            break;
        }

        if (scanf("%2047[a-zA-Z0-9_]", tmp) != 1) {
            break;
        }

        // linear scan of the dictionary for every word of the text:
        // this is the step whose cost the question is about
        for (int i = 0; i < dic_actual_num; ++i) {
            if (strcmp(dicWord[i], tmp) == 0) {
                dicWcount[i]++;
                break;
            }
        }
    }

    status = EXIT_SUCCESS;

cleanup:
    for (int i = 0; i < dic_actual_num; ++i) {
        free(dicWord[i]);
    }

    free(dicWord);
    free(dicWcount);
    free(my_string);

    return status;
}

Answer 1

这个想法很简单：

遍历文本/文件： O（n）
将当前单词添加到已见过的单词/ Hashtable / Map中或者增加这个单词的计数器： n * log（n）
在遍历文件后：获取具有最高计数器的单词： O（1）

最需要的时间是用于存储单词的数据结构。平衡的树（具有最高计数器的词作为根）应该没问题。

我认为不可能减少所需的时间。

Answer 2

以下代码：

干净地编译
速度相当快
执行所需的操作，但排序输出

现在建议的代码：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// One dictionary entry: the word text and the number of times it was matched in the text.
struct myData
{
    char   word[1026];   // one input line with its newline stripped
    size_t count;        // number of matches found in the text
};


// Fixed-capacity dictionary table; it has static storage, so every count starts at zero.
struct myData dicWord[ 1000 ];  // may need to increase the 1000


int main( void )
{
    size_t dic_actual_num = 0;

    // first, get the input of words
    while ( fgets(dicWord[ dic_actual_num ].word, 1026, stdin) )
    {
        dicWord[ dic_actual_num ].word[ strcspn( dicWord[ dic_actual_num ].word, "\n") ] = '\0';

        if (strncmp( dicWord[ dic_actual_num ].word, "-----", 5 ) == 0)
        {
             break;
        }

        dic_actual_num++;
    }

    // second, get the input of text and check for highest count
    char tmp[1026];  // +2 for newline and NUL byte
    size_t maxCount = 0;

    while( 1 == scanf( "%*[^a-zA-Z0-9_] %1024[a-zA-Z0-9_]", tmp ) )
    {
        for ( size_t i = 0; i < dic_actual_num; ++i)
        {
            if (strcmp(dicWord[i].word, tmp) == 0)
            {
                dicWord[i].count++;
                if( dicWord[i].count > maxCount )
                {
                    maxCount = dicWord[i].count;
                }
                break;
            }
        }
    }

    // << need to add logic for `strcmp()` sorting


    // display answer
    for( size_t i = 0; i<dic_actual_num; i++ )
    {
        if( maxCount == dicWord[i].count )
        {
            printf( "%s %lu\n", dicWord[i].word, maxCount );
        }
    }

    return 0;
}

编辑：添加了用于替换“scanf（）”的函数

警告：下面建议的函数尚未经过测试

#include <ctype.h>
#include <stdio.h>

// prototype:
char *getWord( void );


// Returns nonzero when ch may appear inside a word: a letter, a digit or '_'.
static int isWordChar( int ch )
{
    return '_' == ch || isdigit( ch ) || isalpha( ch );
}


// Reads the next word from stdin, where a word is a run of letters, digits
// and '_' and every other character is a separator.
//
// Returns a pointer to a static buffer that is overwritten by the next call.
// The result is the empty string once end of input is reached without
// finding another word. A run longer than 1024 characters is split: the
// remainder is returned by the following call(s).
//
// Uses the standard getchar(); on a POSIX system a single-threaded program
// may substitute getchar_unlocked() for speed.
char *getWord( void )
{
    static char newWord[1026];

    size_t charCount = 0;
    int  ch;

    // skip leading separators; the loop leaves the first word character
    // (or EOF) in ch so that it is not lost
    do
    {
        ch = getchar();
    } while( ch != EOF && !isWordChar( ch ) );

    // append desired characters
    while( ch != EOF && isWordChar( ch ) )
    {
        newWord[ charCount ] = (char) ch;
        charCount++;

        // stop before reading further so the rest of an over-long run
        // stays in the stream for the next call
        if( charCount == 1024 )
        {
            break;
        }

        ch = getchar();
    }

    // terminate the string
    newWord[ charCount ] = '\0';

    return newWord;
}

减少单词搜索程序的时间复杂度

输入

输出

2 个答案: