减少单词搜索程序的时间复杂度

时间:2018-01-08 09:12:52

标签: c time-complexity

我试图查找文本中最常用的单词。在我的程序中,判断系统(PC ^ 2)将输入一些单词,分隔符和一些文本。单词和文本由" -----"分开。 (我需要在我的程序中搜索最常用的单词)。

输入

apple
an
banana
-----
i like an apple.
i also like apple-pie.
i like banana
i like banana+ice.

输出

apple 2
banana 2
  • 不是数字,字母和' _'应该被视为空间
  • 每行的最长长度为1024
  • 输出顺序应为strcmp顺序

然而,虽然该程序在我测试时运行良好。我从PC ^ 2获得超时限制(TLE)。因此,我想问一下如何才能提高程序的时间复杂度?在显示最频繁的单词及其频率之前,我甚至得到了TLE。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

char tmp[2048];
char **dicWord;
int *dicWcount;
int dic_assume_num = 1000, dic_actual_num = 0;

int main()
{
    char divider[6] = "-----";
    ssize_t bytes_read;
    size_t nbytes = 2047;
    char *my_string;

    // first, get the input of words
    dicWord = malloc( dic_assume_num * sizeof( *dicWord ));
    my_string = malloc((nbytes + 1) * sizeof(*my_string));

    while ((bytes_read = getline (&my_string, &nbytes, stdin)) != -1) {
        my_string[strcspn(my_string, "\n")] = '\0';

        if (strncmp(my_string, divider, 5) == 0) {
            dicWcount = calloc(dic_actual_num+1,  sizeof(*dicWcount));
            break;
        }

        else {
            dicWord[dic_actual_num++] = strdup(my_string);
            if (dic_actual_num >= dic_assume_num) {
                dic_assume_num *= 2;
                dicWord = realloc( dicWord, dic_assume_num * sizeof( *dicWord ));
            }
        }
    }

    // second, get the input of text
    int cnt;
    for (;;) {
        scanf("%*[^a-zA-Z0-9_]");
        cnt = scanf("%2047[a-zA-Z0-9_]", tmp);

        if (cnt != 1) {
            break;
        }


        for ( size_t i = 0; i < dic_actual_num; ++i) {

            if (strcmp(dicWord[i], tmp) == 0) {
                dicWcount[i]++;
                break;
            }
        }

    }

    for ( size_t i = 0; i < dic_actual_num; ++i)
        free(dicWord[i]);

    free(dicWord);
    free(dicWcount);
    free(my_string);

    return 0;
}

2 个答案:

答案 0 :(得分:0)

这个想法很简单:

  • 遍历文本/文件: O(n)
  • 将当前单词添加到已见过的单词/ Hashtable / Map中 或者增加这个单词的计数器: n * log(n)
  • 在遍历文件后:获取具有最高计数器的单词: O(1)

最需要的时间是用于存储单词的数据结构。平衡的树(具有最高计数器的词作为根)应该没问题。

我认为不可能减少所需的时间。

答案 1 :(得分:0)

以下代码:

  1. 干净地编译
  2. 速度相当快
  3. 执行所需的操作,但排序输出
  4. 除外

    现在建议的代码:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    struct myData
    {
        char   word[1026];
        size_t count;
    };
    
    
    struct myData dicWord[ 1000 ];  // may need to increase the 1000
    
    
    int main( void )
    {
        size_t dic_actual_num = 0;
    
        // first, get the input of words
        while ( fgets(dicWord[ dic_actual_num ].word, 1026, stdin) )
        {
            dicWord[ dic_actual_num ].word[ strcspn( dicWord[ dic_actual_num ].word, "\n") ] = '\0';
    
            if (strncmp( dicWord[ dic_actual_num ].word, "-----", 5 ) == 0)
            {
                 break;
            }
    
            dic_actual_num++;
        }
    
        // second, get the input of text and check for highest count
        char tmp[1026];  // +2 for newline and NUL byte
        size_t maxCount = 0;
    
        while( 1 == scanf( "%*[^a-zA-Z0-9_] %1024[a-zA-Z0-9_]", tmp ) )
        {
            for ( size_t i = 0; i < dic_actual_num; ++i)
            {
                if (strcmp(dicWord[i].word, tmp) == 0)
                {
                    dicWord[i].count++;
                    if( dicWord[i].count > maxCount )
                    {
                        maxCount = dicWord[i].count;
                    }
                    break;
                }
            }
        }
    
        // << need to add logic for `strcmp()` sorting
    
    
        // display answer
        for( size_t i = 0; i<dic_actual_num; i++ )
        {
            if( maxCount == dicWord[i].count )
            {
                printf( "%s %lu\n", dicWord[i].word, maxCount );
            }
        }
    
        return 0;
    }
    

    编辑:添加了替换“scanf()”

    调用的函数

    警告:此提议的功能尚未经过测试

    #include <ctype.h>
    #include <stdio.h>
    
    // prototype:
    char *getWord( void );
    
    
    char *getWord()
    {
        static char newWord[1026];
    
        //memset( newWord, '\0', sizeof( newWord ) );
    
        size_t charCount = 0;
        int  ch;
    
        // skip leading junk
        while( (ch = getchar_unlocked) != EOF 
             || '\n' == ch 
             || ispunct( ch ) 
             || isspace( ch ) );
    
        // append desired characters
        while( charCount < 1024
            && (ch = getchar_unlocked) != EOF 
            && ( '_' == ch || isdigit( ch ) || isalpha( ch ) ) )
        {
            newWord[ charCount ] = ch;
            charCount++;
        }
    
        // terminate the string
        newWord[ charCount ] = '\0';
    
        return newWord;
    }