Question

我需要添加代码以删除重复的单词，在检查时（例如，在文本中有几个单词'book'并且它显示为重复的单词，如书本），在编译时输出。这本词典有140 000字。请告诉我如何检查唯一性，以便将不相同的单词写入'uniq'数组

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <string.h>

#include "dictionary.h"
#undef calculate
#undef getrusage

// default dictionary
#define DICTIONARY "dictionaries/large"

// prototype
double calculate(const struct rusage* b, const struct rusage* a);

int main(int argc, char* argv[])
{
    // check for correct number of args
    if (argc != 2 && argc != 3)
    {
        printf("Usage: speller [dictionary] text\n");
        return 1;
    }

    // structs for timing data
    struct rusage before, after;

    // benchmarks
    double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

    // determine dictionary to use
    char* dictionary = (argc == 3) ? argv[1] : DICTIONARY;

    // load dictionary
    getrusage(RUSAGE_SELF, &before);
    bool loaded = load(dictionary);
    getrusage(RUSAGE_SELF, &after);

    // abort if dictionary not loaded
    if (!loaded)
    {
        printf("Could not load %s.\n", dictionary);
        return 1;
    }

    // calculate time to load dictionary
    time_load = calculate(&before, &after);

    // try to open text
    char* text = (argc == 3) ? argv[2] : argv[1];

    //The variable fp contains the literary text
    FILE* fp = fopen(text, "r");

    if (fp == NULL)
    {
        printf("Could not open %s.\n", text);
        unload();
        return 1;
    }

    // prepare to report misspellings
    printf("\nMISSPELLED WORDS\n\n");

    // prepare to spell-check
    int index = 0, misspellings = 0, words = 0;
    char word[LENGTH+1];

    //uniq_array
    char uniq[300][50];

    //count no uniq words
    int countsUniq = 0;

    // spell-check each word in text
    for (int c = fgetc(fp); c != EOF; c = fgetc(fp))
    {
        // allow only alphabetical characters and apostrophes
        if (isalpha(c) || (c == '\'' && index > 0))
        {
            // append character to word
            word[index] = c;
            index++;

            // ignore alphabetical strings too long to be words
            if (index > LENGTH)
            {
                // consume remainder of alphabetical string
                while ((c = fgetc(fp)) != EOF && isalpha(c));

                // prepare for new word
                index = 0;
            }
        }

        // ignore words with numbers (like MS Word can)
        else if (isdigit(c))
        {
            // consume remainder of alphanumeric string
            while ((c = fgetc(fp)) != EOF && isalnum(c));

            // prepare for new word
            index = 0;
        }

        // we must have found a whole word
        else if (index > 0)
        {
            // terminate current word
            word[index] = '\0';

            // update counter
            words++;

            // check word's spelling
            getrusage(RUSAGE_SELF, &before);
            bool misspelled = !check(word);
            getrusage(RUSAGE_SELF, &after);

            // update benchmark
            time_check += calculate(&before, &after);


            // prepare for next word
            index = 0;
            // print word if misspelled
            if (misspelled)
            {
                //here takes place check for uniqueness and record to an array
                misspellings++;
                for(int j = 0; j < 300; j++){
                    if(strcmp(uniq[j], word) == 0){
                        countsUniq++;
                        break;
                    }
                }
                if(countsUniq == 0){
                    for(int i = 0; i < 300; i++){
                        if(strcmp(uniq[i], "") == 0){
                            strcpy(uniq[i], word);
                            break;
                        }
                    }
                }
                countsUniq = 0;
            }
        }
    }

    for(int i = 0; i < 300; i++){
       printf("%s\n", uniq[i]);
    }

    // check whether there was an error
    if (ferror(fp))
    {
        fclose(fp);
        printf("Error reading %s.\n", text);
        unload();
        return 1;
    }

    // close text
    fclose(fp);

    // determine dictionary's size
    getrusage(RUSAGE_SELF, &before);
    unsigned int n = size();
    getrusage(RUSAGE_SELF, &after);

    // calculate time to determine dictionary's size
    time_size = calculate(&before, &after);

    // unload dictionary
    getrusage(RUSAGE_SELF, &before);
    bool unloaded = unload();
    getrusage(RUSAGE_SELF, &after);

    // abort if dictionary not unloaded
    if (!unloaded)
    {
        printf("Could not unload %s.\n", dictionary);
        return 1;
    }

    // calculate time to unload dictionary
    time_unload = calculate(&before, &after);

    // report benchmarks
    printf("\nWORDS MISSPELLED:     %d\n", misspellings);
    printf("WORDS IN DICTIONARY:  %d\n", n);
    printf("WORDS IN TEXT:        %d\n", words);
    printf("TIME IN load:         %.2f\n", time_load);
    printf("TIME IN check:        %.2f\n", time_check);
    printf("TIME IN size:         %.2f\n", time_size);
    printf("TIME IN unload:       %.2f\n", time_unload);
    printf("TIME IN TOTAL:        %.2f\n\n",
     time_load + time_check + time_size + time_unload);

    return 0;
}

/**
 * Returns number of seconds between b and a.
 */
double calculate(const struct rusage* b, const struct rusage* a)
{
    if (b == NULL || a == NULL)
    {
        return 0.0;
    }
    else
    {
        return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
                 (b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
                ((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
                 (b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
                / 1000000.0);
    }
}

在输出中，我收到了一些单词和符号形式的列表，不应该到达那里，有些行显示为空，我也不知道为什么：

nonproprietary
s
F
IS'
MERCHANTIBILITY
unenforceability




Q@
<
=
@

提前感谢您的帮助。

Answer 1

编译字符串通常通过库函数strcmp完成。无法通过==运算符比较字符串。

此外，=运营商无法分配字符串。

uniq[misspellings][j] = word[j]; // it will not work

使用strcpy复制字符串。

这是一个简单的程序来说明上述概念。可以帮助您理解您的问题。

#include <stdio.h>
#include <string.h>

#define LENGTH 30
#define ROWS    5

int main(void) {

    char word[LENGTH+1] = "word";
    char uniq[ROWS][LENGTH+1] = { "eva", "buba", "word" , "1235",  "stop"};

    for(int j = 0; j < ROWS; j++){

        if(strcmp(uniq[j], word) == 0) // find word
        {
            printf("We have found: <%s>\n", word);
            // replacing with "hello":
            strcpy(uniq[j], "hello");
        }
    }

    printf("Strings in uniq:\n", word);
    for(int j = 0; j < ROWS; j++){
        printf("%s ", uniq[j]);
    }

  return 0;
}

输出：

We have found: <word>                                                                                                                         
Strings in uniq:                                                                                                                              
eva buba hello 1235 stop

记录数组

1 个答案: