Question

我试图在c中重新创建wc命令，并且在包含机器代码（核心文件或已编译的c）的任何文件中获取正确数量的单词时出现问题。记录单词的数量总是比wc返回的数量少90％。

此处参考是项目信息

编译声明

gcc -ggdb wordCount.c -o wordCount -std=c99

wordCount.c

/*
 *  Author(s)   - Colin McGrath
 *  Description - Lab 3 - WC LINUX
 *  Date        - January 28, 2015
 */

#include<stdio.h>
#include<string.h>
#include<dirent.h>
#include<sys/stat.h>
#include<ctype.h>

struct counterStruct {
    int newlines;
    int words;
    int bt;
};

typedef struct counterStruct ct;

ct totals = {0};

struct stat st;

void wc(ct counter, char *arg)
{
    printf("%6lu %6lu %6lu %s\n", counter.newlines, counter.words, counter.bt, arg);
}

void process(char *arg)
{
    lstat(arg, &st);
    if (S_ISDIR(st.st_mode))
    {
        char message[4056] = "wc: ";
        strcat(message, arg);
        strcat(message, ": Is a directory\n");
        printf(message);
        ct counter = {0};
        wc(counter, arg);
    }
    else if (S_ISREG(st.st_mode))
    {
        FILE *file;
        file = fopen(arg, "r");
        ct currentCount = {0};
        if (file != NULL)
        {
            char holder[65536];
            while (fgets(holder, 65536, file) != NULL)
            {
                totals.newlines++;
                currentCount.newlines++;
                int c = 0;
                for (int i=0; i<strlen(holder); i++)
                {
                    if (isspace(holder[i]))
                    {
                        if (c != 0)
                        {
                            totals.words++;
                            currentCount.words++;
                            c = 0;
                        }
                    }
                    else
                        c = 1;
                }
            }
        }
        currentCount.bt = st.st_size;
        totals.bt = totals.bt + st.st_size;
        wc(currentCount, arg);
    }
}

int main(int argc, char *argv[])
{
    if (argc > 1)
    {
        for (int i=1; i<argc; i++)
        {
            //printf("%s\n", argv[i]);
            process(argv[i]);
        }
    }

    wc(totals, "total");

    return 0;
}

示例wc输出：

    135     742  360448 /home/cpmcgrat/53/labs/lab-2/core.22321
    231    1189  192512 /home/cpmcgrat/53/labs/lab-2/core.26554
   5372   40960  365441 /home/cpmcgrat/53/labs/lab-2/file
     24     224   12494 /home/cpmcgrat/53/labs/lab-2/frequency
     45     116     869 /home/cpmcgrat/53/labs/lab-2/frequency.c
   5372   40960  365441 /home/cpmcgrat/53/labs/lab-2/lineIn
     12      50    1013 /home/cpmcgrat/53/labs/lab-2/lineIn2
      0       0       0 /home/cpmcgrat/53/labs/lab-2/lineOut
     39     247   11225 /home/cpmcgrat/53/labs/lab-2/parseURL
    138     318    2151 /home/cpmcgrat/53/labs/lab-2/parseURL.c
     41     230   10942 /home/cpmcgrat/53/labs/lab-2/roman
     66     162    1164 /home/cpmcgrat/53/labs/lab-2/roman.c
     13      13      83 /home/cpmcgrat/53/labs/lab-2/romanIn
     13      39     169 /home/cpmcgrat/53/labs/lab-2/romanOut
      7       6     287 /home/cpmcgrat/53/labs/lab-2/URLs
  11508   85256 1324239 total

示例重建输出（./wordCount）：

   139     76 360448 /home/cpmcgrat/53/labs/lab-2/core.22321
   233    493 192512 /home/cpmcgrat/53/labs/lab-2/core.26554
  5372  40960 365441 /home/cpmcgrat/53/labs/lab-2/file
    25      3  12494 /home/cpmcgrat/53/labs/lab-2/frequency
    45    116    869 /home/cpmcgrat/53/labs/lab-2/frequency.c
  5372  40960 365441 /home/cpmcgrat/53/labs/lab-2/lineIn
    12     50   1013 /home/cpmcgrat/53/labs/lab-2/lineIn2
     0      0      0 /home/cpmcgrat/53/labs/lab-2/lineOut
    40      6  11225 /home/cpmcgrat/53/labs/lab-2/parseURL
   138    318   2151 /home/cpmcgrat/53/labs/lab-2/parseURL.c
    42      3  10942 /home/cpmcgrat/53/labs/lab-2/roman
    66    162   1164 /home/cpmcgrat/53/labs/lab-2/roman.c
    13     13     83 /home/cpmcgrat/53/labs/lab-2/romanIn
    13     39    169 /home/cpmcgrat/53/labs/lab-2/romanOut
     7      6    287 /home/cpmcgrat/53/labs/lab-2/URLs
 11517  83205 1324239 total

注意字数（第二个int）与前两个文件（核心文件）以及roman文件和parseURL文件（机器代码，没有扩展名）的区别。

Answer 1

C字符串不存储它们的长度。它们由单个NUL（0）字节终止。

因此，strlen需要逐个字符地扫描整个字符串，直到它到达NUL。这就是：

for (int i=0; i<strlen(holder); i++)

极度低效：对于holder中的每个字符，都需要计算holder中的所有字符，以便测试i是否仍在范围内。这会将简单的线性Θ(N)算法转换为Θ(N²)周期刻录机。

但在这种情况下，它也会产生错误的结果，因为二进制文件通常包含许多NUL个字符。由于strlen实际上会告诉您第一个NUL的位置，而不是“行”的长度，因此您最终会跳过文件中的大量字节。（从好的方面来说，这使得扫描在二次方面变得更快，但更快地计算错误结果并不是真正的胜利。）

您不能使用fgets来读取二进制文件，因为fgets接口不会告诉您它读取了多少。您可以使用Posix 2008 getline界面，也可以使用fread进行二进制输入，这样效率更高但会强制您自行计算换行符。（这不是世界上最糟糕的事情;你似乎也错了。）

或者，当然，您可以使用fgetc一次读取一个字符的文件。对于学校运动来说，这不是一个糟糕的解决方案;生成的代码易于编写和理解，fgetc的典型实现比FUD指示的更有效。

有问题迭代机器代码

1 个答案: