问题

Question

问题

当我使用大文本文件（来自项目Gutenberg：Alice In Wonderland）时出现内存问题，但是在一些较小的文本文件中没有出现（两行测试文本文件和Maya Angelou诗））。

在大文本文件中，我收到分段错误，当使用Valgrind时，它会报告“无效写入大小为1”和“无效读取大小为1”。在检查时，它似乎是我写的一个函数，它从传入的行中获取每个单词。在分配大小为50的块之后，似乎抱怨单个地址位置为0字节。

我查看了我的代码，其中我将50个字符的malloc指向char指针，但我不确定在较小的文本文件中出错的大文本文件中出了什么问题。还有什么看起来很奇怪的是当我在我的调试模式下运行时，我把它写到最后并达到我用feof（fp）验证的EOF。

我希望有人能够发现正在发生的事情以及我错过了什么，因为我不经常使用C编程。提前感谢您帮助了解正在发生的事情。

计划概览

我拿出了我正在编写的程序部分并将其放入一个简单的主程序中以使事情更清晰，并希望能够更容易地发现问题。该计划大致分为：

传入文本文件
用“r +”
逐行循环使用fgets
在每行中用'\ 0'替换'\ n'或'\ r'
遍历行并提取每个单词（由isspace（）标记）直到达到'\ 0'
哈希词
使用的免费指针
关闭文件指针

Valgrind输出显示getWord（）函数中发生的问题。我已经看了它并尝试逐个字符地输出它并检查它，但我不知道为什么会发生段错误并且只在大文本文件中。

代码

的main.c

/*
 * License: GPLv3
 *
 * File: main.c
 *
 * Description: A program.
 *
 * Author:  Brandon Authier (hblkr)
 * Date:    6 Aug 2017
 * Version: 1.0
 *
 */

// Includes for program to run
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/types.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>

// Global debug
bool DEBUG = false;

/* A djb2 hash function sourced online.
 */
unsigned long hash(unsigned char *str)
{
    unsigned long hash = 5381;
    int c;

    while (c = *str++)
    hash = ((hash << 5) + hash) + c; /* hash * 33 + c */

    return hash;
}

/*
 * Using a file pointer, get the first word and then pass a copy of the word
 * back
 *
 * @returns: a boolean of true when word is built
 *
 * TODO: Find issue in here causing memory error
 *
 */
bool getWord(char* line, int* idx, char* word)
{
    int wordIdx = 0;

    // Build word character by character
    for ( ; line[*idx] != '\0'; *idx = (*idx + 1))
    {
        if (isalpha(line[*idx]) || (line[*idx] == '-'))
        {
            word[wordIdx++] = tolower(line[*idx]);
        }
        else if (isspace(line[*idx]))
        {
            *idx += 1;
            return true;
        }
    }

    return true;
}

/*
 * Process file. Tokenize each line and process each word.
 *
 * TODO: Process file.
 */
void processFile(FILE* textFp)
{
    // Variables to hold:
    //   a line for text
    //   a word once it is parsed
    //   an index to keep track of the line
    char line[1024] = "";
    unsigned char* word = malloc(sizeof(unsigned char) * 50);
    int* lineIdx = malloc(sizeof(int));
    int lineCount = 1;

    // Set the line index to keep track of the line
    *lineIdx = 0;

    while (fgets(line, sizeof(line), textFp) != NULL)
    {
        // Get line character Count
        int charcount = 0;
        int wordCount = 1;

        for(int m = 0; line[m] != '\0'; m++)
        {
            // By counting spaces, you can get a rough estimate of how many words
            // are in each line. (totalSpaces + 1)
            if ((line[m] == ' ') && (line[m - 1] != ' '))
            {
                 wordCount++;
            }

            if(line[m] != '\n' && line[m] != '\r')
            {
                charcount++;
            }
            else
            {
                line[m] = '\0';
            }
        }

        if (DEBUG == true)
        {
            fprintf(stdout, "line %d:\n", lineCount);
            fprintf(stdout, "  words in line: %d\n", wordCount);
            fprintf(stdout, "  charcount: %d\n", charcount);
            fprintf(stdout, "  lineIdx: %d\n", *lineIdx);
            fprintf(stdout, "  value: \"%s\"\n\n", line);
        }

        // Get word
        while (*lineIdx < (charcount - 1))
        {
            // Sanitize word
            for (int i = 0; i < 50; i++)
            {
                word[i] = '\0';
            }

            getWord(line, lineIdx, word);
            unsigned long hash_output = hash(word);

            if (DEBUG == true)
            {
                fprintf(stdout, "key: %10d,\t", hash_output);
                fprintf(stdout, "value: %8s,\t", word);
                fprintf(stdout, "lineIdx: %2d\n", *lineIdx);
            }
        } // End while for word

        if (DEBUG == true) { fprintf(stdout, "\n========\n\n"); }

        // Reset line index to 0 for new line
        *lineIdx = 0;
        lineCount++;
    } // End while for line

    if (DEBUG == true) { if (feof(textFp)) { fprintf(stderr, "Reached FEOF.\n"); } }

    // Free pointers
    free(lineIdx);
    free(word);
}


// Main
int main (int argc, char* argv[])
{
  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  //        VERIFY COMMAND LINE ARGUMENTS NECESSARY FOR PROGRAM
  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    // User did not pass in any argument
    if (argc == 1)
    {
        fprintf(stderr, "usage: main afile.txt\n");
        exit(-1);
    }

    // Grab text file, possibly turn on debug, and ignore other arguments
    if (argc >= 3)
    {
        // For debug purposes
        if (strcmp("-d", argv[2]) == 0)
        {
            DEBUG = true;
            fprintf(stdout, "+++++++++++++++++++++++++++++++++++++++\n");
            fprintf(stdout, "+            [DEBUGGING ON]           +\n");
            fprintf(stdout, "+++++++++++++++++++++++++++++++++++++++\n\n");
        }
    }


  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  //        PROCESS PASSED IN TEXT FILE
  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    // Open file for reading
    FILE* fp = fopen(argv[1], "r+");

    // If fp is NULL, file does not exist
    if (fp == 0)
    {
        fprintf(stderr, "File does not exist.\n");
        exit(1);
    }
    if (DEBUG == true) { fprintf(stdout, "File exists.\n"); }
    if (DEBUG == true)
    {
        fprintf(stdout, "\n");
        fprintf(stdout, "================================================================================\n");
    }

    // Process file
    processFile(fp);

    // Close file pointer
    if (fclose(fp) != 0)
    {
        fprintf(stderr, "File did not close.\n");
    }
    if (DEBUG == true) { fprintf(stdout, "File closed.\n"); }
    if (DEBUG == true)
    {
        fprintf(stdout, "================================================================================\n");
        fprintf(stdout, "\n");
    }


  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  //        FREE ALL MEMORY THAT HASN'T BEEN FREED YET
  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


    exit(0);
}

以下功能似乎是以某种方式出现问题的地方。

屏幕取词（）

/*
 * Using a file pointer, get the first word and then pass a copy of the word
 * back
 *
 * @returns: a boolean of true when word is built
 *
 * TODO: Find issue in here causing memory error
 *
 */
bool getWord(char* line, int* idx, char* word)
{
    int wordIdx = 0;

    // Build word character by character
    for ( ; line[*idx] != '\0'; *idx = (*idx + 1))
    {
        if (isalpha(line[*idx]) || (line[*idx] == '-'))
        {
            word[wordIdx++] = tolower(line[*idx]);
        }
        else if (isspace(line[*idx]))
        {
            *idx += 1;
            return true;
        }
    }

    return true;
}

错误输出

编译后再运行是我在调试模式下运行时得到的输出（调试实际上只是我自己的详细模式）：

./main alice.txt

Segmentation fault (core dumped)

valgrind -q --leak-check=full ./main alice.txt

==7320== Invalid write of size 1
==7320==    at 0x400A24: getWord (in /tmp/main)
==7320==    by 0x400C7B: processFile (in /tmp/main)
==7320==    by 0x400F32: main (in /tmp/main)
==7320==  Address 0x51f62e2 is 0 bytes after a block of size 50 alloc'd
==7320==    at 0x4C28BF6: malloc (vg_replace_malloc.c:299)
==7320==    by 0x400AE5: processFile (in /tmp/main)
==7320==    by 0x400F32: main (in /tmp/main)
==7320== 
==7320== Invalid read of size 1
==7320==    at 0x400972: hash (in /tmp/main)
==7320==    by 0x400C87: processFile (in /tmp/main)
==7320==    by 0x400F32: main (in /tmp/main)
==7320==  Address 0x51f62e2 is 0 bytes after a block of size 50 alloc'd
==7320==    at 0x4C28BF6: malloc (vg_replace_malloc.c:299)
==7320==    by 0x400AE5: processFile (in /tmp/main)
==7320==    by 0x400F32: main (in /tmp/main)
==7320==

文本文件

以下是我测试过程序的3个：

的test.txt

This isn't only a test, it's a lot of fun!
How did I get-here?... Well, I'm not sure either.

maya.txt

Pretty women wonder where my secret lies.
I'm not cute or built to suit a fashion model's size
But when I start to tell them,
They think I'm telling lies.
I say,
It's in the reach of my arms
The span of my hips,
The stride of my step,
The curl of my lips.
I'm a woman
Phenomenally.
Phenomenal woman,
That's me.

I walk into a room
Just as cool as you please,
And to a man,
The fellows stand or
Fall down on their knees.
Then they swarm around me,
A hive of honey bees.
I say,
It's the fire in my eyes,
And the flash of my teeth,
The swing in my waist,
And the joy in my feet.
I'm a woman
Phenomenally.
Phenomenal woman,
That's me.

Men themselves have wondered
What they see in me.
They try so much
But they can't touch
My inner mystery.
When I try to show them
They say they still can't see.
I say,
It's in the arch of my back,
The sun of my smile,
The ride of my breasts,
The grace of my style.
I'm a woman

Phenomenally.
Phenomenal woman,
That's me.

Now you understand
Just why my head's not bowed.
I don't shout or jump about
Or have to talk real loud.
When you see me passing
It ought to make you proud.
I say,
It's in the click of my heels,
The bend of my hair,
the palm of my hand,
The need of my care,
'Cause I'm a woman
Phenomenally.
Phenomenal woman,
That's me.

alice.txt

以下是text

Answer 1

在comment，Brandon Authier声称发布的代码靠近MCVE（Minimal, Complete, Verifiable Example） - 它只有227行。

我认为227行的文件超过必要的两倍;它不是MCVE。

以下代码保存在文件so-4578-8729-mcve.c中。它有96行，并在使用GCC 7.2.0和Valgrind 3.13.0.SVN运行macOS Sierra 10.12.6的Mac上使用以下命令编译时干净利落地编译：

$ gcc -O3 -g -std=c11 -Wall -Wextra -Werror -Wmissing-prototypes \
>     -Wstrict-prototypes so-4578-8729-mcve.c -o so-4578-8729-mcve
$

它在Valgrind的“爱丽丝梦游仙境”中干净地运行＆＃39;：

$ valgrind --suppressions=etc/suppressions-macos-10.12.5 -- \
>          so-4578-8729-mcve src/data-files/alice-in-wonderland-pg19033.txt 
==12363== Memcheck, a memory error detector
==12363== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==12363== Using Valgrind-3.13.0.SVN and LibVEX; rerun with -h for copyright info
==12363== Command: so-4578-8729-mcve src/data-files/alice-in-wonderland-pg19033.txt
==12363== 
==12363== 
==12363== HEAP SUMMARY:
==12363==     in use at exit: 18,188 bytes in 161 blocks
==12363==   total heap usage: 180 allocs, 19 frees, 28,482 bytes allocated
==12363== 
==12363== LEAK SUMMARY:
==12363==    definitely lost: 0 bytes in 0 blocks
==12363==    indirectly lost: 0 bytes in 0 blocks
==12363==      possibly lost: 0 bytes in 0 blocks
==12363==    still reachable: 0 bytes in 0 blocks
==12363==         suppressed: 18,188 bytes in 161 blocks
==12363== 
==12363== For counts of detected and suppressed errors, rerun with: -v
==12363== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 2 from 2)
$

修复后的代码包含BLUEPIXY中由comment标识的错误修复程序。关于unsigned char vs（plain）char，它也更清晰。它没有调试代码或注释。

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool getWord(char *line, int *idx, char *word)
{
    int wordIdx = 0;

    for ( ; line[*idx] != '\0'; *idx = (*idx + 1))
    {
        if (isalpha((unsigned char)line[*idx]) || (line[*idx] == '-'))
        {
            word[wordIdx++] = tolower((unsigned char)line[*idx]);
        }
        else if (isspace((unsigned char)line[*idx]))
        {
            *idx += 1;
            return true;
        }
    }

    return true;
}

static void processFile(FILE *textFp)
{
    char line[1024] = "";
    char *word = malloc(sizeof(unsigned char) * 50);
    int *lineIdx = malloc(sizeof(int));
    int lineCount = 1;

    *lineIdx = 0;

    while (fgets(line, sizeof(line), textFp) != NULL)
    {
        int charcount = 0;
        int wordCount = 1;

        for (int m = 0; line[m] != '\0'; m++)
        {
            if ((line[m] == ' ') && (m == 0 || line[m - 1] != ' '))
            {
                wordCount++;
            }
            if (line[m] != '\n' && line[m] != '\r')
            {
                charcount++;
            }
            else
            {
                line[m] = '\0';
            }
        }

        while (*lineIdx < (charcount - 1))
        {
            for (int i = 0; i < 50; i++)
            {
                word[i] = '\0';
            }
            getWord(line, lineIdx, word);
        }

        *lineIdx = 0;
        lineCount++;
    }

    free(lineIdx);
    free(word);
}

int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s afile.txt\n", argv[0]);
        exit(-1);
    }

    FILE *fp = fopen(argv[1], "r+");

    if (fp == 0)
    {
        fprintf(stderr, "Failed to open file '%s' for reading\n", argv[1]);
        exit(1);
    }

    processFile(fp);

    if (fclose(fp) != 0)
        fprintf(stderr, "Failed to close file '%s'.\n", argv[1]);

    return(0);
}

这几乎是微不足道的;它仍然可以进一步减少。 -Wmissing-prototypes -Wstrict-prototypes选项要求函数声明static - 或在定义之前声明。由于它们不需要在此源文件之外可见，因此它们是static。这样做的一个优点是编译器可以告诉我hash()的结果未被使用，因此可以删除调用，并且当删除调用时，hash()函数未被使用，因此它可以删除。不是每个人都使用如此严格的编译选项，但我更喜欢他们给我的保证。

Answer 2

反馈

首先，我要感谢大家的反馈，因为大多数人都非常乐于助人。

似乎我的预设是社区很可能将我的代码复制/粘贴到编辑器中然后编译和运行有问题。这个预设让我相信我发布的代码当前数量很好，但Jonathan和Antti都表示他们的偏好和MVCE代码较少。我将确保在未来发布更少的内容，因为这是指南和社区更喜欢的内容。

至于我的问题和答案，这最终要求我回过头来深入研究创建char指针变量的正确方法。从所有反馈中，通过不同的例子，我提出了一个解决方案。

最后，感谢BluePixy和Jonathan，我指出了我的行变量的可能出界区域索引，我能够解决这个问题。不幸的是，这不是问题，但是我忽视了一个很好的问题。

建议的答案

我很欣赏清理过的代码Jonathan，但是你提供的代码仍然以完全相同的方式出错并且读取和写入都是无效的。我完全按照它编写的方式复制/粘贴它并用完全相同的方式编译它命令，但它与我的原始程序有相同的问题。我很确定这是因为这一行：

char *word = malloc(sizeof(unsigned char) * 50);

此外，将“unsigned char”更改为“char”在这里很好，但在我的原始程序中，我有一个带有unsigned char的散列函数，并且保持签名一致是为什么它是unsigned char。然后你必须在getWord（）中转换为unsigned char。但是，这对于这个问题并不重要，因为你可以在周围使用char指针，所以这里很好。

我无法确定为什么它会在您的Mac上运行而在我的Linux发行版上失败。解决我的问题的简短答案是将我的“单词”变量声明为：

unsigned char word[50] = "";

在某个时刻，在分配50个块后，该字为零字节。这是我最初的混淆点，我不明白为什么会发生这种情况，或者为什么使用上面的变量赋值来解决这个问题。很明显，这是因为我对内存管理不满意并且无法正确执行某些操作。如果你知道为什么，我会很感激对错误的解释。

答案

似乎是一个很小的变化：

unsigned char *word = malloc(sizeof(unsigned char) * 50);

为：

unsigned char word[50];
memset(word, '\0', (50 * sizeof(unsigned char)));

第一个是创建一个包含50个字符的无符号字符指针的不正确方法吗？我正在考虑，因为第二个工作并解决了这个问题。我对第一个问题并不了解，这就是问题所在。

固定代码如下：

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int countCharsAndStripNewline(char* line) {
    int charCount = 0;
    while ((line[charCount] != '\n' && line[charCount] != '\r'))
        charCount++;
    line[charCount] = '\0';
    return charCount;
}

static bool getWord(char *line, int *idx, unsigned char *word) {
    int wordIdx = 0;
    for ( ; (line[*idx] != '\0' && wordIdx < 50); *idx = (*idx + 1)) {
        if (wordIdx == 49) {
            word[wordIdx] = '\0';
            *idx += 1;
            return true;
        }
        if (isalpha(line[*idx]) || (line[*idx] == '-'))
            word[wordIdx++] = tolower(line[*idx]);
        else if (isspace(line[*idx])) {
            *idx += 1;
             return true;
        }
    }
    return true;
}

static void processFile(FILE* textFp) {
    char line[1024] = "";
    unsigned char word[50] = "";
    int* lineIdx = (int*) malloc(sizeof(int));
    *lineIdx = 0;
    memset(word, '\0', (50 * sizeof(unsigned char)));

    while (fgets(line, sizeof(line), textFp) != NULL) {
        if (strcmp("",   line) == 0) { continue; }
        if (strcmp(" ",  line) == 0) { continue; }
        if (strcmp("\0", line) == 0) { continue; }
        if (strcmp("\n", line) == 0) { continue; }
        if (strcmp("\r", line) == 0) { continue; }

        int charcount = countCharsAndStripNewline(line);

        while (*lineIdx < (charcount - 1)) {
            for (int i = 0; i < 50; i++)
                word[i] = '\0';

            getWord(line, lineIdx, word);
        } // End while for word

        *lineIdx = 0;
    } // End while for line

    free(lineIdx);
}

int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s afile.txt\n", argv[0]);
        exit(-1);
    }

    FILE *fp = fopen(argv[1], "r+");

    if (fp == 0) {
        fprintf(stderr, "Failed to open file '%s' for reading\n", argv[1]);
        exit(1);
    }

    processFile(fp);

    if (fclose(fp) != 0)
        fprintf(stderr, "Failed to close file '%s'.\n", argv[1]);

    return(0);
}

需要注意的一些事项：为了便于阅读，我添加了一个char计数功能，并将该功能分开。我将关键字static（如Jonathan所指出的）添加到每个函数中，以便它可以保留他使用的严格编译标志，并帮助您摆脱未使用的函数，就像他用hash（）函数指出的那样。这是一个有用的编译功能，即使我的实际程序使用哈希函数并且在此示例中是不可或缺的，但它不是也可以删除。我最小化了代码，将其格式化为使用较少的行，并删除了社区建议的所有内容，因此希望这对人们阅读更好。最后，我添加了一些快速行检查，以跳过不重要的事情。

由于删除了hash（）函数，现在不再出现的另一个有趣的事情就是这条特定行上的“条件跳转”：

while (c = *str++)

获得一些反馈会很棒，但是自从Jonathan剥夺了它并且没有在他的Valgrind检查中遇到它，我将不得不将这个问题保存一天。

关于Char指针的简单C程序中的记忆问题

问题

计划概览

代码

错误输出

文本文件

2 个答案:

反馈

建议的答案

答案