Question

嘿伙计们我试图读取workersinfo.txt并将其存储到二维char数组中。该文件大约有4,000,000行，每行约100个字符。我想将每个文件行存储在数组上。不幸的是，我得到退出代码139（内存不足）。我知道我必须使用malloc()和free()，但我已经尝试了几件事情，但我还没有能够让它们发挥作用。最后我必须按ID号对数组进行排序，但我仍然坚持声明数组。该文件看起来像这样：

First Name, Last Name,Age, ID
Carlos,Lopez,,10568
Brad, Patterson,,20586
Zack, Morris,42,05689

到目前为止，这是我的代码：

#include <stdio.h>
#include <stdlib.h>

int main(void) {


FILE *ptr_file;
char workers[4000000][1000];




ptr_file =fopen("workersinfo.txt","r");
if (!ptr_file)
    perror("Error");


int i = 0;
while (fgets(workers[i],1000, ptr_file)!=NULL){

    i++;
}

int n;
for(n = 0; n < 4000000; n++)
{
    printf("%s", workers[n]);
}

fclose(ptr_file);
return 0;
}

Answer 1

堆栈内存有限。正如你在问题中所指出的那样，你必须使用malloc来分配这么大的（需要我说是巨大的）内存块，因为堆栈不能包含它。

您可以使用ulimit查看系统的限制（通常包括堆栈大小限制）。

在我的Mac上，限制为8Mb。运行ulimit -a后，我得到：

...
stack size              (kbytes, -s) 8192
...

或者，使用以下方法测试限制：

struct rlimit slim;
getrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur // the stack limit

我真的建议您单独处理每个数据库条目。

正如评论中所提到的，在大多数实现中，将内存分配为静态内存会绕过堆栈。

仍然，恕我直言，分配400MB的内存（或4GB，取决于你看到的问题的哪一部分），除非完全需要，否则是不好的形式 - 特别是对于单个功能。

后续Q1：如何单独处理每个数据库条目

我希望我没有做你的家庭作业或任何事情......但我怀疑你的家庭作业会包含一项将400Mb数据加载到电脑记忆中的任务......所以...回答你评论中的问题：

下面的单一输入处理草图并不完美 - 每个条目限制为1Kb的数据（我认为这对于这样的简单数据来说已经足够了）。

另外，我没有允许UTF-8编码或类似的东西（我遵循假设会使用英语）。

从代码中可以看出，我们分别读取每一行并执行错误检查以检查数据是否有效。

要按ID对文件进行排序，您可能会考虑一次运行两行（这将是一个慢速排序）并对它们进行排序，或创建一个带有ID数据和位置的排序node树。文件中的行（在读取行之前获取位置）。对二叉树进行排序后，可以对数据进行排序......

...二叉树可能会有点大。你查了sorting algorithms吗？

#include <stdio.h>

// assuming this is the file structure:
//
// First Name, Last Name,Age, ID
// Carlos,Lopez,,10568
// Brad, Patterson,,20586
// Zack, Morris,42,05689
//
// Then this might be your data structure per line:
struct DBEntry {
  char* last_name;        // a pointer to the last name
  char* age;              // a pointer to the name - could probably be an int
  char* id;               // a pointer to the ID
  char first_name[1024];  // the actual buffer...
  // I unified the first name and the buffer since the first name is first.
};

// each time you read only a single line, perform an error check for overflow
// and return the parsed data.
//
// return 1 on sucesss or 0 on failure.
int read_db_line(FILE* fp, struct DBEntry* line) {
  if (!fgets(line->first_name, 1024, fp))
    return 0;
  // parse data and review for possible overflow.

  // first, zero out data
  int pos = 0;
  line->age = NULL;
  line->id = NULL;
  line->last_name = NULL;

  // read each byte, looking for the EOL marker and the ',' seperators
  while (pos < 1024) {
    if (line->first_name[pos] == ',') {
      // we encountered a devider. we should handle it.

      // if the ID feild's location is already known, we have an excess comma.
      if (line->id) {
        fprintf(stderr, "Parsing error, invalid data - too many fields.\n");
        return 0;
      }
      // replace the comma with 0 (seperate the strings)
      line->first_name[pos] = 0;
      if (line->age)
        line->id = line->first_name + pos + 1;
      else if (line->last_name)
        line->age = line->first_name + pos + 1;
      else
        line->last_name = line->first_name + pos + 1;
    } else if (line->first_name[pos] == '\n') {
      // we encountered a terminator. we should handle it.
      if (line->id) {
        // if we have the id string's possition (the start marker), this is a
        // valid entry and we should process the data.
        line->first_name[pos] = 0;
        return 1;
      } else {
        // we reached an EOL without enough ',' seperators, this is an invalid
        // line.
        fprintf(stderr, "Parsing error, invalid data - not enough fields.\n");
        return 0;
      }
    }
    pos++;
  }
  // we ran through all the data but there was no EOL marker...
  fprintf(stderr,
          "Parsing error, invalid data (data overflow or data too large).\n");
  return 0;
}

// the main program
int main(int argc, char const* argv[]) {
  // open file
  FILE* ptr_file;
  ptr_file = fopen("workersinfo.txt", "r");
  if (!ptr_file)
    perror("File Error");

  struct DBEntry line;

  while (read_db_line(ptr_file, &line)) {
    // do what you want with the data... print it?
    printf(
        "First name:\t%s\n"
        "Last name:\t%s\n"
        "Age:\t\t%s\n"
        "ID:\t\t%s\n"
        "--------\n",
        line.first_name, line.last_name, line.age, line.id);
  }

  // close file
  fclose(ptr_file);
  return 0;
}

后续Q2：为400MB-4GB数据排序数组

恕我直言，400MB已经涉及与大数据相关的问题。例如，就性能而言，在数据库上实施冒泡排序应该是令人痛苦的（除非它是一次性任务，性能可能无关紧要。）

创建一个DBEntry对象数组最终会让你获得比实际数据更大的内存占用量。

这将不是对大数据进行排序的最佳方式。

正确的方法取决于您的排序算法。 Wikipedia has a decent primer on sorting algorythms

由于我们正在处理大量数据，因此需要考虑以下几点：

分割工作是有意义的，因此不同的线程/进程会对数据的不同部分进行排序。
我们需要将IO最小化到硬盘驱动器（因为它会显着减慢排序速度，并阻止在同一台机器/磁盘上进行并行处理。）

一种可能的方法是为堆排序创建堆，但只存储优先级值并将原始位置存储在文件中。

另一个选择可能是采用分而治之的算法，例如quicksort，再次，只对计算的排序值和条目在原始文件中的位置进行排序。

无论哪种方式，编写一个像样的排序方法将是一项复杂的任务，可能涉及线程，分叉，临时文件或其他技术。

这是一个简化的演示代码......它远未优化，但它演示了二进制排序树的概念，它保存了文件中的排序值和数据位置。

请注意，使用此代码的速度相对较慢（尽管速度不慢）和内存密集型......

另一方面，每个条目需要大约24个字节。对于400万个条目，它是96MB，比400Mb好一些，肯定比4GB好。

#include <stdlib.h>
#include <stdio.h>

// assuming this is the file structure:
//
//    First Name, Last Name,Age, ID
//    Carlos,Lopez,,10568
//    Brad, Patterson,,20586
//    Zack, Morris,42,05689
//
// Then this might be your data structure per line:
struct DBEntry {
  char* last_name;        // a pointer to the last name
  char* age;              // a pointer to the name - could probably be an int
  char* id;               // a pointer to the ID
  char first_name[1024];  // the actual buffer...
  // I unified the first name and the buffer since the first name is first.
};

// this might be a sorting node for a sorted bin-tree:
struct SortNode {
  struct SortNode* next;  // a pointer to the next node
  fpos_t position;        // the DB entry's position in the file
  long value;             // The computed sorting value
}* top_sorting_node = NULL;

// this function will free all the memory used by the global Sorting tree
void clear_sort_heap(void) {
  struct SortNode* node;
  // as long as there is a first node...
  while ((node = top_sorting_node)) {
    // step forward.
    top_sorting_node = top_sorting_node->next;
    // free the original first node's memory
    free(node);
  }
}
// each time you read only a single line, perform an error check for overflow
// and return the parsed data.
//
// return 0 on sucesss or 1 on failure.
int read_db_line(FILE* fp, struct DBEntry* line) {
  if (!fgets(line->first_name, 1024, fp))
    return -1;
  // parse data and review for possible overflow.

  // first, zero out data
  int pos = 0;
  line->age = NULL;
  line->id = NULL;
  line->last_name = NULL;

  // read each byte, looking for the EOL marker and the ',' seperators
  while (pos < 1024) {
    if (line->first_name[pos] == ',') {
      // we encountered a devider. we should handle it.

      // if the ID feild's location is already known, we have an excess comma.
      if (line->id) {
        fprintf(stderr, "Parsing error, invalid data - too many fields.\n");
        clear_sort_heap();
        exit(2);
      }
      // replace the comma with 0 (seperate the strings)
      line->first_name[pos] = 0;
      if (line->age)
        line->id = line->first_name + pos + 1;
      else if (line->last_name)
        line->age = line->first_name + pos + 1;
      else
        line->last_name = line->first_name + pos + 1;
    } else if (line->first_name[pos] == '\n') {
      // we encountered a terminator. we should handle it.
      if (line->id) {
        // if we have the id string's possition (the start marker), this is a
        // valid entry and we should process the data.
        line->first_name[pos] = 0;
        return 0;
      } else {
        // we reached an EOL without enough ',' seperators, this is an invalid
        // line.
        fprintf(stderr, "Parsing error, invalid data - not enough fields.\n");
        clear_sort_heap();
        exit(1);
      }
    }
    pos++;
  }
  // we ran through all the data but there was no EOL marker...
  fprintf(stderr,
          "Parsing error, invalid data (data overflow or data too large).\n");
  return 0;
}

// read and sort a single line from the database.
// return 0 if there was no data to sort. return 1 if data was read and sorted.
int sort_line(FILE* fp) {
  // allocate the memory for the node - use calloc for zero-out data
  struct SortNode* node = calloc(sizeof(*node), 1);
  // store the position on file
  fgetpos(fp, &node->position);
  // use a stack allocated DBEntry for processing
  struct DBEntry line;
  // check that the read succeeded (read_db_line will return -1 on error)
  if (read_db_line(fp, &line)) {
    // free the node's memory
    free(node);
    // return no data (0)
    return 0;
  }
  // compute sorting value - I'll assume all IDs are numbers up to long size.
  sscanf(line.id, "%ld", &node->value);

  // heap sort?

  // This is a questionable sort algorythm... or a questionable implementation.
  // Also, I'll be using pointers to pointers, so it might be a headache to read
  // (it's a headache to write, too...) ;-)
  struct SortNode** tmp = &top_sorting_node;
  // move up the list until we encounter something we're smaller then us,
  // OR untill the list is finished.
  while (*tmp && (*tmp)->value <= node->value)
    tmp = &((*tmp)->next);
  // update the node's `next` value.
  node->next = *tmp;
  // inject the new node into the tree at the position we found
  *tmp = node;
  // return 1 (data was read and sorted)
  return 1;
}

// writes the next line in the sorting
int write_line(FILE* to, FILE* from) {
  struct SortNode* node = top_sorting_node;
  if (!node)   // are we done? top_sorting_node == NULL ?
    return 0;  // return 0 - no data to write
  // step top_sorting_node forward
  top_sorting_node = top_sorting_node->next;
  // read data from one file to the other
  fsetpos(from, &node->position);
  char* buffer = NULL;
  ssize_t length;
  size_t buff_size = 0;
  length = getline(&buffer, &buff_size, from);
  if (length <= 0) {
    perror("Line Copy Error - Couldn't read data");
    return 0;
  }
  fwrite(buffer, 1, length, to);
  free(buffer);  // getline allocates memory that we're incharge of freeing.
  return 1;
}

// the main program
int main(int argc, char const* argv[]) {
  // open file
  FILE *fp_read, *fp_write;
  fp_read = fopen("workersinfo.txt", "r");
  fp_write = fopen("sorted_workersinfo.txt", "w+");
  if (!fp_read) {
    perror("File Error");
    goto cleanup;
  }
  if (!fp_write) {
    perror("File Error");
    goto cleanup;
  }

  printf("\nSorting");
  while (sort_line(fp_read))
    printf(".");
  // write all sorted data to a new file
  printf("\n\nWriting sorted data");
  while (write_line(fp_write, fp_read))
    printf(".");
// clean up - close files and make sure the sorting tree is cleared
cleanup:
    printf("\n");
  fclose(fp_read);
  fclose(fp_write);
  clear_sort_heap();
  return 0;
}

二维字符数组太大退出代码139

1 个答案:

后续Q1：如何单独处理每个数据库条目

后续Q2：为400MB-4GB数据排序数组