Question

我想要一个函数，它以文件名和字符串向量作为输入，高效地读取该文件，并把文件内容逐行填入该向量。这是我到目前为止所写的代码：

/** \brief Read the whole file in a vector of lines
 */
  int
  readFile(
    const string & pathToFile,
    vector<string> & lines)
  {
    gzFile stream;
    openFile(pathToFile, stream, "rb");

    int errnum;
    const char * error_msg = NULL;

    size_t nb_bytes_to_read = 256000; // 8192 is default for gzbuffer
    if(gzbuffer(stream, nb_bytes_to_read) == -1){
      error_msg = gzerror(stream, &errnum);
      if(errnum != Z_OK){
        cerr << "ERROR: gzbuffer failed with " << nb_bytes_to_read
             << " bytes" << endl;
        cerr << error_msg << endl;
        exit(EXIT_FAILURE);
      }
    }

    size_t buf_len = nb_bytes_to_read;
    char * buf = (char *) malloc(buf_len);
    if(buf == NULL){
      cerr << "ERROR: can't allocate " << nb_bytes_to_read
           << " bytes" << endl;
      exit(EXIT_FAILURE);
    }

    size_t nb_bytes_read = 0, tot_nb_bytes_read = 0;
    while(! gzeof(stream)){
      nb_bytes_read = gzread(stream, buf + tot_nb_bytes_read,
                             nb_bytes_to_read);
      tot_nb_bytes_read += nb_bytes_read;
      if(nb_bytes_read < nb_bytes_to_read && ! gzeof(stream)){
        error_msg = gzerror(stream, &errnum);
        if(errnum != Z_OK){
          cerr << "ERROR: gzread failed on " << pathToFile << endl;
          cerr << error_msg << endl;
          exit(EXIT_FAILURE);
        }
      }
      if(tot_nb_bytes_read == buf_len){
        buf_len += nb_bytes_to_read;
        buf = (char*) realloc(buf, buf_len);
        if(buf == NULL){
          cerr << "ERROR: can't allocate " << nb_bytes_to_read
               << " bytes" << endl;
          exit(EXIT_FAILURE);
        }
      }
    }

    closeFile(pathToFile, stream);

    lines = split(buf, "\n", lines);

    free(buf);

    return 0;
  }

gzread 的文档提到：“如果在 gzip 流之后遇到 gzip 流以外的其他内容，则会忽略剩余的尾随垃圾（并且不会返回错误）”。但是，对于某些文件，我上面的代码会“多读出一行”。更具体地说，输出的“lines”向量有 N 个元素，而输入文件只有 N-1 行。结果，“lines”向量的最后一个元素可能是类似“\223(\305ĿV”这样的乱码。

我该如何解决？

以下是上述代码中使用的其他函数：

  void
  openFile(
    const string & pathToFile,
    gzFile & fileStream,
    const char * mode)
  {
    fileStream = gzopen(pathToFile.c_str(), mode);
    if(fileStream == NULL){
      cerr << "ERROR: can't open file " << pathToFile
           << " with mode " << *mode
           << " (errno=" << errno << ")" << endl;
      exit(EXIT_FAILURE);
    }
  }

  void
  closeFile(
    const string & pathToFile,
    gzFile & fileStream)
  {
    int ret = gzclose(fileStream);
    if(ret != Z_OK){
      cerr << "ERROR: can't close the file " << pathToFile
           << ", gzclose() returned " << ret << endl;
      exit(EXIT_FAILURE);
    }
  }

  vector<string> &
  split(
    char * buf,
    const char * delim,
    vector<string> & tokens)
  {
    tokens.clear();
    char * pch;
    pch = strtok(buf, delim);
    while(pch != NULL){
      tokens.push_back(string(pch));
      pch = strtok(NULL, delim);
    }
    return tokens;
  }

（由于我不是专业程序员，欢迎任何其他建议！）

Answer 1

strtok() 只能处理以空字符（'\0'）结尾的字符串。而您传给它的是从文件（可能是文本文件）读入的缓冲区，其中并没有终止的空字符。因此 strtok() 会越过缓冲区的末尾继续读取，直到在内存中碰巧遇到一个零字节为止。

顺便说一句，strtok()有问题，甚至不是可重入的。阅读strtok和strsep的手册页。

使用gzbuffer快速读取gzip压缩文件，然后逐行分割内容

1 个答案: