使用C ++快速读取文件,使用mmap()和std :: cin()性能结果解释比较不同策略

时间:2019-03-27 14:36:26

标签: c++ linux performance cin mmap

在对我提出的要关闭映射文件(c++ close a open() file read with mmap)的问题提出建议后,我进行了一些比较,并注意到,正如某些用户所建议的那样,std :: cin缓冲区方法的性能类似于我的映射方法。

我决定进行性能比较:每个脚本都会打开一个包含其他文件路径的文件(大约3500个),读取该文件并获取10个随机路径,然后打开所有这10个文件(每行500行,每个大约700个字符)行),并随机读取其中的换行数1000次。

我的原始版本是一种映射方法(但是最后没有关闭文件,并且在打开300-400个文件后出现错误,在这种情况下没有实现(c++ close a open() file read with mmap)。

MMAP.OPEN(v0)

#include <algorithm>
#include <iostream>
#include <cstring>
#include <vector>
#include <set>
#include <typeinfo>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <sstream>
#include <unistd.h>

const char* map_file(const char* fname, size_t& length);
void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector);
void gnomadSubfileAnalysis( std::string &nomeFile );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

int main() {

      //this take the INDEX file (with the paths to the athor) and populate these vectors
      std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/";
      std::string nomeChr = "1";
      std::stringstream streamNomeGnomadChrDir;
      streamNomeGnomadChrDir << gnomadSplitDir << "ex_" << nomeChr << "_5c/" << "chr_" << nomeChr << "/";
      std::string pathGnomadChrDir = streamNomeGnomadChrDir.str();
      std::stringstream streamNomeGnomadChrIndex;
      streamNomeGnomadChrIndex << pathGnomadChrDir << "chr_" << nomeChr << ".txt";
      std::string pathGnomadChrIndex = streamNomeGnomadChrIndex.str();
      std::vector<std::string> vSubfileNames;
      std::vector<int> vSubfileStarts;
      std::vector<int> vSubfileStops;
      std::vector<std::vector<int>> vSubfilePosizioniVector;
      gnomadIndex(pathGnomadChrIndex,
                  vSubfileNames,
                  vSubfileStarts,
                  vSubfileStops,
                  vSubfilePosizioniVector
                );
      std::vector<std::string> vGnomadSubfilePaths;
      srand((unsigned)time(NULL)); //seeds the pseudo random number generator that rand() uses (http://www.cplusplus.com/forum/beginner/29699/)
      int size0 = 10;
      std::vector<int> v0;
      populateVector(v0, size0);

      //the vector with the file names is converted in file paths and then opened and line counted for each file
       std::vector<std::string> vSubfileNames2;
       for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);
       for ( int subCount = 0; subCount < vSubfileNames2.size(); subCount++ ) {
        std::stringstream streamNomeSubfileGnomad;
        streamNomeSubfileGnomad << pathGnomadChrDir << vSubfileNames2[subCount];
        std::string pathGnomadSubfile = streamNomeSubfileGnomad.str();
        gnomadSubfileAnalysis(pathGnomadSubfile);
      }

}




void gnomadSubfileAnalysis( std::string &nomeFile ) {
  size_t length;
  auto f = map_file(nomeFile.c_str(), length);
  auto l = f + length;
  std::vector<int> v0;
  for (int i=0; i<length; i++) if (f[i] == '\n') v0.push_back(i);
  std::cout << "subfile: " << nomeFile << ", has: " << v0.size() << " rows in: " << length << " bytes." << '\n';
}

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  ) {
  size_t length;
  auto f = map_file(nomeFileIndex.c_str(), length);
  auto l = f + length;

  std::vector<int> v0;

  v0.push_back(0); 
  for (int i=0; i<length; i++) {
    if (f[i] == '\n') v0.push_back(i+1);
  }
  v0.pop_back();
  for (int nl = 0; nl < v0.size(); nl++) {
    int ncol = 0;
    std::vector<char> vSubfile;
    std::vector<char> vSubStart;
    std::vector<char> vSubStop;
    std::vector<char> vSubPos;
    std::vector<int> vSubfilePosVector;

    for( int start=v0[nl]; start<v0[nl+1]; start++  ) {
      if (f[start] == ',') ncol++;

      if (ncol == 0) {
        if ( f[start] != ',' ) {
          vSubfile.push_back(f[start]);
        }
      }

      if (ncol == 1) {
        if ( f[start] != ',' ) {
          vSubStart.push_back(f[start]);
        }
      }

      if (ncol == 2) {
        if ( f[start] != ',' ) {
          vSubStop.push_back(f[start]);
        }
      }

      if (ncol == 3) {
        if ( f[start] != ',' ) {
          if ( f[start] != ':' ) {
            vSubPos.push_back(f[start]);
          }
          if ( f[start] == ':' || f[start] == '\n' ) {

              std::string subfilePosValue ( vSubPos.begin(), vSubPos.end() );
              vSubPos.clear();

              vSubfilePosVector.push_back( stoi(subfilePosValue) );
              subfilePosValue.erase();
            }
          }
        }
      }

  std::string subFileValue ( vSubfile.begin(), vSubfile.end() );
  vSubfile.clear();
  std::string subfileStartValue ( vSubStart.begin(), vSubStart.end() );
  vSubStart.clear();
  std::string subfileStopValue ( vSubStop.begin(), vSubStop.end() );
  vSubStop.clear();

  vSubfileNames.push_back( subFileValue );
  vSubfileStarts.push_back( stoi( subfileStartValue ) );
  vSubfileStops.push_back( stoi(subfileStopValue) );
  vSubfilePosizioniVector.push_back( vSubfilePosVector );
  }
}

void handle_error(const char* msg) {
    perror(msg);
    exit(255);
}

const char* map_file(const char* fname, size_t& length) {

    int fd = open(fname, O_RDONLY);

    if (fd == -1)
        handle_error("open");

    struct stat sb;

    if (fstat(fd, &sb) == -1)
        handle_error("fstat");

    length = sb.st_size;

    const char* addr = static_cast<const char*>(mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0u));
    if (addr == MAP_FAILED)
        handle_error("mmap");

    return addr;
}

void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}

根据建议,然后我尝试使用 CIN(v1),此处为代码:

#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include <vector>
#include <sstream>


void read_file_cin( std::string &nomeFile );
void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );
std::vector<int> sovrapposizioniVectorsInt( std::vector<int> &v0, std::vector<int> &v1 );

int main() {

  std::string pathGnomadChrIndex = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/chr_1.txt";
  char separator = ',';
  std::vector<std::string> vSubfileNames;

  read_gnomad_index_cin(pathGnomadChrIndex, separator, vSubfileNames);

  srand((unsigned)time(NULL)); //seeds the pseudo random number generator that rand() uses (http://www.cplusplus.com/forum/beginner/29699/)
  int size0 = 10;
  std::vector<int> v0;
  populateVector(v0, size0);

   std::vector<std::string> vSubfileNames2;
   for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

   std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/";
   for ( auto vi : vSubfileNames2 ) {
     std::cout << vi << '\n';
     std::stringstream streamNomeGnomadChrDir;
     streamNomeGnomadChrDir << gnomadSplitDir << vi;
     std::string pathSubfile = streamNomeGnomadChrDir.str();
     read_file_cin( pathSubfile );
   }
}

void first_column_cin( char &separator, std::vector<std::string> &vectorName) {
    std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) {
      std::string filename;
      std::istringstream iss(line); 
      getline(iss, filename, separator); 
      vectorName.push_back(filename);
      lineCount++;
    }
}

void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName ) {
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); 
    std::cin.rdbuf(in.rdbuf());

    first_column_cin( separator, vectorName );

    std::cin.rdbuf(cinbuf); 
}

void file_countline_cin() {
    std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) lineCount++;
    std::cout << "file has: " << lineCount << " rows." << '\n';
}
void read_file_cin( std::string &nomeFile ) {
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); //save old buf
    std::cin.rdbuf(in.rdbuf()); //redirect std::cin to in.txt!
    file_countline_cin(); //call function
    std::cin.rdbuf(cinbuf);   //reset to standard input again
}

void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}

然后我尝试通过以下方法加快CIN(https://www.geeksforgeeks.org/fast-io-for-competitive-programming/)的速度:

std::ios_base::sync_with_stdio(false);
std::cin.tie(NULL);

此处为 CIN.SPEED(v2)版本:

void read_file_cin( std::string &nomeFile );
void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName );


void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

std::vector<int> sovrapposizioniVectorsInt( std::vector<int> &v0, std::vector<int> &v1 );

int main() {

  std::string pathGnomadChrIndex = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/chr_1.txt";
  char separator = ',';
  std::vector<std::string> vSubfileNames;

  read_gnomad_index_cin(pathGnomadChrIndex, separator, vSubfileNames);

  srand((unsigned)time(NULL));
  int size0 = 10;
  std::vector<int> v0;
  populateVector(v0, size0);

   std::vector<std::string> vSubfileNames2;
   for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

   std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/";
   for ( auto vi : vSubfileNames2 ) {
     std::cout << vi << '\n';
     std::stringstream streamNomeGnomadChrDir;
     streamNomeGnomadChrDir << gnomadSplitDir << vi;
     std::string pathSubfile = streamNomeGnomadChrDir.str();
     read_file_cin( pathSubfile );
   }
}

void first_column_cin( char &separator, std::vector<std::string> &vectorName) {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
  std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) {
      std::string filename;
      std::istringstream iss(line); // string stream
      getline(iss, filename, separator); // read first part up to comma, ignore the comma
      vectorName.push_back(filename);
      lineCount++;
    }
}

void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName ) {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); //save old buf
    std::cin.rdbuf(in.rdbuf()); //redirect std::cin to in.txt!

    first_column_cin( separator, vectorName ); //call function

    std::cin.rdbuf(cinbuf);   //reset to standard input again
}

void file_countline_cin() {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
    std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) lineCount++;
    std::cout << "file has: " << lineCount << " rows." << '\n';
}


void read_file_cin( std::string &nomeFile ) {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); //save old buf
    std::cin.rdbuf(in.rdbuf()); //redirect std::cin to in.txt!

    file_countline_cin(); //call function
    std::cin.rdbuf(cinbuf);   //reset to standard input again
}

然后我尝试使用mmap的版本,将munmapped()和close()文件推送到向量中的所有文件内容,然后返回向量以分析文件(在此测试中仅对换行符进行计算)内容并关闭mmap和文件。

此处 MMAP.VECTOR(v3):

const char* map_file(const char* fname, size_t& length);

void mmap_file( std::string &filename, std::vector<char> &vFile);

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  );
void gnomadSubfileAnalysis( std::string &nomeFile );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

int main() {

      std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/";
      std::string nomeChr = "1";

      std::stringstream streamNomeGnomadChrDir;
      streamNomeGnomadChrDir << gnomadSplitDir << "ex_" << nomeChr << "_5c/" << "chr_" << nomeChr << "/";
      std::string pathGnomadChrDir = streamNomeGnomadChrDir.str();
      std::stringstream streamNomeGnomadChrIndex;
      streamNomeGnomadChrIndex << pathGnomadChrDir << "chr_" << nomeChr << ".txt";
      std::string pathGnomadChrIndex = streamNomeGnomadChrIndex.str();

      std::vector<std::string> vSubfileNames;
      std::vector<int> vSubfileStarts;
      std::vector<int> vSubfileStops;
      std::vector<std::vector<int>> vSubfilePosizioniVector;

      gnomadIndex(pathGnomadChrIndex,
                  vSubfileNames,
                  vSubfileStarts,
                  vSubfileStops,
                  vSubfilePosizioniVector
                );

      std::vector<std::string> vGnomadSubfilePaths;


     srand((unsigned)time(NULL));
      int size0 = 10;
      std::vector<int> v0;
      populateVector(v0, size0);

       std::vector<std::string> vSubfileNames2;
       for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

      for ( int subCount = 0; subCount < vSubfileNames2.size(); subCount++ ) {

        std::stringstream streamNomeSubfileGnomad;
        streamNomeSubfileGnomad << pathGnomadChrDir << vSubfileNames2[subCount];
        std::string pathGnomadSubfile = streamNomeSubfileGnomad.str();

        gnomadSubfileAnalysis(pathGnomadSubfile);
      }

}

void gnomadSubfileAnalysis( std::string &nomeFile ) {

  std::vector<char> f;
  mmap_file(nomeFile, f);

  std::vector<int> v0;

  for (int i=0; i<f.size(); i++) if (f[i] == '\n') v0.push_back(i);

  std::cout << "subfile: " << nomeFile << ", has: " << v0.size() << " rows in: " << f.size() << " bytes." << '\n';
}

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  ) {


  std::vector<char> f;
  mmap_file(nomeFileIndex, f);
  std::vector<int> v0;

  v0.push_back(0); //la prima riga mi serve qui
  for (int i=0; i<f.size(); i++) {
    if (f[i] == '\n') v0.push_back(i+1);
  }
  v0.pop_back();

  for (int nl = 0; nl < v0.size(); nl++) {

    int ncol = 0;

    std::vector<char> vSubfile;
    std::vector<char> vSubStart;
    std::vector<char> vSubStop;
    std::vector<char> vSubPos;
    std::vector<int> vSubfilePosVector;

    for( int start=v0[nl]; start<v0[nl+1]; start++  ) {

      if (f[start] == ',') ncol++;

      if (ncol == 0) {
        if ( f[start] != ',' ) {
          vSubfile.push_back(f[start]);
        }
      }

      if (ncol == 1) {
        if ( f[start] != ',' ) {
          vSubStart.push_back(f[start]);
        }
      }

      if (ncol == 2) {
        if ( f[start] != ',' ) {
          vSubStop.push_back(f[start]);
        }
      }

      if (ncol == 3) {
        if ( f[start] != ',' ) {
          if ( f[start] != ':' ) {
            vSubPos.push_back(f[start]);
          }
          if ( f[start] == ':' || f[start] == '\n' ) {

              std::string subfilePosValue ( vSubPos.begin(), vSubPos.end() );
              vSubPos.clear();

              vSubfilePosVector.push_back( stoi(subfilePosValue) );
              subfilePosValue.erase();
            }
          }
        }
      }

  std::string subFileValue ( vSubfile.begin(), vSubfile.end() );
  vSubfile.clear();
  std::string subfileStartValue ( vSubStart.begin(), vSubStart.end() );
  vSubStart.clear();
  std::string subfileStopValue ( vSubStop.begin(), vSubStop.end() );
  vSubStop.clear();

  vSubfileNames.push_back( subFileValue );
  vSubfileStarts.push_back( stoi( subfileStartValue ) );
  vSubfileStops.push_back( stoi(subfileStopValue) );
  vSubfilePosizioniVector.push_back( vSubfilePosVector );
  }
}


void handle_error(const char* msg) {
    perror(msg);
    exit(255);
}

const char* map_file(const char* fname, size_t& length) {

    int fd = open(fname, O_RDONLY);

    if (fd == -1)
        handle_error("open");

    struct stat sb;

    if (fstat(fd, &sb) == -1)
        handle_error("fstat");

    length = sb.st_size;

    const char* addr = static_cast<const char*>(mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0u));
    if (addr == MAP_FAILED)
        handle_error("mmap");

    return addr;
}



void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}


size_t getFilesize(const char* filename) {
    struct stat st;
    stat(filename, &st);
    return st.st_size;
}


void mmap_file( std::string &filename, std::vector<char> &vFile) {
    size_t filesize = getFilesize(filename.c_str());
    int fd = open(filename.c_str(), O_RDONLY, 0);
    assert(fd != -1);
    void* mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(mmappedData != MAP_FAILED);

    const char* f = static_cast<const char*>(mmappedData);

    for ( int i = 0; i < filesize; i++ ) vFile.push_back(f[i]);

    int rc = munmap(mmappedData, filesize);
    assert(rc == 0);
    close(fd);

}

但是我立即看到它变慢了,我认为这可能是由于进一步填充了向量,所以我尝试了一个在mmap打开时将换行符计数的版本,然后将其关闭。

MMAP.CLOSE(v4)

 const char* map_file(const char* fname, size_t& length);

void mmap_file( std::string &filename, std::vector<char> &vFile);
int mmap_file_nlines( std::string &filename );
void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  );
void gnomadSubfileAnalysis( std::string &nomeFile );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

int main() {

      std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c_text/";
      std::string nomeChr = "1";

      std::stringstream streamNomeGnomadChrDir;
      streamNomeGnomadChrDir << gnomadSplitDir << "chr_" << nomeChr << "/";
      std::string pathGnomadChrDir = streamNomeGnomadChrDir.str();
      std::stringstream streamNomeGnomadChrIndex;
      streamNomeGnomadChrIndex << pathGnomadChrDir << "chr_" << nomeChr << ".txt";
      std::string pathGnomadChrIndex = streamNomeGnomadChrIndex.str();

      std::vector<std::string> vSubfileNames;
      std::vector<int> vSubfileStarts;
      std::vector<int> vSubfileStops;
      std::vector<std::vector<int>> vSubfilePosizioniVector;

      gnomadIndex(pathGnomadChrIndex,
                  vSubfileNames,
                  vSubfileStarts,
                  vSubfileStops,
                  vSubfilePosizioniVector
                );

      std::vector<std::string> vGnomadSubfilePaths;


      srand((unsigned)time(NULL));
      int size0 = 10;
      std::vector<int> v0;
      populateVector(v0, size0);
       std::vector<std::string> vSubfileNames2;
       for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

      for ( int subCount = 0; subCount < vSubfileNames2.size(); subCount++ ) {
        std::stringstream streamNomeSubfileGnomad;
        streamNomeSubfileGnomad << pathGnomadChrDir << vSubfileNames2[subCount];
        std::string pathGnomadSubfile = streamNomeSubfileGnomad.str();
        gnomadSubfileAnalysis(pathGnomadSubfile);
      }

}

void gnomadSubfileAnalysis( std::string &nomeFile ) {
  int nLinee = mmap_file_nlines(nomeFile);

  std::cout << "subfile: " << nomeFile << ", has: " << nLinee << " rows" << '\n';
}

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  ) {

  size_t length;
  auto f = map_file(nomeFileIndex.c_str(), length);
  auto l = f + length;
  std::vector<int> v0;
  v0.push_back(0); //la prima riga mi serve qui
  for (int i=0; i<length; i++) {
    if (f[i] == '\n') v0.push_back(i+1);
  }
  v0.pop_back();

  for (int nl = 0; nl < v0.size(); nl++) {

    int ncol = 0;
    std::vector<char> vSubfile;
    std::vector<char> vSubStart;
    std::vector<char> vSubStop;
    std::vector<char> vSubPos;
    std::vector<int> vSubfilePosVector;

    for( int start=v0[nl]; start<v0[nl+1]; start++  ) {
      if (f[start] == ',') ncol++;
      if (ncol == 0) {
        if ( f[start] != ',' ) {
          vSubfile.push_back(f[start]);
        }
      }
      if (ncol == 1) {
        if ( f[start] != ',' ) {
          vSubStart.push_back(f[start]);
        }
      }
      if (ncol == 2) {
        if ( f[start] != ',' ) {
          vSubStop.push_back(f[start]);
        }
      }
      if (ncol == 3) {
        if ( f[start] != ',' ) {
          if ( f[start] != ':' ) {
            vSubPos.push_back(f[start]);
          }
          if ( f[start] == ':' || f[start] == '\n' ) {
              std::string subfilePosValue ( vSubPos.begin(), vSubPos.end() );
              vSubPos.clear();
              vSubfilePosVector.push_back( stoi(subfilePosValue) );
              subfilePosValue.erase();
            }
          }
        }
      }

  std::string subFileValue ( vSubfile.begin(), vSubfile.end() );
  vSubfile.clear();
  std::string subfileStartValue ( vSubStart.begin(), vSubStart.end() );
  vSubStart.clear();
  std::string subfileStopValue ( vSubStop.begin(), vSubStop.end() );
  vSubStop.clear();
  vSubfileNames.push_back( subFileValue );
  vSubfileStarts.push_back( stoi( subfileStartValue ) );
  vSubfileStops.push_back( stoi(subfileStopValue) );
  vSubfilePosizioniVector.push_back( vSubfilePosVector );
  }
}

void handle_error(const char* msg) {
    perror(msg);
    exit(255);
}
const char* map_file(const char* fname, size_t& length) {

    int fd = open(fname, O_RDONLY);
    if (fd == -1)
        handle_error("open");
    struct stat sb;
    if (fstat(fd, &sb) == -1)
        handle_error("fstat");
    length = sb.st_size;

    const char* addr = static_cast<const char*>(mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0u));
    if (addr == MAP_FAILED)
        handle_error("mmap");
    return addr;
}

void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}

size_t getFilesize(const char* filename) {
    struct stat st;
    stat(filename, &st);
    return st.st_size;
}

void mmap_file( std::string &filename, std::vector<char> &vFile) {
    size_t filesize = getFilesize(filename.c_str());
    int fd = open(filename.c_str(), O_RDONLY, 0);
    assert(fd != -1);
    void* mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(mmappedData != MAP_FAILED);

    const char* f = static_cast<const char*>(mmappedData);

    for ( int i = 0; i < filesize; i++ ) vFile.push_back(f[i]);
    int rc = munmap(mmappedData, filesize);
    assert(rc == 0);
    close(fd);
}

int mmap_file_nlines( std::string &filename ) {
    size_t filesize = getFilesize(filename.c_str());
    int fd = open(filename.c_str(), O_RDONLY, 0);
    assert(fd != -1);
    void* mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(mmappedData != MAP_FAILED);

    const char* f = static_cast<const char*>(mmappedData);
    int lineCount =0 ;
    for ( int i = 0; i < filesize; i++ ) if( f[i] == '\n' ) lineCount++;
    int rc = munmap(mmappedData, filesize);
    assert(rc == 0);
    close(fd);

    return lineCount;

}

以下是每种方法进行1000次尝试的秒数:

          cin 0.983380
    cin.speed 0.989011
   mmap.close 2.863860
    mmap.open 0.915395
  mmap.vector 4.683976

某种程度上,我感到惊讶的是CIN的“加速”没有改变,(也许我出了什么问题?)并且关闭文件对于mmap是如此耗时!

这里是结果图。

enter image description here

真的欢迎更多的专家发表评论!

PS:我的机器是 iMac(Retina 5K,27英寸,2014年末),4 GHz Intel Core i7、16 GB 1600 MHz DDR3

0 个答案:

没有答案