Question

我正在尝试读取大型数据集，按照我需要的方式对其进行格式化，然后将其写入另一个文件。我使用 C++ 而不是 SAS 或 STATA，是为了获得速度优势。数据文件通常约为 10 GB。我当前的代码运行了一个多小时仍未结束（于是我终止了它，因为我确信我的代码效率非常低）。

有更有效的方法吗？也许可以先将文件读入内存，然后再使用 switch 语句进行解析？（我的机器有 32 GB 内存，运行 64 位 Linux。）在循环内交替进行读取和写入，是否会因为不断在读与写之间切换而拖慢速度？为了加快速度，我已经尝试从一个驱动器读取、再写入另一个驱动器。

switch-case 语句会拖慢速度吗？

我现在使用 getline 读取数据，用 switch 语句对其进行解析，然后将其写入输出文件，如此重复 3 亿行。switch 语句中还有大约 10 个 case 分支，为简洁起见我没有贴出来。

主函数里的代码可能非常难看，但我想先让它能正常工作，然后再考虑把它整理得美观一些。

我尝试过使用read（）但没有成功。如果我需要澄清任何事情，请告诉我。

感谢您的帮助！

 #include <iostream>
 #include <fstream>
 #include <string>
 #include <sstream>
 #include <stdio.h>
 //#include <cstring>
 //#include <boost/algorithm/string.hpp>

 #include <vector>

  using namespace std;
 //using namespace boost;


 struct dataline
{
    // One parsed input record. Every text field is stored as a C string:
    // each array is one byte wider than the field it holds so that the NUL
    // terminator appended by `stream >> field` (or written by hand) fits.
    // The previous zero-length and exact-width arrays overflowed on every
    // extraction.
    char type[2];                    // 1-character record type (column 0 of the line)
    double second;                   // value of the most recent 'T' record
    short mill;                      // value of the most recent 'M' record
    char event[2];
    char ticker[7];                  // 6-character field at columns 1-6 of an 'R' line
    char marketCategory[2];          // 1-character field at column 7 of an 'R' line
    char financialStatus[2];         // 1-character field at column 8 of an 'R' line
    int roundLotSize;                // 6-digit field at columns 9-14 of an 'R' line
    short roundLotOnly;              // 1-character field at column 15 of an 'R' line
    char tradingState[2];
    char reserved[2];
    char reason[5];
    char mpid[5];
    char primaryMarketMaker[2];
    char primaryMarketMode[2];
    char marketParticipantState[2];
    unsigned long orderNumber;
    char buySell[2];                 // presumably a 1-character code -- confirm against the feed spec
    double shares;
    float price;
    int executedShares;
    double matchNumber;
    char printable[2];
    double executionPrice;
    int canceledShares;
    double sharesBig;
    double crossPrice;
    char crossType[2];               // presumably a 1-character code -- confirm against the feed spec
    double pairedShares;
    double imbalanceShares;
    char imbalanceDirection[2];
    double fairPrice;
    double nearPrice;
    double currentReferencePrice;
    char priceVariationIndicator[2];
};

  int main () 
{
string a; 
string b;
string c;
string d;
string e;
string f;
string g;
string h;
string k;
string l;
string times;
string smalltimes;

short time;     //counter to keep second filled
short smalltime;    //counter to keep millisecond filled
double N;
double NN;
double NNN;
int length;
char M; 
//vector<> fout;
string line;

ofstream fout ("/media/3tb/test.txt");
ifstream myfile;
myfile.open("S050508-v3.txt");

dataline oneline;

if (myfile.is_open())
    {
    while ( myfile.good() )
        {
        getline (myfile,line);
//      cout << line<<endl;;

        a=line.substr(0,1);
        stringstream ss(a);
        char type;
        ss>>type;


        switch (type)
            { 
            case 'T':
                {
                if (type == 'T')
                    {
                    times=line.substr(1,5);
                    stringstream s(times);
                    s>>time;
                    //oneline.second=time;
                    //oneline.second;
                    //cout<<time<<endl;
                    }
                else
                    {
                    time=time;
                    }
                break;
                }
            case 'M':
                {
                if (type == 'M')
                    {
                    smalltimes=line.substr(1,3);
                    stringstream ss(smalltimes);
                    ss>>smalltime;      //oneline.mill;
                //  cout<<smalltime<<endl;                            //smalltime=oneline.mill;
                    }
                else
                    {
                    smalltime=smalltime;
                    }
                break;
                }


            case 'R':
                {
                oneline.second=time;
                oneline.mill=smalltime;

                a=line.substr(0,1);
                stringstream ss(a);
                ss>>oneline.type;

                b=line.substr(1,6);
                stringstream sss(b);
                sss>>oneline.ticker;

                c=line.substr(7,1);
                stringstream ssss(c);
                ssss>>oneline.marketCategory;

                d=line.substr(8,1);
                stringstream sssss(d);
                sssss>>oneline.financialStatus;

                e=line.substr(9,6);
                stringstream ssssss(e);
                ssssss>>oneline.roundLotSize;

                f=line.substr(15,1);
                stringstream sssssss(f);
                sssssss>>oneline.roundLotOnly;

                *oneline.tradingState=0;
                *oneline.reserved=0;
                *oneline.reason=0;
                *oneline.mpid=0;
                *oneline.primaryMarketMaker=0;
                *oneline.primaryMarketMode=0;
                *oneline.marketParticipantState=0;
                oneline.orderNumber=0;
                *oneline.buySell=0;
                oneline.shares=0;
                oneline.price=0;
                oneline.executedShares=0;
                oneline.matchNumber=0;
                *oneline.printable=0;
                oneline.executionPrice=0;
                oneline.canceledShares=0;
                oneline.sharesBig=0;
                oneline.crossPrice=0;
                *oneline.crossType=0;
                oneline.pairedShares=0;
                oneline.imbalanceShares=0;
                *oneline.imbalanceDirection=0;
                oneline.fairPrice=0;
                oneline.nearPrice=0;
                oneline.currentReferencePrice=0;
                *oneline.priceVariationIndicator=0;

                break;
                }//End Case 
            }//End Switch
            }//end While
    myfile.close();

     }//End If
else cout << "Unable to open file"; 
cout<<"Junk"<<endl;

return 0;
}

UPDATE：我一直在尝试使用内存映射，但现在遇到了段错误（segmentation fault）。我参考了几个不同的示例，试图拼凑出适合我的代码。为什么会出现段错误？我截取了代码的第一部分，如下所示：

int main (int argc, char** path) 
 {
 // Maps the whole input file read-only and walks it line by line.
 // The file to read is the first command-line argument when one is given,
 // otherwise the default data file. Returns 0 on success and 1 when the file
 // cannot be opened, measured, or mapped.
 const char *FILEPATH = (argc > 1) ? path[1]
                                   : "/home/brian/Desktop/S050508-v3.txt";

 const int fd = open(FILEPATH, O_RDONLY);
 if (fd == -1)
    {
    perror("Error opening file for reading");
    return 1;
    }

 // Seeking to the end of the descriptor yields the file size without
 // opening the file a second time.
 const off_t fileEnd = lseek(fd, 0, SEEK_END);
 if (fileEnd == -1)
    {
    perror("Error determining file size");
    close(fd);
    return 1;
    }
 if (fileEnd == 0)
    {
    // mmap() rejects a zero-length mapping; an empty file has no lines.
    close(fd);
    return 0;
    }
 const unsigned long FILESIZE = static_cast<unsigned long>(fileEnd);

 void *mapping = mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
 if (mapping == MAP_FAILED)
    {
    perror("Error mmapping the file");
    close(fd);
    return 1;
    }
 const char *map = static_cast<const char *>(mapping);

 // Valid bytes are map[0] .. map[FILESIZE - 1]; reading map[FILESIZE] can
 // touch an unmapped page and crash.
 const char *const mapEnd = map + FILESIZE;
 const char *lineStart = map;
 std::string line;              // reused for every line to avoid reallocating

 for (const char *p = map; p != mapEnd; ++p)
    {
    if (*p == '\n')
        {
        line.assign(lineStart, p);
        // c style tokenizing
        lineStart = p + 1;
        }
    }
 if (lineStart != mapEnd)
    {
    // Last line of a file that does not end with a newline.
    line.assign(lineStart, mapEnd);
    // c style tokenizing
    }

 if (munmap(mapping, FILESIZE) == -1) perror("Error un-mmapping the file");
 close(fd);
 return 0;
 }

Answer 1

数据文件通常大约为10千兆字节。 ... switch-case 会拖慢速度吗？

几乎肯定不会，这看起来更像是受 I/O 限制（I/O bound）。但你应该实际测量一下。现代 CPU 具有性能计数器，借助合适的工具很容易利用它们。不过我们先把问题划分为几个主要方面：设备 I/O、内存的加载/存储、CPU。您可以在代码中放置一些标记来读取时钟，以便了解每个操作所花费的时间。在 Linux 上，您可以使用 clock_gettime() 或 rdtsc 指令来访问比操作系统时钟精度更高的时钟。

可以考虑 mmap / CreateFileMapping，两者都可能让您在访问这些内存页时获得更高的效率/吞吐量。

如果要流式处理大量已经调入内存的数据，请考虑使用大页（huge/large pages）。

摘自 mmap() 的手册页：

描述

mmap() 在调用进程的虚拟地址空间中创建一个新映射。新映射的起始地址由 addr 指定。length 参数指定映射的长度。

下面是一个 mmap() 示例：

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

#define FILEPATH "/tmp/mmapped.bin"
#define NUMINTS  (1000)
#define FILESIZE (NUMINTS * sizeof(int))

/* Maps the first FILESIZE bytes of FILEPATH read-only and prints the
 * NUMINTS ints stored there, one per line, as "index: value".
 * Exits with EXIT_FAILURE when the file cannot be opened, is shorter than
 * FILESIZE, or cannot be mapped; returns 0 otherwise.
 */
int main(int argc, char *argv[])
{
    int i;
    int fd;
    int *map;  /* mmapped array of int's */
    struct stat st;

    fd = open(FILEPATH, O_RDONLY);
    if (fd == -1) {
        perror("Error opening file for reading");
        exit(EXIT_FAILURE);
    }

    /* Touching mapped pages that lie beyond the end of the file raises
     * SIGBUS, so make sure the file really holds FILESIZE bytes first.
     */
    if (fstat(fd, &st) == -1) {
        perror("Error getting the file size");
        close(fd);
        exit(EXIT_FAILURE);
    }
    if (st.st_size < 0 || (size_t)st.st_size < FILESIZE) {
        fprintf(stderr, "Error: file is smaller than the expected size\n");
        close(fd);
        exit(EXIT_FAILURE);
    }

    /* The explicit cast keeps the example valid as both C and C++. */
    map = (int *)mmap(0, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
        close(fd);
        perror("Error mmapping the file");
        exit(EXIT_FAILURE);
    }

    /* Read the file int-by-int from the mmap.
     * Valid indices are 0 .. NUMINTS-1; map[NUMINTS] is already one
     * element past the end of the mapping.
     */
    for (i = 0; i < NUMINTS; ++i) {
        printf("%d: %d\n", i, map[i]);
    }

    if (munmap(map, FILESIZE) == -1) {
        perror("Error un-mmapping the file");
    }
    close(fd);
    return 0;
}

C ++读取大数据，解析，然后写入数据

1 个答案: