I have a big file with more than 89 million lines. I want to read the file, load it into a hash table and then do some calculations.
The problem is that reading the file with istream and inserting it into the hash table is too slow.
Is there any possibility of reading the file with more threads? Is there a thread library for that?
Or should I cut the file into pieces and use one thread per piece?
The hash function does not take long to compute.
For collisions I use lists. The size of the table is 1,000,000.
// Adding_date_too_file.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include "pch.h"
#include <iostream>
#include <string>
#include <fstream>
#include "hash.h"
using namespace std;

int main()
{
    hasho Hashy;
    string f1, f2, f3, f4, f5, f6, f7;
    bool is_first_line = true;
    fstream file_input;
    fstream file_2_Collums;

    cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only two columns\n which is going to be used for searching based on that file)" << flush;
    while (true)
    {
        string infilename;
        getline(cin, infilename);
        file_input.open(infilename.c_str());
        if (file_input) break;
        cout << "Invalid file. Please enter a valid input file name> " << flush;
    }

    cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only one column)" << flush;
    while (true)
    {
        string infilename;
        getline(cin, infilename);
        file_2_Collums.open(infilename.c_str());
        if (file_2_Collums) break;
        cout << "Invalid file. Please enter a valid input file name> " << flush;
    }

    // creating output file
    int* table;
    table = new int[2];
    int count_file_lines = 0;
    int line_counter_inventors = 0;

    if (file_input.is_open())
    {
        while (!file_input.eof())
        {
            if (is_first_line == true) {
                getline(file_input, f1, '\n'); // skip the header line
                is_first_line = false;
            }
            getline(file_input, f1, '\t'); // column 1
            getline(file_input, f2, '\t'); // column 2 (patent id, stored in the hash table)
            getline(file_input, f3, '\t'); // column 3 (discarded)
            getline(file_input, f3, '\t'); // column 4 (overwrites f3, discarded)
            getline(file_input, f6, '\t'); // column 5 (passed to AddItem as the date)
            getline(file_input, f3, '\n'); // column 6 (rest of the line, discarded)
            //cout << "adding these items " << f1 << '\t' << f6 << endl;
            Hashy.AddItem(f2, f6);
            cout << count_file_lines << endl; // progress printout, one line per record
            count_file_lines++;
            // cout << f2 << '\t' << f6 << endl;
        }
    }

    int lines_2 = 0;
    if (file_2_Collums.is_open())
    {
        Hashy.openOutputFile();
        while (!file_2_Collums.eof())
        {
            getline(file_2_Collums, f1, '\t'); // patent_id
            getline(file_2_Collums, f4, '\n'); // assignee_id
            //cout << f1 << endl;
            Hashy.FindDateId(f1, f4);
            lines_2++;
        }
    }

    system("pause");
    return 0;
}
Hash.cpp
#include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"
#include "hash.h"
#include <fstream>
using namespace std;
static ofstream output_file;
hasho::hasho()
{
for (int i = 0; i < tableSize; i++) {
//cout << i << endl;
HashTable[i] = new item;
HashTable[i]->pattent_id = "empty";
HashTable[i]->date = "empty";
HashTable[i]->next = NULL;
}
}
void hasho::openOutputFile() {
cout << "Please give the name of the output file: \n(The file should end with the format type (txt,csv etc.)) " << flush;
while (true)
{
string infilename;
getline(cin, infilename);
output_file.open(infilename.c_str(), fstream::out);
break;
}
}
int hasho::NumberOfItemsInIndex(int index) {
int count = 0;
if (HashTable[index]->date == "empty") {
return count;
}
else {
count++;
item* ptr = HashTable[index];
while (ptr->next != NULL) {
count++;
ptr = ptr->next;
}
}
return count;
}
void hasho::PrintTable() {
int number;
for (int i = 0; i < tableSize; i++) {
number = NumberOfItemsInIndex(i);
cout << "---------------------------------------\n";
cout << "index= " << i << endl;
cout << HashTable[i]->pattent_id << endl;
cout << HashTable[i]->date << endl;
cout << "# of items= " << number << endl;
cout << "---------------------------------------\n";
}
}
void hasho::PrintItemsInIndex(int index) {
item* ptr = HashTable[index];
if (ptr->date == "empty") {
cout << "index = " << index << " is empty." << endl;
}
else {
cout << "index = " << index << " contains the following items\n";
while (ptr != NULL) {
cout << "-----------" << endl;
cout << ptr->date << endl;
cout << ptr->pattent_id << endl;
cout << "-----------" << endl;
ptr = ptr->next;
}
}
}
void hasho::AddItem(string pattend_id, string date)
{
int index = Hash(pattend_id);
if (HashTable[index]->pattent_id == "empty")
{
HashTable[index]->pattent_id = pattend_id;
HashTable[index]->date = date;
}
else {
item* ptr = HashTable[index];
item* n = new item;
n->date = date ;
n->pattent_id = pattend_id;
n->next = NULL;
while (ptr->next != NULL) {
ptr = ptr->next;
}
ptr->next = n;
}
}
void hasho::FindDateId(string pattend_id, string assignee_id1) {
int found = 0;
int nfound = 0;
int index = Hash(pattend_id);
bool foundDateId = false;
string date;
item* ptr = HashTable[index];
int count = 1;
//write to file
//cout << "WE are looking for the date of " <<pattend_id << " in Index: " << index <<endl;
while (ptr != NULL) {
//cout << "NOw we are looking with : " << pattend_id << endl;
if (ptr->pattent_id == pattend_id) {
//cout << "NOw we are looking with : " << pattend_id <<endl;
foundDateId = true;
date = ptr->date;
//write to file
output_file << pattend_id << "\t";
output_file << assignee_id1 << endl;
output_file << date << "\t";
//cout << " " << date << endl;
found = 1;
count++;
}
ptr = ptr->next;
}
if (foundDateId == false) {
nfound++;
}
cout << "found " << found << endl;
cout << "not found " << nfound << endl;
cout << endl;
}
int hasho::Hash(string key)
{
int unsigned hash = 0;
int unsigned index;
//cout << key << endl;
for (int unsigned i = 0; i < key.length(); i++) {
hash = hash + (int)key[i] *(i+1);
}
index =hash % tableSize;
//cout << index << endl;
return index;
}
Hash.h
#pragma once
#include "pch.h"
#include <iostream>
#include <string>
//#include "hash.cpp"
using namespace std;

#pragma comment(linker, "/STACK:7000000")
#pragma comment(linker, "/HEAP:7000000")

#ifndef HASH_H
#define HASH_H

class hasho {
private:
    static const int tableSize = 300003;
    struct item {
        string pattent_id;
        string date;
        item* next;
    };
    item* HashTable[tableSize];
public:
    hasho();
    int Hash(string key);
    void AddItem(string pattend_id, string date);
    int NumberOfItemsInIndex(int index);
    void PrintTable();
    void PrintItemsInIndex(int index);
    void FindDateId(string pattent_id, string assgnee_id);
    void openOutputFile();
};

#endif // !HASH_H
Answer 0 (score 0)
"My big file has more than 89 million lines"
If you are considering using several threads to process it, you probably should not. You should explain what that big file contains (what kind of data: genomics, time series, etc.) and how large it is in gigabytes. Do you process the same file once or several times? How much time (measured with time(1)) does your processing take? How much time does wc(1) need just to count the lines?
One possibility is to split the file into several smaller files (for example with split(1)), each made of entire lines, and feed your program with those smaller files. I don't know whether that would help you (probably not, unless you do several runs of the program that reads those files).
Another possibility is to make two passes over the file. The first pass would count the lines and perhaps remember the start offset of some of them (for example, of every 1024th line). Then you could process the file in parallel in the second pass (by reusing the remembered offsets).
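For illustration only, a minimal sketch of that two-pass idea (not part of the original answer): the file name, the sampling interval, and the per-line work are placeholder assumptions, and it presumes each chunk can be processed independently and the per-thread results merged after joining.

// Pass 1 records the byte offset of every Nth line; pass 2 processes the
// chunks between those offsets in parallel, one ifstream per thread.
#include <fstream>
#include <string>
#include <thread>
#include <vector>

int main()
{
    const std::string path = "big_file.tsv";   // placeholder name
    const std::size_t every = 1024 * 1024;     // remember one offset per ~1M lines

    // Pass 1: remember where some lines start.
    std::vector<std::streamoff> offsets{0};
    {
        std::ifstream in(path);
        std::string line;
        std::size_t n = 0;
        while (std::getline(in, line))
            if (++n % every == 0)
                offsets.push_back(in.tellg());
    }

    // Pass 2: one thread per chunk; each thread seeks to its chunk start.
    std::vector<std::thread> workers;
    for (std::size_t i = 0; i < offsets.size(); ++i) {
        std::streamoff begin = offsets[i];
        std::streamoff end = (i + 1 < offsets.size()) ? offsets[i + 1] : -1;
        workers.emplace_back([=] {
            std::ifstream in(path);
            in.seekg(begin);
            std::string line;
            while (std::getline(in, line) && (end < 0 || in.tellg() <= end)) {
                // parse the line and fill a thread-local container here,
                // then merge the containers after join()
            }
        });
    }
    for (auto& t : workers) t.join();
}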
By the way, if your big file is so large that it cannot stay in the page cache, your problem is IO-bound (the bottleneck is the physical disk hardware), and you will not gain any speed by trying to parallelize it (or even by splitting it into smaller files).
Another possibility is to read (and slowly parse) your big file once and fill some database (perhaps sqlite) with its data. Then, if you process that data several times, you could take advantage of querying that database instead of re-reading the file.
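A minimal sketch of that approach, assuming the sqlite3 C library and the six-column tab-separated layout used by the question's code (second column as patent id, fifth as date); the database name, file name, and table schema are placeholders, and all error checking is omitted.

// Load (patent_id, date) pairs into an SQLite table once; later runs can
// query the database instead of re-parsing the 89-million-line file.
#include <sqlite3.h>
#include <fstream>
#include <sstream>
#include <string>

int main()
{
    sqlite3* db = nullptr;
    sqlite3_open("patents.db", &db);                        // placeholder database name
    sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS patents("
                     "patent_id TEXT PRIMARY KEY, date TEXT);",
                 nullptr, nullptr, nullptr);
    sqlite3_exec(db, "BEGIN;", nullptr, nullptr, nullptr);  // one big transaction is much faster

    sqlite3_stmt* ins = nullptr;
    sqlite3_prepare_v2(db, "INSERT OR REPLACE INTO patents VALUES(?,?);", -1, &ins, nullptr);

    std::ifstream in("big_file.tsv");                       // placeholder file name
    std::string line;
    std::getline(in, line);                                 // skip the header line
    while (std::getline(in, line)) {
        std::istringstream fields(line);
        std::string c1, patent_id, c3, c4, date;
        std::getline(fields, c1, '\t');
        std::getline(fields, patent_id, '\t');
        std::getline(fields, c3, '\t');
        std::getline(fields, c4, '\t');
        std::getline(fields, date, '\t');
        sqlite3_bind_text(ins, 1, patent_id.c_str(), -1, SQLITE_TRANSIENT);
        sqlite3_bind_text(ins, 2, date.c_str(), -1, SQLITE_TRANSIENT);
        sqlite3_step(ins);
        sqlite3_reset(ins);
    }

    sqlite3_finalize(ins);
    sqlite3_exec(db, "COMMIT;", nullptr, nullptr, nullptr);
    sqlite3_close(db);
}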
Regarding your hash table, consider using standard C++ containers instead (for example std::unordered_map<std::string, std::string>).
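For instance, here is a minimal sketch of the first reading pass rewritten around std::unordered_map, assuming the same tab-separated layout as the question's code; the file name and the lookup key are placeholders.

// The hand-written chained hash table replaced by std::unordered_map.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

int main()
{
    std::unordered_map<std::string, std::string> date_by_patent;
    date_by_patent.reserve(90'000'000);          // avoid rehashing for ~89M entries

    std::ifstream in("big_file.tsv");            // placeholder file name
    std::string line;
    std::getline(in, line);                      // skip the header line
    while (std::getline(in, line)) {             // also avoids the while(!eof()) pitfall
        std::istringstream fields(line);
        std::string c1, patent_id, c3, c4, date;
        std::getline(fields, c1, '\t');
        std::getline(fields, patent_id, '\t');
        std::getline(fields, c3, '\t');
        std::getline(fields, c4, '\t');
        std::getline(fields, date, '\t');
        date_by_patent.emplace(patent_id, date);
    }

    // Lookup, analogous to FindDateId:
    auto it = date_by_patent.find("some_patent_id");
    if (it != date_by_patent.end())
        std::cout << it->first << '\t' << it->second << '\n';
}

Keep in mind that roughly 89 million string pairs need several gigabytes of RAM, so this only helps if the data actually fits in memory.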
PS. We don't know what that big file is, what it contains, or how you process it.