数组递增元素与无序映射的效率

时间:2019-07-12 14:10:14

标签: c++ hashtable

我正在为c ++中的哈希表创建一个类,以通过基于字符串中字符的ASCII码进行哈希处理来满足DNA序列。在测试期间,我发现使用类可以比使用标准unordered_map更快地访问元素,但是当尝试增加值(用于计数)时,无序映射的速度大约快4倍。怎么可能呢?

我已经测试了主要速度变化在哪里,并且似乎正在重新分配。当简单地在当前值上加1而不更改值时,它的运行速度比unordered_map要快,但是使用++(尝试递增和递减)时,它要比无序映射花费的时间更长。仅在使用g ++ -O3进行编译时才适用,这对于该项目是必需的。

// K_Mer_Table

#include <vector>
#include <string>
#include <cmath>
#include <iostream>
#include <unordered_map>
#include <chrono>
#include <cstdlib>
#include <fstream>

using namespace std;

void getAllKLength(vector<string> set, vector<vector<string>> &vect, int k,int n);
void getAllKLengthRec(vector<string> set, vector<vector<string>> &vect, vector<string> prefix, int n, int k);
vector<string> collapse(vector<vector<string>> &vect);


int Khash(string seq, int order, int minSize){
    unsigned int index = 0;
    int ls = order+1;
    for (int k = 0; k < ls; k++)
        index += seq[k]*pow(4,k);

    return index - minSize;
}

class K_Table{

    public:

        // will have to make this vector<double> for probs (if its even necessary)
        //vector<int> table;
        int* table;
        unordered_map<char,int> bases;
        int order;
        vector<int> indecies;
        int size; // number of actual datapoints
        unsigned int minSize;

        K_Table(int order,vector<string> kmers){


            bases['A'] = 0;
            bases['T'] = 1;
            bases['C'] = 2;
            bases['G'] = 3;

            this->order = order;
            minSize = Khash("AAAAAA", 5, 0); // Hardcoded 5th order
            size = (((pow(4, order + 2) - 1)/3) - 1);
            table = (int*)malloc((size*(84-65)/3)*sizeof(int));
            indecies.resize(size);

            int index;
            int ls = kmers.size();
            for (int k = 0; k < ls; k++){
                index = Khash(kmers[k], order, minSize);
                table[index] = 0;
                indecies[k] = index;
            }

        }

        int Seq_Count(string seq){
            return table[Khash(seq, this->order, this->minSize)];
        }

        void Increment(string seq){
            table[Khash(seq, this->order, this->minSize)]++;
        }

        void Reset_Table(){
            int ls = indecies.size();
            for (int k = 0; k < ls; k++){
                table[indecies[k]] = 0;
            }
        }


};

//==========================================================================================================================

int main(){


    // give a file containing random combinations of A,T,C, and G
    // each EXACTLY 6 characters    
    // several hundred thousand if possible.

    string randseqfile = "filename.txt";
    ifstream in;
    in.open(randseqfile);

    vector<K_Table> tables;
    vector<string> bases;
    bases.push_back("A");
    bases.push_back("T");
    bases.push_back("C");
    bases.push_back("G");

    int order = 5;

    vector<vector<string>> kmersv;
    getAllKLength(bases, kmersv, order+1, 4);
    vector<string> kmers = collapse(kmersv);
    K_Table ktable(order,kmers);

    // Build Hash Table
    unordered_map<string,int> counts_h;
    for (int k = 0; k < kmers.size(); k++)
        counts_h[kmers[k]] = 0;

    string seq;
    int a;

    chrono::system_clock::time_point start;
    chrono::system_clock::time_point end;
    chrono::duration<double> elapsed_seconds;

    start = chrono::system_clock::now();
    while (getline(in,seq)){
        //a = counts_h[seq];
        counts_h[seq]++;
    }

    end = chrono::system_clock::now();
    elapsed_seconds = end - start;

    cout << "Hash Table took " << elapsed_seconds.count() << "seconds" << endl;

    in.clear();
    in.seekg(0, ios::beg);

    unsigned int minSize = Khash("AAAAAA", 5, 0);
    int s = seq.size();
    start = chrono::system_clock::now();
    while (getline(in,seq)){
        //a = ktable.Seq_Count(seq);
        ktable.Increment(seq);
    }
    end = chrono::system_clock::now();
    elapsed_seconds = end - start;

    cout << "K_Table took " << elapsed_seconds.count() << "seconds" << endl;

    in.close();

    ktable.Reset_Table();

    return 0;
}

//==========================================================================================================================

vector<string> collapse(vector<vector<string>> &vect){
    vector<string> ret;
    vector<string> inner;
    string temp;
    int ls = vect.size();
    int ls2;
    for (unsigned int i=0; i < ls; i++){
        temp = "";
        inner = vect[i];
        ls2 = inner.size();
        for (unsigned int k=0; k < ls2; k++){
            temp += inner[k];
        }
        ret.push_back(temp);
    }
    return ret;
}

void getAllKLengthRec(vector<string> set, vector<vector<string>> &vect, vector<string> prefix, int n, int k){

    //Base case: k is 0,
    //print prefix
    if (k == 0) {
    vect.push_back(prefix);
        return;
    }

    //One by one add all characters
    //from set and recursively
    //call for k equals to k-1
    for (int i = 0; i < n; i++) {
        vector<string> newPrefix;
        //Next character of input added
        newPrefix = prefix;
    newPrefix.push_back(set[i]);
    //prefix.push_back(set[i]);

        //k is decreased, because
        //we have added a new character
        getAllKLengthRec(set, vect, newPrefix, n, k - 1);
    }

}

//Method to generate permutations, driver of getAllKLengthRec
void getAllKLength(vector<string> set, vector<vector<string>> &vect, int k,int n){
    vector<string> temp;
    getAllKLengthRec(set, vect, temp, n, k);
}

我希望类K_Table的增长比unordered_map快,因为访问更快,unordered_map的增长快得多。我无法使用简单数组重现此问题而不将它们包装在一个类中,但是我找不到该类如何减慢速度。

编辑:我想避免使用需要C ++ 11的选项,因为这将是一个可以与任何版本一起使用的工具。 链接到数据文件:https://www.dropbox.com/s/p6x68w9k56w5qkp/seqran.txt?dl=0

0 个答案:

没有答案