我正在为c ++中的哈希表创建一个类,以通过基于字符串中字符的ASCII码进行哈希处理来满足DNA序列。在测试期间,我发现使用类可以比使用标准unordered_map更快地访问元素,但是当尝试增加值(用于计数)时,无序映射的速度大约快4倍。怎么可能呢?
我已经测试了主要速度变化在哪里,并且似乎正在重新分配。当简单地在当前值上加1而不更改值时,它的运行速度比unordered_map要快,但是使用++(尝试递增和递减)时,它要比无序映射花费的时间更长。仅在使用g ++ -O3进行编译时才适用,这对于该项目是必需的。
// K_Mer_Table
#include <vector>
#include <string>
#include <cmath>
#include <iostream>
#include <unordered_map>
#include <chrono>
#include <cstdlib>
#include <fstream>
using namespace std;
void getAllKLength(vector<string> set, vector<vector<string>> &vect, int k,int n);
void getAllKLengthRec(vector<string> set, vector<vector<string>> &vect, vector<string> prefix, int n, int k);
vector<string> collapse(vector<vector<string>> &vect);
int Khash(string seq, int order, int minSize){
unsigned int index = 0;
int ls = order+1;
for (int k = 0; k < ls; k++)
index += seq[k]*pow(4,k);
return index - minSize;
}
class K_Table{
public:
// will have to make this vector<double> for probs (if its even necessary)
//vector<int> table;
int* table;
unordered_map<char,int> bases;
int order;
vector<int> indecies;
int size; // number of actual datapoints
unsigned int minSize;
K_Table(int order,vector<string> kmers){
bases['A'] = 0;
bases['T'] = 1;
bases['C'] = 2;
bases['G'] = 3;
this->order = order;
minSize = Khash("AAAAAA", 5, 0); // Hardcoded 5th order
size = (((pow(4, order + 2) - 1)/3) - 1);
table = (int*)malloc((size*(84-65)/3)*sizeof(int));
indecies.resize(size);
int index;
int ls = kmers.size();
for (int k = 0; k < ls; k++){
index = Khash(kmers[k], order, minSize);
table[index] = 0;
indecies[k] = index;
}
}
int Seq_Count(string seq){
return table[Khash(seq, this->order, this->minSize)];
}
void Increment(string seq){
table[Khash(seq, this->order, this->minSize)]++;
}
void Reset_Table(){
int ls = indecies.size();
for (int k = 0; k < ls; k++){
table[indecies[k]] = 0;
}
}
};
//==========================================================================================================================
int main(){
// give a file containing random combinations of A,T,C, and G
// each EXACTLY 6 characters
// several hundred thousand if possible.
string randseqfile = "filename.txt";
ifstream in;
in.open(randseqfile);
vector<K_Table> tables;
vector<string> bases;
bases.push_back("A");
bases.push_back("T");
bases.push_back("C");
bases.push_back("G");
int order = 5;
vector<vector<string>> kmersv;
getAllKLength(bases, kmersv, order+1, 4);
vector<string> kmers = collapse(kmersv);
K_Table ktable(order,kmers);
// Build Hash Table
unordered_map<string,int> counts_h;
for (int k = 0; k < kmers.size(); k++)
counts_h[kmers[k]] = 0;
string seq;
int a;
chrono::system_clock::time_point start;
chrono::system_clock::time_point end;
chrono::duration<double> elapsed_seconds;
start = chrono::system_clock::now();
while (getline(in,seq)){
//a = counts_h[seq];
counts_h[seq]++;
}
end = chrono::system_clock::now();
elapsed_seconds = end - start;
cout << "Hash Table took " << elapsed_seconds.count() << "seconds" << endl;
in.clear();
in.seekg(0, ios::beg);
unsigned int minSize = Khash("AAAAAA", 5, 0);
int s = seq.size();
start = chrono::system_clock::now();
while (getline(in,seq)){
//a = ktable.Seq_Count(seq);
ktable.Increment(seq);
}
end = chrono::system_clock::now();
elapsed_seconds = end - start;
cout << "K_Table took " << elapsed_seconds.count() << "seconds" << endl;
in.close();
ktable.Reset_Table();
return 0;
}
//==========================================================================================================================
vector<string> collapse(vector<vector<string>> &vect){
vector<string> ret;
vector<string> inner;
string temp;
int ls = vect.size();
int ls2;
for (unsigned int i=0; i < ls; i++){
temp = "";
inner = vect[i];
ls2 = inner.size();
for (unsigned int k=0; k < ls2; k++){
temp += inner[k];
}
ret.push_back(temp);
}
return ret;
}
void getAllKLengthRec(vector<string> set, vector<vector<string>> &vect, vector<string> prefix, int n, int k){
//Base case: k is 0,
//print prefix
if (k == 0) {
vect.push_back(prefix);
return;
}
//One by one add all characters
//from set and recursively
//call for k equals to k-1
for (int i = 0; i < n; i++) {
vector<string> newPrefix;
//Next character of input added
newPrefix = prefix;
newPrefix.push_back(set[i]);
//prefix.push_back(set[i]);
//k is decreased, because
//we have added a new character
getAllKLengthRec(set, vect, newPrefix, n, k - 1);
}
}
//Method to generate permutations, driver of getAllKLengthRec
void getAllKLength(vector<string> set, vector<vector<string>> &vect, int k,int n){
vector<string> temp;
getAllKLengthRec(set, vect, temp, n, k);
}
我希望类K_Table的增长比unordered_map快,因为访问更快,unordered_map的增长快得多。我无法使用简单数组重现此问题而不将它们包装在一个类中,但是我找不到该类如何减慢速度。
编辑:我想避免使用需要C ++ 11的选项,因为这将是一个可以与任何版本一起使用的工具。 链接到数据文件:https://www.dropbox.com/s/p6x68w9k56w5qkp/seqran.txt?dl=0