我编写了一个计算密集型程序,并用 cachegrind 对其进行了分析,结果显示数据缓存未命中可能是主要瓶颈。
我调整了程序的缓冲区大小(将计算单元的大小减小到 CPU 高速缓存的大小),缓存未命中率减少了一半,但仍有 5% 左右。还有什么可以优化的地方吗?
调整前
==2729==
==2729== I refs: 20,104,725,617
==2729== I1 misses: 1,899
==2729== LLi misses: 1,835
==2729== I1 miss rate: 0.00%
==2729== LLi miss rate: 0.00%
==2729==
==2729== D refs: 6,754,924,837 (3,968,410,253 rd + 2,786,514,584 wr)
==2729== D1 misses: 697,140,469 ( 549,560,569 rd + 147,579,900 wr)
==2729== LLd misses: 74,594,136 ( 74,241,144 rd + 352,992 wr)
==2729== D1 miss rate: 10.3% ( 13.8% + 5.2% )
==2729== LLd miss rate: 1.1% ( 1.8% + 0.0% )
==2729==
==2729== LL refs: 697,142,368 ( 549,562,468 rd + 147,579,900 wr)
==2729== LL misses: 74,595,971 ( 74,242,979 rd + 352,992 wr)
==2729== LL miss rate: 0.2% ( 0.3% + 0.0% )
调整后
==6996== I refs: 645,316,413
==6996== I1 misses: 1,884
==6996== LLi misses: 1,628
==6996== I1 miss rate: 0.00%
==6996== LLi miss rate: 0.00%
==6996==
==6996== D refs: 215,556,739 (127,281,049 rd + 88,275,690 wr)
==6996== D1 misses: 9,460,159 ( 6,718,647 rd + 2,741,512 wr)
==6996== LLd misses: 20,887 ( 6,607 rd + 14,280 wr)
==6996== D1 miss rate: 4.3% ( 5.2% + 3.1% )
==6996== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==6996==
==6996== LL refs: 9,462,043 ( 6,720,531 rd + 2,741,512 wr)
==6996== LL misses: 22,515 ( 8,235 rd + 14,280 wr)
==6996== LL miss rate: 0.0% ( 0.0% + 0.0% )
按照 Mysticial 的建议,附上最耗时的函数。
附:galois_region_xor 对两个内存区域做异或(XOR)运算,
galois_w08_region_multby_2_64 做的事情与之类似;
它们都非常耗时,但实际上已经过优化。
/*
 * Combine the regions of in_buff into out_buff according to the
 * coefficient matrix `mat`.
 *
 * in_buff is treated as k = mat.cc consecutive regions of
 * size_in_buff / k bytes each; out_buff as m = mat.rr regions of the same
 * size (size_in_buff * m / k bytes are zeroed up front).  For every
 * column j and row i, input region j is combined with the coefficient
 * mat.ele[i*k + j] and the result is XORed into output region i:
 *   - coefficient 0: nothing is done;
 *   - coefficient 1: the input region is XORed straight into the output;
 *   - otherwise: the low `w` bits of the coefficient are scanned from the
 *     most significant bit down, accumulating into a scratch region with
 *     galois_region_xor and galois_w08_region_multby_2_64, and the scratch
 *     region is then XORed into the output.
 *
 * NOTE(review): galois_region_xor(a, b, dst, n) is assumed to store a ^ b
 * into dst, and galois_w08_region_multby_2_64(buf, n) is assumed to
 * multiply every element of buf by 2 in place; neither is defined in this
 * block -- confirm against their definitions.
 *
 * mat          coefficient matrix (mat.rr rows, mat.cc columns, row-major
 *              in mat.ele).
 * w            number of coefficient bits to scan; must be at least 1 and
 *              no larger than the bit width of unsigned int.
 * out_buff     destination, at least size_in_buff * mat.rr / mat.cc bytes.
 * in_buff      source, at least size_in_buff bytes.
 * size_in_buff total size of in_buff in bytes.
 *
 * The function has no way to report errors; if the scratch region cannot
 * be allocated it calls abort() instead of writing through a null pointer.
 */
void shift_coding( const GMatrixU8& mat,
                   const int w,
                   unsigned char * const out_buff,
                   unsigned char * const in_buff,
                   const unsigned long& size_in_buff ){
    const int k = mat.cc;
    const int m = mat.rr;
    const unsigned long size_comp_buff = size_in_buff/k;
    /* Unsigned shift so that the top bit of an unsigned int is reachable. */
    const unsigned int top_mask = 1u<<(w-1);

    /* Scratch region used only for coefficients greater than 1. */
    unsigned char * const pcom = (unsigned char *)malloc(size_comp_buff);
    if(NULL == pcom && 0 != size_comp_buff){
        abort();
    }

    memset(out_buff , 0 , size_in_buff*m/k);

    for(int j = 0 ; j < k ; ++j){
        unsigned char * const psrc = in_buff + j*size_comp_buff;
        for(int i = 0 ; i < m ; ++i){
            unsigned char * const pdes = out_buff + i*size_comp_buff;
            const unsigned int val = mat.ele[i*k + j];

            if(0 == val){continue;}
            if(1 == val){
                galois_region_xor(psrc, pdes, pdes, size_comp_buff);
                continue;
            }

            /* The scratch region is cleared only here: the 0 and 1 paths
             * above never read it, so zeroing it for them was wasted
             * memory traffic. */
            memset(pcom, 0, size_comp_buff);

            /* Becomes true at the first set bit; before that the scratch
             * region is still all zero and doubling it is skipped. */
            bool start_2 = false;
            for(unsigned int mask = top_mask ; 0 < mask ; mask >>= 1){
                if(mask & 1){
                    /* Lowest bit: add the source, no doubling afterwards. */
                    if(val & 1){
                        galois_region_xor(psrc , pcom , pcom , size_comp_buff);
                    }
                    continue;
                }
                if(0 != (val & mask)){
                    start_2 = true;
                    galois_region_xor(psrc , pcom , pcom , size_comp_buff);
                    galois_w08_region_multby_2_64(pcom , size_comp_buff);
                }else{
                    if(start_2){
                        galois_w08_region_multby_2_64(pcom , size_comp_buff);
                    }
                }
            }
            galois_region_xor(pcom , pdes , pdes , size_comp_buff);
        }
    }
    free(pcom);
}