我修改了一个开源项目(粉末玩具)并始终使用/ O2(最大化速度)选项进行编译,启用了SSE2代码生成并且只检查了这个:
void membwand(void * destv, void * srcv, size_t destsize, size_t srcsize)
{
size_t i;
unsigned char * dest = (unsigned char*)destv;
unsigned char * src = (unsigned char*)srcv;
for(i = 0; i < destsize; i++){
dest[i] = dest[i] & src[i%srcsize];
}
}
这里我替换了下面的行
membwand(gravy, gravmask, size_dst, size_src);
membwand(gravx, gravmask, size_dst, size_src);
// gravy, gravx and gravmask are 1MB each
与
membwand2(gravy,gravx, gravmask, size_dst, size_src);
// sizes are same, why not put all in a single function?
实现为:
void membwand2(void * destv1, void * destv2,void * srcv, size_t destsize, size_t srcsize)
{
if(destsize!=srcsize)
{
size_t i;
unsigned char * dest1 = (unsigned char*)destv1;
unsigned char * dest2 = (unsigned char*)destv2;
unsigned char * src = (unsigned char*)srcv;
for(i = 0; i < destsize; i++)
{
dest1[i] = dest1[i] & src[i%srcsize];
dest2[i] = dest2[i] & src[i%srcsize];
}
}
else
{
size_t i;
unsigned char * dest1 = (unsigned char*)destv1;unsigned char * dest2 = (unsigned char*)destv2;
unsigned char * src = (unsigned char*)srcv;
unsigned char tmpChar0=0;unsigned char tmpChar1=0;unsigned char tmpChar2=0;
unsigned char tmpChar3=0;unsigned char tmpChar4=0;unsigned char tmpChar5=0;
unsigned char tmpChar6=0;unsigned char tmpChar7=0;unsigned char tmpChar8=0;
unsigned char tmpChar9=0;unsigned char tmpChar10=0;unsigned char tmpChar11=0;
unsigned char tmpChar12=0;unsigned char tmpChar13=0;unsigned char tmpChar14=0;
unsigned char tmpChar15=0;unsigned char tmpChar16=0;unsigned char tmpChar17=0;
unsigned char tmpChar18=0;unsigned char tmpChar19=0;unsigned char tmpChar20=0;
unsigned char tmpChar21=0;unsigned char tmpChar22=0;unsigned char tmpChar23=0;
unsigned char tmpChar24=0;unsigned char tmpChar25=0;unsigned char tmpChar26=0;
unsigned char tmpChar27=0;unsigned char tmpChar28=0;unsigned char tmpChar29=0;
unsigned char tmpChar30=0;unsigned char tmpChar31=0;
for(i = 0; i < destsize; i+=32)
{
tmpChar0=src[i];tmpChar1=src[i+1];tmpChar2=src[i+2];tmpChar3=src[i+3];
tmpChar4=src[i+4];tmpChar5=src[i+5];tmpChar6=src[i+6];tmpChar7=src[i+7];
tmpChar8=src[i+8];tmpChar9=src[i+9];tmpChar10=src[i+10];tmpChar11=src[i+11];
tmpChar12=src[i+12];tmpChar13=src[i+13];tmpChar14=src[i+14];tmpChar15=src[i+15];
tmpChar16=src[i+16];tmpChar17=src[i+17];tmpChar18=src[i+18];tmpChar19=src[i+19];
tmpChar20=src[i+20];tmpChar21=src[i+21];tmpChar22=src[i+22];tmpChar23=src[i+23];
tmpChar24=src[i+24];tmpChar25=src[i+25];tmpChar26=src[i+26];tmpChar27=src[i+27];
tmpChar28=src[i+28];tmpChar29=src[i+29];tmpChar30=src[i+30];tmpChar31=src[i+31];
dest1[i] = dest1[i] & tmpChar0;
dest1[i+1] = dest1[i+1] & tmpChar1;
dest1[i+2] = dest1[i+2] & tmpChar2;
dest1[i+3] = dest1[i+3] & tmpChar3;
dest1[i+4] = dest1[i+4] & tmpChar4;
dest1[i+5] = dest1[i+5] & tmpChar5;
dest1[i+6] = dest1[i+6] & tmpChar6;
dest1[i+7] = dest1[i+7] & tmpChar7;
dest1[i+8] = dest1[i+8] & tmpChar8;
dest1[i+9] = dest1[i+9] & tmpChar9;
dest1[i+10] = dest1[i+10] & tmpChar10;
dest1[i+11] = dest1[i+11] & tmpChar11;
dest1[i+12] = dest1[i+12] & tmpChar12;
dest1[i+13] = dest1[i+13] & tmpChar13;
dest1[i+14] = dest1[i+14] & tmpChar14;
dest1[i+15] = dest1[i+15] & tmpChar15;
dest1[i+16] = dest1[i+16] & tmpChar16;
dest1[i+17] = dest1[i+17] & tmpChar17;
dest1[i+18] = dest1[i+18] & tmpChar18;
dest1[i+19] = dest1[i+19] & tmpChar19;
dest1[i+20] = dest1[i+20] & tmpChar20;
dest1[i+21] = dest1[i+21] & tmpChar21;
dest1[i+22] = dest1[i+22] & tmpChar22;
dest1[i+23] = dest1[i+23] & tmpChar23;
dest1[i+24] = dest1[i+24] & tmpChar24;
dest1[i+25] = dest1[i+25] & tmpChar25;
dest1[i+26] = dest1[i+26] & tmpChar26;
dest1[i+27] = dest1[i+27] & tmpChar27;
dest1[i+28] = dest1[i+28] & tmpChar28;
dest1[i+29] = dest1[i+29] & tmpChar29;
dest1[i+30] = dest1[i+30] & tmpChar30;
dest1[i+31] = dest1[i+31] & tmpChar31;
dest2[i] = dest2[i] & tmpChar0;
dest2[i+1] = dest2[i+1] & tmpChar1;
dest2[i+2] = dest2[i+2] & tmpChar2;
dest2[i+3] = dest2[i+3] & tmpChar3;
dest2[i+4] = dest2[i+4] & tmpChar4;
dest2[i+5] = dest2[i+5] & tmpChar5;
dest2[i+6] = dest2[i+6] & tmpChar6;
dest2[i+7] = dest2[i+7] & tmpChar7;
dest2[i+8] = dest2[i+8] & tmpChar8;
dest2[i+9] = dest2[i+9] & tmpChar9;
dest2[i+10] = dest2[i+10] & tmpChar10;
dest2[i+11] = dest2[i+11] & tmpChar11;
dest2[i+12] = dest2[i+12] & tmpChar12;
dest2[i+13] = dest2[i+13] & tmpChar13;
dest2[i+14] = dest2[i+14] & tmpChar14;
dest2[i+15] = dest2[i+15] & tmpChar15;
dest2[i+16] = dest2[i+16] & tmpChar16;
dest2[i+17] = dest2[i+17] & tmpChar17;
dest2[i+18] = dest2[i+18] & tmpChar18;
dest2[i+19] = dest2[i+19] & tmpChar19;
dest2[i+20] = dest2[i+20] & tmpChar20;
dest2[i+21] = dest2[i+21] & tmpChar21;
dest2[i+22] = dest2[i+22] & tmpChar22;
dest2[i+23] = dest2[i+23] & tmpChar23;
dest2[i+24] = dest2[i+24] & tmpChar24;
dest2[i+25] = dest2[i+25] & tmpChar25;
dest2[i+26] = dest2[i+26] & tmpChar26;
dest2[i+27] = dest2[i+27] & tmpChar27;
dest2[i+28] = dest2[i+28] & tmpChar28;
dest2[i+29] = dest2[i+29] & tmpChar29;
dest2[i+30] = dest2[i+30] & tmpChar30;
dest2[i+31] = dest2[i+31] & tmpChar31;
}
}
}
对于两行版本,Omp挂钟时间 17ms ,对于单行版本, 1.5ms 。 (经过几千次迭代)
问题:可能导致9倍加速的原因是什么?数据重用src []或缓存利用率?也许代码不适合之前的矢量化,并且展开启用它,它可能是索引的缺失模数操作吗?我应该在循环顶部添加#pragma omp吗?
编辑: #pragma omp parallel for num_threads(4)让它变得更糟。也许只有1MB没有隐藏多线程开销?
编辑2:省略模数运算符使其为2.5毫秒,但性能从2.5毫秒增加到1.5毫秒(〜%60增加)必须来自展开/缓存/ ...
注意:启用“青睐快速代码”,启用整个程序优化,完全优化(/ Ox)和启用内在函数并没有改变速度(可能启用SSE2和/ O2就足够了)
CPU:FX8150 IDE:Msvc
答案 0 :(得分:3)
答案 1 :(得分:2)
在比较这两个函数时,有几点需要考虑;从高级别的角度来看,看起来2行版本应该比1行版本运行得更快,但是:
- the 1 line version has better instruction cache locatity
- the 1 line version puts less pressure on the registers( this is important
because if the compiler runs out of registers to use it may start using the
stack to move parameters around for the instructions it needs - which causes
an overhead)
作为优化提示:
- you can try and use the "restrict" keyword: **void membwand(void * _restrict_
destv, void * _restrict_ srcv, size_t destsize, size_t srcsize);** This tells the
compiler that the two pointers are not aliasing - they are will never point at the
same memory location so the read/write instruction can be executed in the same cycle.