Question

我有一段代码，正在努力给它提速。首先，我使用了SSE内在函数并获得了显著的收益。我现在想看看能否用AVX内在函数做类似的事情。代码本质上是取两个数组，根据需要将它们相加或相减，对结果求平方，然后把所有这些平方值加起来。

下面是使用sse intrinsics的代码的简化版本：

// Scratch buffer used to read the partial sums back out of the vector
// register; 16-byte aligned because _mm_store_ps requires it.
float chiList[4] __attribute__((aligned(16)));
float chi = 0.0;
__m128 res;
__m128 nres;
__m128 del;
__m128 chiInter2;
__m128 chiInter;
// NOTE(review): runNum, boundary, resids, residDeltas, param and maxPts are
// defined outside this simplified excerpt; runNum is not updated here.
while(runNum<boundary)
{
    //reset the four running partial sums of squares
    chiInter = _mm_setzero_ps();
    // NOTE(review): assumes maxPts is a multiple of 4 and that both arrays
    // are 16-byte aligned (aligned load/store intrinsics) -- confirm.
    for(int i=0; i<maxPts; i+=4)
    {
        //load the next batch of 4 residuals and 4 deltas
        res = _mm_load_ps(resids+i);
        del = _mm_load_ps(residDeltas[param]+i);
        //subtract them
        nres = _mm_sub_ps(res,del);
        //store the updated residuals back to memory
        _mm_store_ps(resids+i,nres);
        //square them and accumulate into chiInter with the fused
        //multiply-add instruction: chiInter = nres*nres + chiInter
        chiInter = _mm_fmadd_ps(nres, nres, chiInter);
    }
    //horizontal sum of the 4 partial sums {c0,c1,c2,c3}; done this way
    //because testing shows it is faster than the commented out way below.
    //chiInter2 is chiInter with its elements reversed: {c3,c2,c1,c0}
    chiInter2 = _mm_shuffle_ps(chiInter,chiInter,_MM_SHUFFLE(0,1,2,3));
    //add the two: elements 0 and 1 now hold c0+c3 and c1+c2
    _mm_store_ps(chiList,_mm_add_ps(chiInter,chiInter2));
    //add those two pair sums to get the full total
    chi=chiList[0]+chiList[1];
    //now do stuff with the chi^2
    //alternatively, the slow way
    //_mm_store_ps(chiList,chiInter);
    //chi=chiList[0]+chiList[1]+chiList[2]+chiList[3];
}

这让我想到了第一个问题：有没有办法做最后一点（我在chiInter中使用4个浮点数并将它们合并为一个浮点数）更优雅？

无论如何，我现在正试图使用avx内在函数来实现这一点，大部分过程非常简单，遗憾的是我卡在了最后一步：把8个中间chi值归约为单个值。

以下是avx内在函数的类似简化代码：

// Scratch buffer used to read the 8 partial sums back out of the vector
// register; 32-byte aligned because _mm256_store_ps requires it.
float chiList[8] __attribute__((aligned(32)));
__m256 res;
__m256 del;
__m256 nres;
__m256 chiInter;
// NOTE(review): chi, runNum, boundary, resids, residDeltas, param and maxPts
// are defined outside this simplified excerpt; runNum is not updated here.
while(runNum<boundary)
{
    //reset the eight running partial sums of squares
    chiInter = _mm256_setzero_ps();
    // NOTE(review): assumes maxPts is a multiple of 8 and that both arrays
    // are 32-byte aligned (aligned load/store intrinsics) -- confirm.
    for(int i=0; i<maxPts; i+=8)
    {
        //load the next batch of 8 residuals and 8 deltas
        res = _mm256_load_ps(resids+i);
        del = _mm256_load_ps(residDeltas[param]+i);
        //subtract them
        nres = _mm256_sub_ps(res,del);
        //store the updated residuals back to memory
        _mm256_store_ps(resids+i,nres);
        //square them and accumulate into chiInter with the fused
        //multiply-add instruction: chiInter = nres*nres + chiInter
        chiInter = _mm256_fmadd_ps(nres, nres, chiInter);
    }
    //horizontal sum: spill the 8 partial sums to memory and add them
    //one by one in scalar code
    _mm256_store_ps(chiList,chiInter);
    chi=chiList[0]+chiList[1]+chiList[2]+chiList[3]+
        chiList[4]+chiList[5]+chiList[6]+chiList[7];
}

我的第二个问题是：是否有某种方法，类似于我在上面SSE版本中使用的技巧，能让我更快地完成最后的求和？或者，如果在SSE内在函数中有更好的做法，它是否有对应的AVX内在函数版本？

Answer 1

此操作称为水平求和（horizontal sum）。假设你有一个向量v={x0,x1,x2,x3,x4,x5,x6,x7}。首先，提取高/低两半，得到w1={x0,x1,x2,x3}和w2={x4,x5,x6,x7}。现在调用_mm_hadd_ps(w1, w2)，得到：tmp1={x0+x1,x2+x3,x4+x5,x6+x7}。同样，_mm_hadd_ps(tmp1,tmp1)给出tmp2={x0+x1+x2+x3,x4+x5+x6+x7,...}。最后再调用一次，_mm_hadd_ps(tmp2,tmp2)给出tmp3={x0+x1+x2+x3+x4+x5+x6+x7,...}。您也可以用简单的_mm_add_ps替换第一个_mm_hadd_ps。

这些都是未经测试并从文档中编写的。对速度也没有承诺......

Intel forum上有人显示另一个变体（查找HsumAvxFlt）。

我们还可以通过使用gcc test.c -Ofast -mavx2 -S

编译此代码来查看gcc建议的内容

/* Return the sum of the eight floats starting at t, accumulated in index
 * order from 0. The caller must pass a 32-byte-aligned pointer; that
 * promise is forwarded to the compiler via __builtin_assume_aligned. */
float f(float*t){
  const float *aligned = (const float*)__builtin_assume_aligned(t,32);
  float total = 0;
  int k = 0;
  while (k < 8) {
    total += aligned[k];
    ++k;
  }
  return total;
}

生成的test.s包含：

# NOTE(review): excerpt only -- presumably ymm0 already holds the 8 floats
# loaded from t; the load itself is not shown here.
# Horizontal add of adjacent pairs within each 128-bit lane.
vhaddps %ymm0, %ymm0, %ymm0
# Second horizontal add: each 128-bit lane of ymm1 now holds the sum of
# that lane's four floats.
vhaddps %ymm0, %ymm0, %ymm1
# Swap the two 128-bit lanes of ymm1 into ymm0.
vperm2f128  $1, %ymm1, %ymm1, %ymm0
# Add the two lane sums; every element of ymm0 is now the total of all 8.
vaddps  %ymm1, %ymm0, %ymm0

我有些惊讶最后一条指令不是vaddss，但我想这并不重要。

使用sse和avx内在函数将一组压缩单一添加到一个值中

1 个答案: