Question

我已经编写了一些模拟代码，并使用了“在GDB中随机中断”的调试方法。我发现程序99.9％的时间都花在这个例程上（它实现的是最小镜像约定，minimum image convention）：

/*
 * Squared distance between two 3-D points in a cubic periodic box, using
 * the minimum image convention.
 *
 * position1, position2: arrays holding at least three coordinates (x, y, z).
 * boxWidth:             edge length of the periodic box.
 *
 * Returns x*x + y*y + z*z of the wrapped separation vector.
 *
 * Each component is shifted by at most one boxWidth, so the result is only
 * the true minimum-image distance when every component of the raw
 * separation has magnitude no larger than 1.5 * boxWidth.
 *
 * The function is `static inline` rather than plain `inline`: in C99/C11 a
 * plain inline definition provides no external definition, so any call the
 * compiler chooses not to inline would fail at link time.
 */
static inline double distanceSqPeriodic(double const * const position1, double const * const position2, double boxWidth) {
        /* The box is cubic, so one half-width serves all three axes. */
        const double halfWidth = boxWidth / 2.0;

        double x = position2[0] - position1[0];
        if (x > halfWidth) {
                x -= boxWidth;
        } else if (x < -halfWidth) {
                x += boxWidth;
        }

        double y = position2[1] - position1[1];
        if (y > halfWidth) {
                y -= boxWidth;
        } else if (y < -halfWidth) {
                y += boxWidth;
        }

        double z = position2[2] - position1[2];
        if (z > halfWidth) {
                z -= boxWidth;
        } else if (z < -halfWidth) {
                z += boxWidth;
        }

        return x * x + y * y + z * z;
}

到目前为止我执行的优化（可能不是很重要）：

返回距离的平方而不是平方根
内联它
尽可能使用const
没有标准库膨胀
使用我能想到的每个g ++优化标志进行编译

我已经想不出别的办法了。也许我可以使用float而不是double，但我宁愿把它作为最后的手段。也许我可以以某种方式使用SIMD，但我从未用过，所以我猜工作量会很大。有什么想法吗？

谢谢

Answer 1

首先，您没有使用正确的算法。如果这两个点之间的距离大于boxWidth怎么办？其次，如果您有多个粒子，调用一个一次性完成所有距离计算并将结果写入输出缓冲区的函数，效率会显著提高。内联有助于减少一部分开销，但不是全部。任何预先计算——比如算法中将盒子宽度除以2——都会在不必要的情况下被重复执行。

这是一些用于计算的SIMD代码。您需要使用-msse4进行编译。在我的机器上使用-O3（macbook pro，llvm-gcc-4.2），我的速度提高了大约2倍。这确实需要使用32位浮点数而不是双精度算法。

SSE其实并不那么复杂，只是看起来很吓人。例如，您不能直接写a * b，而必须写成笨重的_mm_mul_ps(a, b)。

/*
 * Benchmark: minimum-image squared distance, scalar version vs. SSE4.1.
 *
 * Build flags:
 *   -msse4     required (the SSE path uses _mm_round_ps)
 *   -DOPT      time the SSE implementation instead of the scalar one
 *   -DDOUBLE   use doubles in the scalar implementation (not valid with -DOPT)
 */

/* posix_memalign is a POSIX extension; request it explicitly so that its
 * prototype is visible when compiling with a strict -std=c99/-std=c11. */
#define _POSIX_C_SOURCE 200112L

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <smmintrin.h>

// you can compile this code with -DDOUBLE to try using doubles vs. floats
// in the unoptimized code. The SSE code uses only floats.
#ifdef DOUBLE
typedef double real;
#else
typedef float real;
#endif

#if defined(OPT) && defined(DOUBLE)
#error "The SSE path (-DOPT) operates on floats only; do not combine it with -DDOUBLE."
#endif

/* Load (x, y, z) into an SSE register, leaving the last lane set to zero.
 * Three scalar loads are used so that no fourth float is read from memory. */
static inline __m128 loadFloat3(const float *value) {
    __m128 x = _mm_load_ss(&value[0]);
    __m128 y = _mm_load_ss(&value[1]);
    __m128 z = _mm_load_ss(&value[2]);
    __m128 xy = _mm_movelh_ps(x, y);                        /* (x, 0, y, 0) */
    return _mm_shuffle_ps(xy, z, _MM_SHUFFLE(2, 0, 2, 0));  /* (x, y, z, 0) */
}

/*
 * Compute n_points minimum-image squared distances.
 *
 * position1, position2: packed (x, y, z) triples, 3 * n_points floats each.
 * boxWidth:             edge length of the cubic periodic box.
 * out:                  receives n_points squared distances.
 *
 * The separation is wrapped with r - box * round(r / box), so separations
 * larger than one box width are handled as well. Always returns 1.
 */
int fdistanceSqPeriodic(const float *position1, const float *position2,
                        const float boxWidth, float *out, const int n_points) {
    int i;
    __m128 r1, r2, r12, s12, r12_2, s, box, invBox;

    box = _mm_set1_ps(boxWidth);
    invBox = _mm_div_ps(_mm_set1_ps(1.0f), box);

    for (i = 0; i < n_points; i++) {
        r1 = loadFloat3(position1);
        r2 = loadFloat3(position2);  /* was position1: every distance came out as 0 */
        r12 = _mm_sub_ps(r1, r2);

        /* Work in units of the box width and drop the nearest integer. */
        s12 = _mm_mul_ps(r12, invBox);
        s12 = _mm_sub_ps(s12, _mm_round_ps(s12, _MM_FROUND_TO_NEAREST_INT));
        r12 = _mm_mul_ps(box, s12);

        r12_2 = _mm_mul_ps(r12, r12);

        // double horizontal add instruction accumulates the sum of
        // all four elements into each of the elements
        // (e.g. s.x = s.y = s.z = s.w = r12_2.x + r12_2.y + r12_2.z + r12_2.w)
        s = _mm_hadd_ps(r12_2, r12_2);
        s = _mm_hadd_ps(s, s);

        _mm_store_ss(out++, s);
        position1 += 3;
        position2 += 3;
    }
    return 1;
}

/* Scalar reference implementation: each component of the separation is
 * shifted by at most one boxWidth. `static inline` so that a call that is
 * not inlined still links in C99/C11. */
static inline real distanceSqPeriodic(real const * const position1, real const * const position2, real boxWidth) {
    real xhw, yhw, zhw, x, y, z;

    xhw = boxWidth / 2.0;
    yhw = xhw;
    zhw = xhw;

    x = position2[0] - position1[0];
    if (x > xhw)
        x -= boxWidth;
    else if (x < -xhw)
        x += boxWidth;

    y = position2[1] - position1[1];
    if (y > yhw)
        y -= boxWidth;
    else if (y < -yhw)
        y += boxWidth;

    z = position2[2] - position1[2];
    if (z > zhw)
        z -= boxWidth;
    else if (z < -zhw)
        z += boxWidth;

    return x * x + y * y + z * z;
}

/* Allocate `count` reals aligned to 16 bytes; returns NULL on failure. */
static real *allocReals(size_t count) {
    void *mem = NULL;
    if (posix_memalign(&mem, 16, count * sizeof(real)) != 0)
        return NULL;
    return mem;
}

int main(void) {
    const int n_runs = 10000000;
    const size_t n_coords = (size_t)n_runs * 3;
    const real boxWidth = 1.8;
    int status = EXIT_FAILURE;
    clock_t t;

    real *position1 = allocReals(n_coords);
    real *position2 = allocReals(n_coords);
    real *output = allocReals((size_t)n_runs);
    if (position1 == NULL || position2 == NULL || output == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    /* posix_memalign does not initialize memory; fill the inputs with
     * reproducible coordinates inside the box before timing anything. */
    srand(12345);
    for (size_t k = 0; k < n_coords; k++) {
        position1[k] = boxWidth * ((real)rand() / (real)RAND_MAX);
        position2[k] = boxWidth * ((real)rand() / (real)RAND_MAX);
    }

#ifdef OPT
    printf("Timing optimized SSE implementation\n");
#else
    printf("Timinig original implementation\n");
#endif

#ifdef DOUBLE
    printf("Using double precision\n");
#else
    printf("Using single precision\n");
#endif

    t = clock();
#ifdef OPT
    fdistanceSqPeriodic(position1, position2, boxWidth, output, n_runs);
#else
    /* Index instead of advancing the base pointers so they can be freed. */
    for (int i = 0; i < n_runs; i++) {
        output[i] = distanceSqPeriodic(&position1[3 * (size_t)i],
                                       &position2[3 * (size_t)i], boxWidth);
    }
#endif
    t = clock() - t;

    printf("It took me %d clicks (%f seconds).\n", (int) t, ((float)t)/CLOCKS_PER_SEC);
    status = EXIT_SUCCESS;

cleanup:
    free(output);
    free(position2);
    free(position1);
    return status;
}

Answer 2

您可能希望使用fabs（在ISO C90中已标准化），因为它应该可以简化为单条无分支指令。

Answer 3

Return the square of the distance instead of the square root

只要您将正方形与正方形进行比较，这是一个好主意。

Inline it

这有时是一种反优化：内联代码占用执行管道/缓存中的空间，无论它是否分支。

通常它没有区别，因为编译器有关于是否内联的最终决定。

Const what I can

通常没有任何区别。

No standard library bloat

什么膨胀？

Compiling with every g++ optimization flag I can think of

这很好：将大多数优化留给编译器。只有当你衡量了真正的瓶颈，并确定是否存在重大瓶颈时，才能投入资金进行优化。

您可以尝试做的是让您的代码无分支。不使用位掩码，这可能如下所示：

// Branching original, kept for comparison:
//if (z > zhw) 
//        z -= boxWidths[2];
//else if (z < -zhw)
//      z += boxWidths[2]; 

// Branch-free selection: the comparison evaluates to 0 or 1 and is used as
// an index into the table of candidates. Only the upper wrap (z > zhw) is
// shown. `auto` cannot declare an array, so the element type is spelled out
// (double here, presumably matching z; use float if the coordinates are floats).
const double z_a[] = {
    z,
    z - boxWidths[2]
};
z = z_a[z>zhw];
...

或

// The comparison yields 0 or 1, so boxWidths[2] is subtracted only when z > zhw.
z -= (z>zhw) * boxWidths[2];

然而，无法保证这更快。您的编译器现在可能很难识别代码中的SIMD点，或者分支目标缓冲区做得很好，并且大多数时候您的函数都有相同的代码路径。

Answer 4

你需要摆脱比较，因为这些很难预测。

要实施的功能是：

     /    /           /\  /\
    /    /           /  \/  \
    ----0-----  or ------------ , as (-x)^2 == x^2
       /    / 
      /    /

后者是两次取绝对值的结果：

 x = abs(abs(diff)-half)-half;

代码

/*
 * Branch-free squared distance over the first three components of a and b.
 *
 * For each axis the separation b[i] - a[i] is folded with two absolute
 * values, fabs(fabs(d) - half) - half, and the squares of the folded values
 * are summed. `half` is presumably half the periodic box width (it is
 * called "half" in the accompanying explanation); the fourth array element
 * is never read.
 */
double tst(double a[4], double b[4], double half)
{
    double total = 0.0;

    for (int axis = 0; axis < 3; ++axis) {
        const double delta = b[axis] - a[axis];
        /* The sign of the folded value is irrelevant because it is squared. */
        const double folded = fabs(fabs(delta) - half) - half;
        total += folded * folded;
    }

    return total;
}

比原始实现快4倍（+ some） - 此时甚至没有完全并行性：只使用xmm寄存器的下半部分。

并行处理x和y，理论上的收益约为50％。从理论上讲，使用float代替double可以使速度提高约3倍。

需要帮助优化代码（最小图像约定）

4 个答案: