Question

我正在尝试使用OpenMP并行化一段已经用内在函数（intrinsics）向量化的代码，但问题是我把一个XMM寄存器用作循环外部的“变量”，并在每次循环迭代中对它进行累加。目前我使用的是shared子句：

// Two-lane double accumulator, declared outside the loop and started at zero.
__m128d xmm0 = _mm_setzero_pd();
// 16-byte aligned scratch buffer that receives the two lanes of xmm0 after the loop.
__declspec(align(16)) double res[2];

// NOTE(review): xmm0 is explicitly shared and is read-modified-written by every
// iteration below with no synchronization, so concurrent iterations can race on it.
#pragma omp parallel for shared(xmm0)
for (int i = 0; i < len; i++)
{
    __m128d xmm7 = ... result of some operations

    // Lane-wise add of this iteration's contribution into the shared accumulator.
    xmm0 = _mm_add_pd(xmm0, xmm7);
}

// Write both lanes to memory and add them to get the scalar total.
_mm_store_pd(res, xmm0);
double final_result = res[0] + res[1];

之所以这样做，是因为（在VS2010中）不支持下面这种atomic操作的写法：

// Two-lane double accumulator, declared outside the loop and started at zero.
__m128d xmm0 = _mm_setzero_pd();
// 16-byte aligned scratch buffer that receives the two lanes of xmm0 after the loop.
__declspec(align(16)) double res[2];

#pragma omp parallel for
for (int i = 0; i < len; i++)
{
    __m128d xmm7 = ... result of some operations

    // Attempt to make the accumulator update atomic. The statement is an
    // assignment from an intrinsic call on an __m128d, not a scalar update;
    // the surrounding text reports that VS2010 does not support this.
    #pragma omp atomic
    xmm0 = _mm_add_pd(xmm0, xmm7);
}

// Write both lanes to memory and add them to get the scalar total.
_mm_store_pd(res, xmm0);
double final_result = res[0] + res[1];

有没有人知道一个聪明的解决方法？

编辑：我刚刚使用Parallel Patterns Library尝试过它：

// 16-byte aligned scratch buffer that receives the two lanes of the combined sum.
__declspec(align(16)) double res[2];
// PPL combinable: holds one __m128d accumulator per thread; the lambda supplies
// the zero initial value for each thread-local copy.
combinable<__m128d> xmm0_comb([](){return _mm_setzero_pd();});

parallel_for(0, len, 1, [&xmm0_comb, ...](int i)
{
    __m128d xmm7 = ... result of some operations

    // Fetch this thread's private accumulator and add the iteration's
    // contribution to it; local() is called once per iteration here.
    __m128d& xmm0 = xmm0_comb.local();
    xmm0 = _mm_add_pd(xmm0, xmm7);
});

// Merge the per-thread accumulators with a lane-wise add, then sum the two lanes.
__m128d xmm0 = xmm0_comb.combine([](__m128d a, __m128d b){return _mm_add_pd(a, b);});
_mm_store_pd(res, xmm0);
double final_result = res[0] + res[1];

但结果令人失望。

Answer 1

你解决问题的思路不对。你应该使用归约（reduction），而不是原子操作：

这是一种更好的方法：

// Scalar reduction variable: OpenMP gives each thread a private copy and
// combines the copies with '+' when the loop finishes.
double sum = 0;

#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < len; i++)
{
    __m128d xmm7;// = ... result of some operations

    //  Collapse the two-lane vector to a scalar "double" via a 16-byte aligned
    //  scratch buffer.
    //  NOTE(review): this store and horizontal add run on every iteration.
    _declspec(align(16)) double res[2];
    _mm_store_pd(res, xmm7);

    //  Add both lanes to this thread's copy of the reduction variable.
    sum += res[0] + res[1];
}

// After the loop, sum holds the combined total from all threads.
double final_result = sum;

归约本质上是这样一种操作：它使用满足结合律的运算（例如+）把所有内容“归约”为单个变量。

如果你要做的是归约，请始终尽量使用真正的归约方法。不要试图用原子操作或临界区（critical section）来变相实现它。

原因在于，原子操作/临界区的做法本质上不具有可扩展性，因为它们保留了一条很长的关键路径数据依赖。恰当的归约方法可以把这条关键路径缩短到log(线程数)。

唯一的缺点当然是它破坏了浮点运算的结合性（求和顺序改变，结果可能有细微差异）。如果这一点很重要，那么你基本上只能按顺序逐次迭代求和。

Answer 2

你要找的是归约（reduction）。如果你的编译器支持（gcc支持），可以直接使用OpenMP的reduction子句；或者你也可以自己实现：让每个线程累加到各自私有的xmm变量中，最后再汇总。下面是这两种做法的简单版本：

#include <emmintrin.h>
#include <omp.h>
#include <stdio.h>


// Demo: sums __m128d values across OpenMP threads in two ways -- a hand-rolled
// array with one accumulator per thread, and an OpenMP reduction clause applied
// directly to an __m128d -- and prints both totals next to the closed-form
// expected value. argc/argv are unused.
int main(int argc, char **argv) {

    // Thread count requested for every parallel region below; also the number
    // of slots in the per-thread accumulator array.
    const int NTHREADS=8;
    const int len=100;

    // Hand-rolled variant: one accumulator slot per thread, indexed by thread id.
    __m128d xmm0[NTHREADS];
    // Reduction-clause variant: a single accumulator handed to OpenMP.
    __m128d xmmreduction = _mm_setzero_pd();
    // Zero every per-thread slot before accumulating.
    #pragma omp parallel for num_threads(NTHREADS)
    for (int i=0; i<NTHREADS; i++)
        xmm0[i]= _mm_setzero_pd();

    // 16-byte aligned scratch buffer (GCC attribute syntax) for _mm_store_pd.
    __attribute((aligned(16))) double res[2];

    // NOTE(review): reduction(+:...) on an __m128d depends on the compiler
    // accepting a vector type in a reduction clause; the accompanying text
    // says gcc does.
    #pragma omp parallel num_threads(NTHREADS) reduction(+:xmmreduction)
    {
        // Each thread writes only to its own slot xmm0[tid].
        int tid = omp_get_thread_num();
        #pragma omp for
        for (int i = 0; i < len; i++)
        {
            // Test data: one lane holds i and the other 2*i, so each iteration
            // contributes 3*i to the final scalar total.
            double d = (double)i;
            __m128d xmm7 = _mm_set_pd( d, 2.*d );

            // The same contribution is added to both accumulators so the two
            // approaches can be compared at the end.
            xmm0[tid] = _mm_add_pd(xmm0[tid], xmm7);
            xmmreduction = _mm_add_pd(xmmreduction, xmm7);
        }
    }

    // Serial combine of the per-thread partial sums into slot 0.
    for (int i=1; i<NTHREADS; i++)
        xmm0[0] = _mm_add_pd(xmm0[0], xmm0[i]);

    // Collapse the two lanes of the hand-rolled result into one scalar.
    _mm_store_pd(res, xmm0[0]);
    double final_result = res[0] + res[1];

    // Closed form: the sum of 3*i for i in [0, len) is 3*(len-1)*len/2.
    printf("Expected result   = %f\n", 3.0*(len-1)*(len)/2);
    printf("Calculated result = %lf\n", final_result);

    // Collapse the reduction-clause result the same way.
    _mm_store_pd(res, xmmreduction);
    final_result = res[0] + res[1];

    printf("Calculated result (reduction) = %lf\n", final_result);

    return 0;
}

Answer 3

在回答我问题的人的大力帮助下，我想出了这个：

// Scalar reduction variable: each thread gets a private copy, and OpenMP sums
// the copies at the end of the parallel region.
double final_result = 0.0;

#pragma omp parallel reduction(+:final_result)
{
    // Declared inside the parallel region, so every thread has its own
    // scratch buffer and its own vector accumulator.
    __declspec(align(16)) double r[2];
    __m128d xmm0 = _mm_setzero_pd();

    // Iterations are divided among the threads of the enclosing region; the
    // loop body only touches the thread's private xmm0.
    #pragma omp for
    for (int i = 0; i < len; i++)
    {
        __m128d xmm7 = ... result of some operations

        xmm0 = _mm_add_pd(xmm0, xmm7);
    }
    // Collapse the vector to a scalar once per thread, after the loop, and
    // add it to this thread's copy of the reduction variable.
    _mm_store_pd(r, xmm0);
    final_result += r[0] + r[1];
}

它基本上把折叠（collapse，即把XMM向量合并成一个double）和归约（reduction）分开了，性能非常好。

非常感谢所有帮助过我的人！

Answer 4

我猜你无法向编译器添加自己的内在函数（intrinsics），而且MS编译器已经决定不再支持内联汇编。我不确定是否存在简单的解决方案。

OpenMP atomic _mm_add_pd

4 个答案: