Question

我刚开始学习 SIMD 编程，但现在不知道该怎么办。我本想缩短运行时间，结果却适得其反。

/*
 * Scalar reference version: for every i in [0, SIZE - 2], stores the sum of
 * two neighbouring inputs, r[i] = u[i] + u[i + 1].
 *
 * u: input array; elements u[0] .. u[SIZE - 1] are read.
 * r: output array; elements r[0] .. r[SIZE - 2] are written.
 *
 * The sum is stored as-is; no division by 2 is applied here.
 */
void blurr2(double * u, double * r) {
    for (int i = 0; i < SIZE - 1; i++) {
        r[i] = u[i] + u[i + 1];
    }
}

blurr2：0.43s

/*
 * Scalar reference version: returns how many of the elements
 * u[0] .. u[SIZE - 1] compare less than zero.
 */
int contarNegativos(double * u) {
    int negativos = 0;
    int k = 0;

    while (k < SIZE) {
        /* The comparison is false for +0.0, -0.0 and NaN, so those are not counted. */
        negativos += (u[k] < 0) ? 1 : 0;
        k++;
    }
    return negativos;
}

negativeCount：1.38s

/*
 * Scalar "OR" version from the question.
 *
 * For every even index i in [0, SIZE), reinterprets v[i] and u[i] as 64-bit
 * integers through pointer casts, ORs them, and assigns the integer result to
 * r[i]. Odd indices of r are never written because the loop advances by 2.
 *
 * NOTE(review): the assignment converts the integer OR result to double by
 * value; it does not store the OR-ed bit pattern. The SSE2 version of ord in
 * this post ORs the bit patterns directly, so the two are not equivalent.
 * NOTE(review): reading a double through an __int64 pointer is a
 * strict-aliasing violation in standard C; __int64 looks like the
 * MSVC-specific 64-bit integer type -- confirm the target compiler.
 */
void ord(double * v, double * u, double * r) {

    int i;

    // Step of 2: only even indices are processed.
    for (i = 0; i < SIZE; i += 2) {
        // Integer OR of the two bit patterns, then implicit integer -> double conversion.
        r[i] = *(__int64*)&(v[i]) | *(__int64*)&(u[i]);
    }
}

ord：0.33s

这是我的SIMD代码：

https://codepaste.net/fbg1g5

/*
 * SSE2 version of blurr2 from the question.
 *
 * Each iteration handles two outputs at once:
 *   r[i]     = (u[i]     + u[i + 1]) / 2.0
 *   r[i + 1] = (u[i + 1] + u[i + 2]) / 2.0
 *
 * NOTE(review): all vector loads/stores are done by casting a double* to
 * __m128d* and dereferencing it. &u[i] and &u[i + 1] are 8 bytes apart, so
 * they cannot both be 16-byte aligned; compilers that treat a __m128d
 * dereference as an aligned access will fault here. _mm_loadu_pd /
 * _mm_storeu_pd are the unaligned-safe intrinsics.
 * NOTE(review): assuming u and r hold SIZE elements, the last iteration reads
 * u[i + 2] past the end of u (for both even and odd SIZE), and for odd SIZE it
 * also writes r[SIZE] -- verify the bounds against the caller's buffers.
 */
void blurr2(double * u, double * r) {

    __m128d rp2;
    __m128d rdos;
    __m128d rr;
    int i;
    // Loop limit: SIZE when SIZE is odd, SIZE - 1 when it is even.
    int sizeAux = SIZE % 2 == 1 ? SIZE : SIZE - 1;
    double dos[2] = { 2.0, 2.0 };

    // Divisor vector {2.0, 2.0}, loaded through a pointer cast.
    rdos = *(__m128d*)dos;

    for (i = 0; i < sizeAux; i += 2) {
        // Right-hand neighbours: u[i + 1], u[i + 2].
        rp2 = *(__m128d*)&u[i + 1];
        // Add u[i], u[i + 1] to their right-hand neighbours.
        rr = _mm_add_pd(*(__m128d*)&u[i], rp2);
        // Divide both sums by 2.0 and store them to r[i], r[i + 1].
        *((__m128d*)&r[i]) = _mm_div_pd(rr, rdos);
    }
}

blurr2：0.42s

/*
 * SSE2 version of contarNegativos from the question.
 *
 * Compares two doubles per iteration against zero with _mm_cmplt_pd, then
 * inspects each 64-bit lane of the comparison result separately and
 * increments the counter for every non-zero lane. Returns the counter.
 *
 * NOTE(review): the per-lane checks read the vector through an __int64
 * pointer cast and branch on each lane inside the loop, so the comparison is
 * vectorised but the counting is still scalar.
 * NOTE(review): u is loaded by dereferencing a casted __m128d*, which is only
 * valid if u is 16-byte aligned -- confirm, or use _mm_loadu_pd.
 * NOTE(review): the loop steps by 2 and loads u[i], u[i + 1]; presumably SIZE
 * is even -- verify.
 */
int contarNegativos(double * u) {

    __m128d rcero;
    __m128d rr;
    int i;
    double cero[2] = { 0.0, 0.0 };
    int contador = 0;

    // Zero vector {0.0, 0.0}, loaded through a pointer cast.
    rcero = *(__m128d*)cero;

    for (i = 0; i < SIZE; i += 2) {
        // Each lane becomes all-ones if u[i + lane] < 0.0, otherwise all-zeros.
        rr = _mm_cmplt_pd(*(__m128d*)&u[i], rcero);
        // Lane 0: u[i] was negative.
        if (((__int64 *)&rr)[0]) {
            contador++;
        };
        // Lane 1: u[i + 1] was negative.
        if (((__int64 *)&rr)[1]) {
            contador++;
        };
    }
    return contador;
}

negativeCount：1.42s

/*
 * SSE2 version of ord from the question.
 *
 * Each iteration ORs the bit patterns of v[i], v[i + 1] with those of
 * u[i], u[i + 1] using _mm_or_pd and stores both results to r[i], r[i + 1],
 * so every index in [0, SIZE) is written (for even SIZE).
 *
 * NOTE(review): the local rr is declared but never used.
 * NOTE(review): loads and the store go through casted __m128d* pointers,
 * which is only valid if v, u and r are 16-byte aligned -- confirm, or use
 * _mm_loadu_pd / _mm_storeu_pd.
 */
void ord(double * v, double * u, double * r) {

    __m128d rr;
    int i;

    for (i = 0; i < SIZE; i += 2) {
        // Bitwise OR of two doubles at a time; the result keeps the raw bit pattern.
        *((__m128d*)&r[i]) = _mm_or_pd(*(__m128d*)&v[i], *(__m128d*)&u[i]);
    }
}

ord：0.35s

**不同的解决方案。

你能解释一下我做错了什么吗？我有点迷茫……

Answer 1

请使用 `_mm_loadu_pd`，而不是把指针强制转换为 `__m128d*` 再解引用。gcc / clang 假定 `__m128d` 是对齐的，因此你的代码在这些编译器上必然会发生段错误。

blurr2：乘以 0.5，而不是除以 2，这样会快得多。（最近一两天我在另一个代码完全相同的问题下发表过同样的评论，那也是你吗？）

negativeCount：用 `_mm_castpd_si128` 把比较结果转换为整数向量，再用 `_mm_sub_epi64` 累加。（比较结果的位模式为全 0 或全 1，即二进制补码（2's complement）的 0 / -1，所以减去它就相当于加上 0 或 1。）

#include <immintrin.h>
#include <stdint.h>

static const size_t SIZE = 1024;

/*
 * Count how many of u[0] .. u[SIZE - 1] compare less than zero, using SSE2.
 *
 * Two doubles are loaded per iteration with an unaligned load, so u does not
 * need to be 16-byte aligned. The loop steps by 2 and SIZE is even.
 *
 * Returns the number of negative elements.
 */
uint64_t countNegative(double * u) {
    const __m128d zero = _mm_setzero_pd();
    __m128i acc = _mm_setzero_si128();

    for (size_t idx = 0; idx < SIZE; idx += 2) {
        const __m128d pair = _mm_loadu_pd(u + idx);
        // Each 64-bit lane of the mask is all-ones (-1) where the element is < 0, else 0.
        const __m128i mask = _mm_castpd_si128(_mm_cmplt_pd(pair, zero));
        // Subtracting -1 adds one to that lane's running count.
        acc = _mm_sub_epi64(acc, mask);
    }

    // Horizontal sum, done once outside the loop: swap the two 64-bit halves
    // and add. (GNU C could write acc[0] + acc[1], but that is compiler-specific
    // and less efficient.)
    const __m128i swapped = _mm_shuffle_epi32(acc, _MM_SHUFFLE(1, 0, 3, 2));
    const __m128i total = _mm_add_epi64(acc, swapped);

    return (uint64_t)_mm_cvtsi128_si64(total);
}

要了解如何高效地对向量做水平求和，请参阅 “Fastest way to do horizontal float vector sum on x86”。但首要原则是：把水平求和放在循环之外。

（source + asm on the Godbolt compiler explorer）

MSVC（我猜你用的就是它，否则 `*(__m128d*)foo` 早就让你遇到段错误了）生成的内层循环是：

$LL4@countNegat:
    movups   xmm0, XMMWORD PTR [rcx]
    lea      rcx, QWORD PTR [rcx+16]
    cmpltpd xmm0, xmm2
    psubq    xmm1, xmm0
    sub      rax, 1
    jne      SHORT $LL4@countNegat

循环展开（也许再配合两个向量累加器）可能会更快，但现在这样已经相当不错，在 Sandybridge / Haswell 上大概能接近每 16 字节 1.25 个时钟周期（瓶颈在于 5 个融合域 uop）。

你的版本实际上是在内层循环里把向量拆解成整数！而且如果用 MSVC -Ox 编译，它实际上会生成分支，而不是无分支的比较 + 条件加法。我很惊讶它居然没有比标量版本更慢。

此外，`(int64_t *)&rr` 违反了严格别名规则。`char*` 可以作为任何对象的别名，但把其他类型的指针强制转换后指向 SIMD 向量并指望它正常工作是不安全的；如果它能工作，那只是你运气好。编译器为这种写法和为内在函数（intrinsics）生成的代码通常差不多，使用正确的内在函数一般不会更差。

Answer 2

你是否注意到，使用 SIMD 的 ord 函数与不使用 SIMD 指令的 ord 函数并不是一一对应的？

在不使用 SIMD 的 ord 函数中，只计算了偶数索引的 OR 运算结果：

r[0] = v[0] | u[0], 
r[2] = v[2] | u[2], 
r[4] = v[4] | u[4]

那奇数索引呢？如果对所有索引都计算 OR 运算，标量版本可能会比现在花更多的时间。

SIMD程序运行缓慢

这是我的SIMD代码：

2 个答案: