int eq3(int a, int b, int c, int d, int e, int f){
    return a == d || a == e || a == f
        || b == d || b == e || b == f
        || c == d || c == e || c == f;
}
This function takes 6 ints and returns true if any of the first 3 ints is equal to any of the last 3 ints. Is there any similar way to make it faster?
Answer 0 (score: 7)
Assuming you expect a high rate of false results, you could make a quick "pre-check" to isolate such cases quickly: if a bit is set in a that isn't set in any of d, e and f, then a can't be equal to any of them.
Thus something like
int pre_eq3(int a, int b, int c, int d, int e, int f){
    int const mask = ~(d | e | f);
    if ((a & mask) && (b & mask) && (c & mask)) {
        return false;
    }
    return eq3(a, b, c, d, e, f);
}
could speed it up (8 operations instead of 17, but much more costly if the result is actually going to be true). If mask == 0 then of course this doesn't help.
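As a concrete illustration of the pre-check (the values here are chosen arbitrarily for this sketch):

#include <assert.h>

int main(void) {
    int d = 0x1, e = 0x2, f = 0x4;
    int mask = ~(d | e | f);   // all bits except the low three
    int a = 0x8;               // bit 3 is set in a but in none of d, e, f
    assert((a & mask) != 0);   // so a cannot possibly equal d, e or f
    return 0;
}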
It can be improved further if, with high probability, a & b & c has some bits set:
int pre_eq3(int a, int b, int c, int d, int e, int f){
    int const mask = ~(d | e | f);
    if ((a & b & c) & mask) {
        return false;
    }
    if ((a & mask) && (b & mask) && (c & mask)) {
        return false;
    }
    return eq3(a, b, c, d, e, f);
}
Now we are also very fast in the case where all of a, b and c have some bit set where none of d, e and f has any bit set.
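A quick sanity-check sketch for the two fast paths (assuming eq3 and the second pre_eq3 as defined above; the test values are made up):

#include <assert.h>

int main(void) {
    // fast path 1: each of a,b,c has a bit that none of d,e,f has
    assert(pre_eq3(8, 16, 32, 1, 2, 4) == 0);
    // fast path 2: a & b & c shares a bit that none of d,e,f has
    assert(pre_eq3(9, 9, 9, 2, 4, 6) == 0);
    // slow path: falls through to eq3, which finds c == d
    assert(pre_eq3(1, 2, 3, 3, 4, 5) != 0);
    return 0;
}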
Answer 1 (score: 3)
If you want a bitwise version, look at xor. If you xor two identical numbers, the answer will be 0. Otherwise, the bits will flip wherever one is set and the other is not. For example, 1000 xor 0100 is 1100.
The code you have will likely cause at least one pipeline flush, but apart from that it will be fine performance-wise.
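For example, a xor-based variant might look like this (a sketch only; the name eq3_xor is made up, and since (a ^ d) is zero exactly when a == d, a compiler will typically generate similar code for == anyway):

// !(x ^ y) is 1 when x == y, 0 otherwise; OR the nine tests together
int eq3_xor(int a, int b, int c, int d, int e, int f){
    return !(a ^ d) | !(a ^ e) | !(a ^ f)
         | !(b ^ d) | !(b ^ e) | !(b ^ f)
         | !(c ^ d) | !(c ^ e) | !(c ^ f);
}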
Answer 2 (score: 3)
Expanding on dawg's SSE-compare method, you can combine the comparison results with a vector OR, and move a mask of the compare results back into an integer to test for 0 / non-zero.
Also, you can get the data into vectors more efficiently (although it's still pretty clunky to get many separate integers into vectors when they're live in registers to start with, rather than sitting in memory).
You should avoid the store-forwarding stalls that would result from doing three small stores and one big load.
///// UNTESTED ////////
#include <immintrin.h>
int eq3(int a, int b, int c, int d, int e, int f){
    // Use _mm_set to let the compiler worry about getting integers into vectors
    // Use -mtune=intel or gcc will make bad code, though :(
    __m128i abcc = _mm_set_epi32(0,c,b,a);  // args go from high to low position in the vector
    // masking off the high bits of the result-mask to avoid false positives
    // is cheaper than repeating c (to do the same compare twice)

    __m128i dddd = _mm_set1_epi32(d);
    __m128i eeee = _mm_set1_epi32(e);

    dddd = _mm_cmpeq_epi32(dddd, abcc);
    eeee = _mm_cmpeq_epi32(eeee, abcc);  // per element: 0(unequal) or -1(equal)
    __m128i combined = _mm_or_si128(dddd, eeee);

    __m128i ffff = _mm_set1_epi32(f);
    ffff = _mm_cmpeq_epi32(ffff, abcc);
    combined = _mm_or_si128(combined, ffff);

    // results of all the compares are ORed together.  All zero only if there were no hits
    unsigned equal_mask = _mm_movemask_epi8(combined);
    equal_mask &= 0x0fff;  // the high 32b element could have false positives
    return equal_mask;
    // return !!equal_mask if you want to force it to 0 or 1
    // the mask tells you whether it was a, b, or c that had a hit

    // movmskps would return a mask of just 4 bits, one for each 32b element, but might have a bypass delay on Nehalem.
    // actually, pmovmskb apparently runs in the float domain on Nehalem anyway, according to Agner Fog's table >.<
}
This compiles to really nice asm, pretty similar between clang and gcc, but clang's -fverbose-asm puts nice comments on the shuffles. Only 19 instructions including the ret, with a fair amount of parallelism from separate dependency chains. With -msse4.1, or with -mavx, it saves another couple of instructions (but probably doesn't run any faster).
With clang, dawg's version is about twice the size. With gcc, something bad happens and it's horrible (over 80 instructions; it looks like a gcc optimization bug, since it's worse than a straightforward translation of the source). Even clang's version spends so long getting data into / out of vector regs that it might be faster just to do the comparisons branchlessly and OR the truth values together.
That compiles to decent code:
// 8bit variable doesn't help gcc avoid partial-register stalls even with -mtune=core2 :/
int eq3_scalar(int a, int b, int c, int d, int e, int f){
    char retval = (a == d) | (a == e) | (a == f)
                | (b == d) | (b == e) | (b == f)
                | (c == d) | (c == e) | (c == f);
    return retval;
}
Getting the data from the caller into vector registers:
If the groups of three come from memory, then probably passing pointers, so a vector load can fetch them from their original location, is best. Going through integer registers on the way to vectors sucks (higher latency, more uops), but if your data is already live in regs, it's a loss to do integer stores and then vector loads. gcc is dumb and follows the AMD optimization guide's recommendation to bounce through memory, even though Agner Fog says he found that it's not worth it even on AMD CPUs. It's definitely worse on Intel, and apparently a wash or worse on AMD, so it's definitely the wrong choice for -mtune=generic. Anyway...
The 9th compare can be done with an integer compare, with its truth value ORed into the vector result. On some CPUs (especially AMD, and maybe Intel Haswell and later), not transferring one of the 6 integers to vector regs at all might be a win. Mixing three integer branchless compares in among the vector shuffles / compares would interleave them nicely.
Those vector compares can be set up by using shufps on integer data (since it can combine data from two source registers). That's fine on most CPUs, but requires a lot of annoying casting when using intrinsics instead of actual asm. Even if there is a bypass delay, it's not a bad trade-off vs. something like punpckldq and then pshufd.
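For instance, a minimal sketch of that casting (the helper name shufps_epi32 is made up for illustration):

#include <immintrin.h>

// shufps on integer vectors: cast to __m128, shuffle, cast back.
// The casts cost no instructions; they just satisfy the intrinsics' type system.
static inline __m128i shufps_epi32(__m128i a, __m128i b) {
    __m128 res = _mm_shuffle_ps(_mm_castsi128_ps(a),
                                _mm_castsi128_ps(b),
                                _MM_SHUFFLE(1, 0, 1, 0)); // low two dwords from each source
    return _mm_castps_si128(res);
}

The compare layout is: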
aabb    ccab
====    ====
dede    deff

c==f
with asm something like:
#### untested
# pretend a is in eax, and so on
movd     xmm0, eax
movd     xmm1, ebx
movd     xmm2, ecx

shl      rdx, 32
#mov     edi, edi        # zero the upper 32 of rdi if needed, or use shld instead of OR if you don't care about AMD CPUs
or       rdx, rdi        # de in an integer register.
movq     xmm3, rdx       # de, aka (d<<32)|e

# in 32bit code, use a vector shuffle of some sort to do this in a vector reg, or:
#pinsrd  xmm3, edi, 1    # SSE4.1, and 2 uops (same as movd+shuffle)
#movd    xmm4, edi       # e

movd     xmm5, esi       # f

shufps   xmm0, xmm1, 0            # xmm0=aabb  (low dword = a; my notation is backwards from left/right vector-shift perspective)
shufps   xmm5, xmm3, 0b01000000   # xmm5 = ffde
punpcklqdq xmm3, xmm3             # broadcast: xmm3=dede
pcmpeqd  xmm3, xmm0               # xmm3: aabb == dede

# spread these instructions out between vector instructions, if you aren't branching
xor      edx,edx
cmp      esi, ecx        # c == f
#je .found_match         # if there's one of the 9 that's true more often, make it this one.  Branch mispredicts suck, though
sete     dl

shufps   xmm0, xmm2, 0b00001000   # xmm0 = abcc
pcmpeqd  xmm0, xmm5               # abcc == ffde

por      xmm0, xmm3
pmovmskb eax, xmm0                # will have bits set if cmpeq found any equal elements
or       eax, edx                 # combine vector and scalar compares
jnz .found_match
# or record the result instead of branching on it
setnz    dl
This is also 19 instructions (not counting the final jcc / setcc), but one of them is an xor-zeroing idiom, and the others are simple integer instructions. (Shorter encoding, and some can run on port6 on Haswell+, which can't handle vector instructions.) There might be a longer dep chain because of the chain of shuffles building abcc.
Answer 3 (score: 2)
I think using SSE is probably worth investigating.
It's been 20 years since I wrote any, and this is not benchmarked, but something like:
#include <emmintrin.h>  // SSE2, for __m128i and _mm_cmpeq_epi32 (xmmintrin.h is SSE1-only)
#include <stdint.h>

int cmp3(int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f){
    // returns -1 if any of a,b,c is eq to any of d,e,f
    // returns 0 if all a,b,c != d,e,f
    int32_t __attribute__ ((aligned(16))) vec1[4];
    int32_t __attribute__ ((aligned(16))) vec2[4];
    int32_t __attribute__ ((aligned(16))) vec3[4];
    int32_t __attribute__ ((aligned(16))) vec4[4];
    int32_t __attribute__ ((aligned(16))) r1[4];
    int32_t __attribute__ ((aligned(16))) r2[4];
    int32_t __attribute__ ((aligned(16))) r3[4];

    // fourth word is DNK
    vec1[0]=a;
    vec1[1]=b;
    vec1[2]=c;

    vec2[0]=vec2[1]=vec2[2]=d;
    vec3[0]=vec3[1]=vec3[2]=e;
    vec4[0]=vec4[1]=vec4[2]=f;

    __m128i v1 = _mm_load_si128((__m128i *)vec1);
    __m128i v2 = _mm_load_si128((__m128i *)vec2);
    __m128i v3 = _mm_load_si128((__m128i *)vec3);
    __m128i v4 = _mm_load_si128((__m128i *)vec4);

    // any(a,b,c) == d?
    __m128i vcmp1 = _mm_cmpeq_epi32(v1, v2);
    // any(a,b,c) == e?
    __m128i vcmp2 = _mm_cmpeq_epi32(v1, v3);
    // any(a,b,c) == f?
    __m128i vcmp3 = _mm_cmpeq_epi32(v1, v4);

    _mm_store_si128((__m128i *)r1, vcmp1);
    _mm_store_si128((__m128i *)r2, vcmp2);
    _mm_store_si128((__m128i *)r3, vcmp3);

    // bit or the first three of each result.
    // might be better with SSE mask, but I don't remember how!
    return r1[0] | r1[1] | r1[2] |
           r2[0] | r2[1] | r2[2] |
           r3[0] | r3[1] | r3[2];
}
If done correctly, SSE with no branches should be 4 to 8 times faster.
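Regarding the "SSE mask" comment in the code above: a sketch of that alternative (along the lines of answer 2) would OR the compare results together in vector registers and use _mm_movemask_epi8 instead of the three stores and the scalar ORs:

#include <emmintrin.h>

// nonzero iff any of the first three lanes of any compare result is -1;
// the & 0x0fff discards the byte-mask bits of the unused fourth lane
static int any_equal(__m128i vcmp1, __m128i vcmp2, __m128i vcmp3) {
    __m128i combined = _mm_or_si128(_mm_or_si128(vcmp1, vcmp2), vcmp3);
    return _mm_movemask_epi8(combined) & 0x0fff;
}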
Answer 4 (score: 1)
If your compiler/architecture supports vector extensions (like clang and gcc do), you can use something like:
#ifdef __SSE2__
    #include <immintrin.h>
#elif defined __ARM_NEON
    #include <arm_neon.h>
#elif defined __ALTIVEC__
    #include <altivec.h>
//#elif ... TODO more architectures
#endif

static int hastrue128(void *x){
#ifdef __SSE2__
    return _mm_movemask_epi8(*(__m128i*)x);
#elif defined __ARM_NEON
    return vaddlvq_u8(*(uint8x16_t*)x);
#elif defined __ALTIVEC__
    typedef __UINT32_TYPE__ v4si __attribute__ ((__vector_size__ (16), aligned(4), __may_alias__));
    return vec_any_ne(*(v4si*)x,(v4si){0});
#else
    int *y = x;
    return y[0]|y[1]|y[2]|y[3];
#endif
}
//if inputs will always be aligned to 16 add an aligned attribute
//otherwise ensure they are at least aligned to 4
int cmp3( int* a , int* b ){
    typedef __INT32_TYPE__ i32x4 __attribute__ ((__vector_size__ (16), aligned(4), __may_alias__));
    i32x4 x = *(i32x4*)a, cmp, tmp, y0 = y0^y0, y1 = y0, y2 = y0;
    //start vectors off at 0 and add the int to each element for optimization
    //it adds the int to each element, but since we started it at zero,
    //a good compiler (not ICC at -O3) will skip the xor and add and just broadcast/whatever
    y0 += b[0];
    y1 += b[1];
    y2 += b[2];
    cmp = x == y0;
    tmp = x == y1; //ppc complains if we don't use temps here
    cmp |= tmp;
    tmp = x == y2;
    cmp |= tmp;
    //now hack off the end since we only need 3
    cmp &= (i32x4){0xffffffff,0xffffffff,0xffffffff,0};
    return hastrue128(&cmp);
}
int cmp4( int* a , int* b ){
    typedef __INT32_TYPE__ i32x4 __attribute__ ((__vector_size__ (16), aligned(4), __may_alias__));
    i32x4 x = *(i32x4*)a, cmp, tmp, y0 = y0^y0, y1 = y0, y2 = y0, y3 = y0;
    y0 += b[0];
    y1 += b[1];
    y2 += b[2];
    y3 += b[3];
    cmp = x == y0;
    tmp = x == y1; //ppc complains if we don't use temps here
    cmp |= tmp;
    tmp = x == y2;
    cmp |= tmp;
    tmp = x == y3;
    cmp |= tmp;
    return hastrue128(&cmp);
}
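A minimal usage sketch for cmp3 (the values are made up; the first array is padded to 4 ints because 16 bytes are loaded from it, with the 4th lane masked off):

#include <stdio.h>

int main(void) {
    int abc[4] = { 1, 2, 3, 0 };
    int def[4] = { 9, 8, 2, 0 };
    // 2 appears in both groups, so this prints "match"
    puts(cmp3(abc, def) ? "match" : "no match");
    return 0;
}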
On arm64, the cmp3/cmp4 functions above compile to the following branchless code:
cmp3:
ldr q2, [x0]
adrp x2, .LC0
ld1r {v1.4s}, [x1]
ldp w0, w1, [x1, 4]
dup v0.4s, w0
cmeq v1.4s, v2.4s, v1.4s
dup v3.4s, w1
ldr q4, [x2, #:lo12:.LC0]
cmeq v0.4s, v2.4s, v0.4s
cmeq v2.4s, v2.4s, v3.4s
orr v0.16b, v1.16b, v0.16b
orr v0.16b, v0.16b, v2.16b
and v0.16b, v0.16b, v4.16b
uaddlv h0,v0.16b
umov w0, v0.h[0]
uxth w0, w0
ret
cmp4:
ldr q2, [x0]
ldp w2, w0, [x1, 4]
dup v0.4s, w2
ld1r {v1.4s}, [x1]
dup v3.4s, w0
ldr w1, [x1, 12]
dup v4.4s, w1
cmeq v1.4s, v2.4s, v1.4s
cmeq v0.4s, v2.4s, v0.4s
cmeq v3.4s, v2.4s, v3.4s
cmeq v2.4s, v2.4s, v4.4s
orr v0.16b, v1.16b, v0.16b
orr v0.16b, v0.16b, v3.16b
orr v0.16b, v0.16b, v2.16b
uaddlv h0,v0.16b
umov w0, v0.h[0]
uxth w0, w0
ret
With ICC on x86_64 at -march=skylake, it produces the following branchless code:
cmp3:
vmovdqu xmm2, XMMWORD PTR [rdi] #27.24
vpbroadcastd xmm0, DWORD PTR [rsi] #34.17
vpbroadcastd xmm1, DWORD PTR [4+rsi] #35.17
vpcmpeqd xmm5, xmm2, xmm0 #34.17
vpbroadcastd xmm3, DWORD PTR [8+rsi] #37.16
vpcmpeqd xmm4, xmm2, xmm1 #35.17
vpcmpeqd xmm6, xmm2, xmm3 #37.16
vpor xmm7, xmm4, xmm5 #36.5
vpor xmm8, xmm6, xmm7 #38.5
vpand xmm9, xmm8, XMMWORD PTR __$U0.0.0.2[rip] #40.5
vpmovmskb eax, xmm9 #11.12
ret #41.12
cmp4:
vmovdqu xmm3, XMMWORD PTR [rdi] #46.24
vpbroadcastd xmm0, DWORD PTR [rsi] #51.17
vpbroadcastd xmm1, DWORD PTR [4+rsi] #52.17
vpcmpeqd xmm6, xmm3, xmm0 #51.17
vpbroadcastd xmm2, DWORD PTR [8+rsi] #54.16
vpcmpeqd xmm5, xmm3, xmm1 #52.17
vpbroadcastd xmm4, DWORD PTR [12+rsi] #56.16
vpcmpeqd xmm7, xmm3, xmm2 #54.16
vpor xmm8, xmm5, xmm6 #53.5
vpcmpeqd xmm9, xmm3, xmm4 #56.16
vpor xmm10, xmm7, xmm8 #55.5
vpor xmm11, xmm9, xmm10 #57.5
vpmovmskb eax, xmm11 #11.12
ret
It even works on ppc64 with altivec, though it's definitely suboptimal there:
cmp3:
lwa 10,4(4)
lxvd2x 33,0,3
vspltisw 11,-1
lwa 9,8(4)
vspltisw 12,0
xxpermdi 33,33,33,2
lwa 8,0(4)
stw 10,-32(1)
addi 10,1,-80
stw 9,-16(1)
li 9,32
stw 8,-48(1)
lvewx 0,10,9
li 9,48
xxspltw 32,32,3
lvewx 13,10,9
li 9,64
vcmpequw 0,1,0
lvewx 10,10,9
xxsel 32,44,43,32
xxspltw 42,42,3
xxspltw 45,45,3
vcmpequw 13,1,13
vcmpequw 1,1,10
xxsel 45,44,43,45
xxsel 33,44,43,33
xxlor 32,32,45
xxlor 32,32,33
vsldoi 1,12,11,12
xxland 32,32,33
vcmpequw. 0,0,12
mfcr 3,2
rlwinm 3,3,25,1
cntlzw 3,3
srwi 3,3,5
blr
cmp4:
lwa 10,8(4)
lxvd2x 33,0,3
vspltisw 10,-1
lwa 9,12(4)
vspltisw 11,0
xxpermdi 33,33,33,2
lwa 7,0(4)
lwa 8,4(4)
stw 10,-32(1)
addi 10,1,-96
stw 9,-16(1)
li 9,32
stw 7,-64(1)
stw 8,-48(1)
lvewx 0,10,9
li 9,48
xxspltw 32,32,3
lvewx 13,10,9
li 9,64
xxspltw 45,45,3
vcmpequw 13,1,13
xxsel 44,43,42,45
lvewx 13,10,9
li 9,80
vcmpequw 0,1,0
xxspltw 45,45,3
xxsel 32,43,42,32
vcmpequw 13,1,13
xxlor 32,32,44
xxsel 45,43,42,45
lvewx 12,10,9
xxlor 32,32,45
xxspltw 44,44,3
vcmpequw 1,1,12
xxsel 33,43,42,33
xxlor 32,32,33
vcmpequw. 0,0,11
mfcr 3,2
rlwinm 3,3,25,1
cntlzw 3,3
srwi 3,3,5
blr
As you can see from the generated asm, there is still some room for improvement, but it will compile on risc-v, mips, ppc, and other architecture + compiler combinations that support vector extensions.