Question

我最近（在理论层面上）接触到了向量指令（Vector Instructions），并且对如何使用它们来加快我的应用程序感到很兴奋。

我想改进的一个方面是非常热的循环：

// Gathers someGlobalArray[arr[i]] into output[i] for i in [0, 16), then
// searches the gathered values for the first element equal to 1.
//
// arr             - 16 indices into someGlobalArray
// someGlobalArray - table being indexed; every arr[i] must be a valid index
// output          - receives the 16 gathered values
//
// Returns the index (0..15) of the first output element equal to 1,
// or -1 when none of the 16 gathered values is 1.
__declspec(noinline) int pleaseVectorize(int* arr, int* someGlobalArray, int* output)
{
    // Gather phase: indirect loads through the index array.
    for (int i = 0; i < 16; ++i)
    {
        const int someIndex = arr[i];
        output[i] = someGlobalArray[someIndex];
    }

    // Search phase: the first (lowest-index) match wins.
    for (int i = 0; i < 16; ++i)
    {
        if (output[i] == 1)
        {
            return i;
        }
    }

    return -1;
}

但是，当然，所有3个主要的编译器（msvc，gcc，clang）都拒绝对此向量化。我可以理解为什么，但是我想得到确认。

如果我必须手工对此进行矢量化，那就是：

（1）VectorLoad“ arr”，这将16个4字节整数带入zmm0中

（2）16次内存加载：从zmm0[0..3]指向的地址加载到zmm1[0..3]，从zmm0[4..7]指向的地址加载到zmm1[4..7]，等等

（3）比较zmm0和zmm1

（4）将向量popcnt输入到输出中以找出最高有效位，并基本上将其除以8以获得匹配的索引

首先，向量指令可以执行这些操作吗？比如，它们能否执行这种“收集”（gather）操作，即从zmm0中各元素所指向的地址进行加载？

这是c生成的内容：

0000000000400530 <_Z5superPiS_S_>:
  400530:       48 63 07                movslq (%rdi),%rax
  400533:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400536:       89 02                   mov    %eax,(%rdx)
  400538:       48 63 47 04             movslq 0x4(%rdi),%rax
  40053c:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40053f:       89 42 04                mov    %eax,0x4(%rdx)
  400542:       48 63 47 08             movslq 0x8(%rdi),%rax
  400546:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400549:       89 42 08                mov    %eax,0x8(%rdx)
  40054c:       48 63 47 0c             movslq 0xc(%rdi),%rax
  400550:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400553:       89 42 0c                mov    %eax,0xc(%rdx)
  400556:       48 63 47 10             movslq 0x10(%rdi),%rax
  40055a:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40055d:       89 42 10                mov    %eax,0x10(%rdx)
  400560:       48 63 47 14             movslq 0x14(%rdi),%rax
  400564:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400567:       89 42 14                mov    %eax,0x14(%rdx)
  40056a:       48 63 47 18             movslq 0x18(%rdi),%rax
  40056e:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400571:       89 42 18                mov    %eax,0x18(%rdx)
  400574:       48 63 47 1c             movslq 0x1c(%rdi),%rax
  400578:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40057b:       89 42 1c                mov    %eax,0x1c(%rdx)
  40057e:       48 63 47 20             movslq 0x20(%rdi),%rax
  400582:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400585:       89 42 20                mov    %eax,0x20(%rdx)
  400588:       48 63 47 24             movslq 0x24(%rdi),%rax
  40058c:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40058f:       89 42 24                mov    %eax,0x24(%rdx)
  400592:       48 63 47 28             movslq 0x28(%rdi),%rax
  400596:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400599:       89 42 28                mov    %eax,0x28(%rdx)
  40059c:       48 63 47 2c             movslq 0x2c(%rdi),%rax
  4005a0:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005a3:       89 42 2c                mov    %eax,0x2c(%rdx)
  4005a6:       48 63 47 30             movslq 0x30(%rdi),%rax
  4005aa:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005ad:       89 42 30                mov    %eax,0x30(%rdx)
  4005b0:       48 63 47 34             movslq 0x34(%rdi),%rax
  4005b4:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005b7:       89 42 34                mov    %eax,0x34(%rdx)
  4005ba:       48 63 47 38             movslq 0x38(%rdi),%rax
  4005be:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005c1:       89 42 38                mov    %eax,0x38(%rdx)
  4005c4:       48 63 47 3c             movslq 0x3c(%rdi),%rax
  4005c8:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005cb:       89 42 3c                mov    %eax,0x3c(%rdx)
  4005ce:       c3                      retq
  4005cf:       90                      nop

Answer 1

可以，x86向量指令能够执行这些操作，只不过您需要的是对比较位图做bit-scan / find-first-set-bit（x86 BSF或TZCNT），而不是人口计数（popcount，即被置位的位的个数）。

AVX2 / AVX512具有vpgatherdd，它确实使用带符号的32位缩放索引向量。在Haswell上几乎不值得使用，在Broadwell上有所改进，在Skylake上则非常出色。（参见http://agner.org/optimize/，以及the x86 tag wiki中的其他链接，例如Intel的优化手册，其中有关于收集性能的部分。）与之相比，SIMD比较和位扫描非常便宜：都是单个uop，并且完全流水线化。

只要能证明您的输入与函数参数output不重叠，gcc8.1就可以自动向量化您的收集操作。内联之后有时可以证明这一点，但是对于非内联版本，您可以使用int * __restrict output来保证。或者，也可以将output设为局部临时变量而不是函数参数。（一般规则：通过非__restrict指针进行存储通常会禁止自动矢量化，尤其是当它是可以别名任何东西的char*时。）

gcc和clang从不对搜索循环进行矢量化处理；它们只能矢量化在进入循环之前就能计算出迭代次数（trip count）的循环。但是ICC可以；它会进行标量收集并存储结果（即使output[]是局部变量，因此它并不必须把存储作为运行函数的副作用来执行），然后使用SIMD打包比较+位扫描。

Compiler output for a __restrict version。请注意，在针对Skylake-AVX512调优时，gcc8.1和ICC默认避免使用512位向量。512位向量可能会限制最大睿频，并且只要它们还在流水线中，就会关闭端口1上的向量ALU，因此将AVX512或AVX2与256位向量一起使用是有意义的，以防这个函数只是大程序的一小部分。（编译器不知道这个函数在您的程序中非常热。）

如果output[]是局部变量，则更好的代码生成策略可能是在收集的同时进行比较，这样较早的命中就可以跳过其余的加载。完全使用标量的编译器（clang和MSVC）都错过了此优化。实际上，它们甚至还会存储到局部数组，尽管clang通常不会重新读取它（而是将结果保存在寄存器中）。在源代码中把比较写进第一个循环，有助于获得更好的标量代码。（取决于收集的缓存未命中与非SIMD搜索的分支错误预测孰轻孰重，标量可能是一个不错的策略，尤其是当命中经常出现在前几个元素时。当前的收集硬件无法利用多个元素来自同一缓存行这一点，因此硬限制仍然是每个时钟周期加载2个元素。但是，如果您的数据在高速缓存中大多是热的，那么用宽向量加载来读取索引并馈入收集操作，将大大降低加载端口/高速缓存的访问压力。）

编译器可以将代码的__restrict版本自动矢量化为类似的形式。（gcc管理收集部分，ICC管理SIMD比较部分）

;; Windows x64 calling convention: rcx,rdx, r8,r9
; but of course you'd actually inline this
; only uses ZMM16..31, so vzeroupper not required

vmovdqu32  zmm16, [rcx/arr]   ; You def. want to reach an alignment boundary if you can for ZMM loads, vmovdqa32 will enforce that

kxnorw     k1, k0,k0      ; k1 = -1. k0 false dep is likely not a problem.
; optional: vpxord xmm17, xmm17, xmm17   ; break merge-masking false dep
vpgatherdd zmm17{k1}, [rdx + zmm16 * 4]    ; GlobalArray + scaled-vector-index
; sets k1 = 0 when done

vmovdqu32  [r8/output], zmm17

vpcmpd     k1, zmm17, zmm31, 0    ; 0->EQ. Outside the loop, do zmm31=set1_epi32(1)
                                  ; k1 = compare bitmap
kortestw   k1, k1
jz         .not_found      ; early check for not-found

kmovw      edx, k1

           ; tzcnt doesn't have a false dep on the output on Skylake
           ; so no AVX512 CPUs need to worry about that HSW/BDW issue
tzcnt      eax, edx        ; bit-scan for the first (lowest-address) set element
                           ; input=0 produces output=32
      ; or avoid the branch and let 32 be the not-found return value.
      ; or do a branchless kortestw / cmov if -1 is directly useful without branching
ret

.not_found:
   mov  eax, -1
   ret

您可以自己使用内在函数：

Intel的指令集参考手册（位于http://felixcloutier.com/x86/index.html的HTML摘录）包括每个指令的C / C ++固有名称，或在https://software.intel.com/sites/landingpage/IntrinsicsGuide/中搜索它们

我将output的类型更改为__m512i。如果不手动向量化调用者，则可以将其改回数组。 您肯定希望此函数内联。

#include <immintrin.h>

//__declspec(noinline)  // I *hope* this was just to see the stand-alone asm version
                        // but it means the output array can't optimize away at all

// Gathers someGlobalArray[arr[i]] for 16 int indices into *output and returns
// the position (0..15) of the first gathered element equal to 1, or -1 if
// there is none.
// arr must be 64-byte aligned: _mm512_load_si512 is an aligned load.
//static inline
int find_first_1(const int *__restrict arr, const int *__restrict someGlobalArray, __m512i *__restrict output)
{
    __m512i vindex = _mm512_load_si512(arr);
    __m512i gather = _mm512_i32gather_epi32(vindex, someGlobalArray, 4);  // indexing by 4-byte int
    *output = gather;

    __mmask16 cmp = _mm512_cmpeq_epi32_mask(gather, _mm512_set1_epi32(1));
       // Intrinsics make masks freely convert to integer
       // even though it costs a `kmov` instruction either way.
    int onepos = _tzcnt_u32(cmp);  // tzcnt of an all-zero mask yields 32
    if (onepos >= 16){
        return -1;
    }
    return onepos;
}

所有4个x86编译器都会产生与我建议的相似的asm（see it on the Godbolt compiler explorer），但是当然它们必须实际生成set1_epi32(1)向量常量，或使用（广播的）内存操作数。Clang实际上使用来自常量的{1to16}广播加载来进行比较：vpcmpeqd k0, zmm1, dword ptr [rip + .LCPI0_0]{1to16}。（当然，如果内联到一个循环中，它们将做出不同的选择。）其他编译器则使用mov eax,1 / vpbroadcastd zmm0, eax。

gcc8.1 -O3 -march=skylake-avx512有两条冗余的mov eax, -1指令：一条用于给收集操作所需的kmov提供输入，另一条用于返回值。愚蠢的编译器本应保留这个值，并为1使用不同的寄存器。

所有编译器都使用了zmm0..15，因此无法避免使用vzeroupper。（xmm16..31无法通过legacy-SSE访问，因此，如果您使用的唯一宽向量寄存器是y / zmm16..31，则the SSE/AVX transition penalty problem that vzeroupper solves不存在）。 vzeroupper可能仍具有微小的可能优势，例如在已知ymm或zmm reg的上半部分为零（Is it useful to use VZEROUPPER if your program+libraries contain no SSE instructions?）时更便宜的上下文切换。如果仍要使用它，则没有理由避免使用xmm0..15。

哦，在Windows调用约定中，xmm6..15是由被调用方保存的（call-preserved）。（不是ymm / zmm，只有低128位），因此如果您用完了xmm0..5 regs，则zmm16..31是一个不错的选择。

通过AVX指令向量化间接访问

1 个答案: