为什么从对齐的 std::array 进行的初始加载在自动矢量化后仍然是标量加载?(g++ / clang++)

时间:2016-12-24 06:39:32

标签: c++ g++ clang++ auto-vectorization

从 std::array<uint64_t, ...> 读取数据时,我不明白是什么阻止了编译器对初始加载使用向量加载指令。

我知道 gcc 可以通过 -fopt-info-vec-* 生成调试信息。但我无法从详细日志中找到任何线索,来说明为什么两个编译器都做出了同样的次优决策,即对初始加载使用标量加载。

另一方面,我不知道如何让 clang 提供有关矢量化问题的详细信息。-Rpass-analysis=loop-vectorize 只报告 init 中的循环不值得交错(interleave)。当然,我的内在函数(intrinsics)版本证明了该循环可以被矢量化,但所需的转换对编译器来说可能太复杂了。

我当然可以使用内在函数实现热路径,但这需要为每种 CPU 架构重复实现相同的逻辑。我更希望编写标准 C++ 代码,并让编译器能够完美地将其向量化。这样一来,借助 target_clones 属性,或者宏加 target 属性,就可以很容易地用不同的标志多次编译同一份代码。

如何让编译器说明为什么这些加载(load)无法被矢量化?

我怀疑 gcc 可能已经打印出了相关信息,只是我不知道应该找什么。

为什么自动矢量化会因初始加载而失败?

    /**
     * This is a test case that removes the abstraction layers from my actual
     * code. In my real code there is one extra problem: access to pack loses
     * its alignment information. But that isn't the only issue, because
     * compilers still generate suboptimal machine code when the alignment
     * information is present. I fail to understand why loads are treated
     * differently from stores to the same address when auto-vectorization
     * is used.
     *
     * I tested gcc 6.2 and clang 3.9
     * g++ -O3 -g -march=native vectest.cc -o vectest -fvect-cost-model=unlimited
     * clang++ -O3 -g -march=native vectest.cc -o vectest
     */


    #include <array>
    #include <cstdint>

    alignas(32) std::array<uint64_t, 52> pack;
    alignas(32) uint64_t board[4];

    /**
     * Fills the global `pack` so that element i holds a single set bit at
     * position (i + initial).
     *
     * @param initial  Offset added to every shift count. main() passes its
     *                 argument count minus one, so the table cannot be folded
     *                 into a compile-time constant.
     *                 i + initial must stay below 64 for every index, because
     *                 shifting a 64-bit value by 64 or more is undefined.
     */
    __attribute__((noinline))
    static void init(uint64_t initial)
    {
        /* Clang seems to prefer a large constant table and an unrolled copy,
         * which should perform worse outside a micro benchmark: L1 misses
         * and memory bandwidth are bigger bottlenecks than ALU instruction
         * execution. This function is not on a hot path, so the only thing
         * that matters here is that it works correctly.
         *
         * The interesting detail from clang is that the vectorized stores
         * are generated correctly, like:
         *
         * 4005db:       vpsllvq %ymm2,%ymm1,%ymm2
         * 4005e0:       vmovdqa %ymm2,0x200a78(%rip)        # 601060 <pack>
         * 4005e8:       vpaddq 0x390(%rip),%ymm0,%ymm2        # 400980 <_IO_stdin_used+0x60>
         * 4005f0:       vpsllvq %ymm2,%ymm1,%ymm2
         * 4005f5:       vmovdqa %ymm2,0x200a83(%rip)        # 601080 <pack+0x20>
         * 4005fd:       vpaddq 0x39b(%rip),%ymm0,%ymm2        # 4009a0 <_IO_stdin_used+0x80>
         *
         * gcc prefers a scalar loop.
         */

        for (unsigned i = 0; i < pack.size(); i++) {
            /* uint64_t{1} rather than 1UL: unsigned long is only 32 bits wide
             * on LLP64 targets, which would make shifts of 32 and above
             * undefined and truncate the result.
             */
            pack[i] = uint64_t{1} << (i + initial);
        }
    }

    #include "immintrin.h"
    /**
     * Hand-written AVX2 implementation of init(): the code the author would
     * consider the ideal result of auto-vectorizing that loop.
     *
     * Writes four 64-bit lanes per iteration into the global `pack`. Without
     * AVX2 support (__AVX2__ undefined) the function does nothing.
     *
     * @param initial  Extra shift applied to every lane; it is only there to
     *                 prevent the array from being initialized as a constant.
     */
    __attribute__((noinline))
    static void expected_init(uint64_t initial)
    {
    #ifdef __AVX2__
        /* The loop below stores whole 4-lane vectors, so a length that is not
         * a multiple of 4 would leave a tail of `pack` unwritten.
         */
        static_assert(std::tuple_size<decltype(pack)>::value % 4 == 0,
                      "pack must hold a whole number of 256-bit vectors");

        /* View the array as 256-bit vectors. A cast is used instead of writing
         * one union member and reading another, which is undefined behaviour
         * in C++. `pack` is declared alignas(32), which the aligned store
         * below relies on.
         */
        __m256i *const out = reinterpret_cast<__m256i *>(pack.data());

        /* Lanes start as bits 0..3 (lowest lane = lowest bit). */
        __m256i t = _mm256_set_epi64x(
                1LL << 3,
                1LL << 2,
                1LL << 1,
                1LL << 0
                );
        /* The intrinsic takes an int count; make the narrowing explicit. */
        t = _mm256_slli_epi64(t, static_cast<int>(initial));
        for (unsigned i = 0; i < pack.size() / 4; i++) {
            _mm256_store_si256(out + i, t);
            /* Advance every lane by four bit positions for the next vector. */
            t = _mm256_slli_epi64(t, 4);
        }
    #else
        (void)initial;
    #endif
    }

    /**
     * Iterator-based reduction: ORs every fourth element of the global `pack`
     * into one of four accumulators and writes the four results to `board`.
     */
    __attribute__((noinline))
    static void iter_or()
    {
        /* Initial load as generated by clang:
         *
         * 4006f0:       vmovaps 0x200988(%rip),%xmm0        # 601080 <pack+0x20>
         * 4006f8:       vorps  0x200960(%rip),%xmm0,%xmm0        # 601060 <pack>
         * 400700:       vmovaps 0x200988(%rip),%xmm1        # 601090 <pack+0x30>
         * 400708:       vorps  0x200960(%rip),%xmm1,%xmm1        # 601070 <pack+0x10>
         * 400710:       vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
         *
         * What was expected instead:
         *
         * 400810:       vmovaps 0x200868(%rip),%ymm0        # 601080 <pack+0x20>
         * 400818:       vorps  0x200840(%rip),%ymm0,%ymm0        # 601060 <pack>
         * 400820:       vorps  0x200878(%rip),%ymm0,%ymm0        # 6010a0 <pack+0x40>
         */

        auto cursor = pack.begin();
        const auto last = pack.end();

        /* Seed the accumulators from the first four elements, in order. */
        uint64_t n = *cursor++;
        uint64_t e = *cursor++;
        uint64_t s = *cursor++;
        uint64_t w = *cursor++;

        while (cursor != last) {
            n |= *cursor++;
            e |= *cursor++;
            s |= *cursor++;
            w |= *cursor++;
        }

        /* These four stores are correctly vectorized into a single instruction. */
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    /**
     * Index-based variant of the same reduction: ORs every fourth element of
     * the global `pack` into one of four accumulators and writes the four
     * results to `board`.
     */
    __attribute__((noinline))
    static void index_or()
    {
        /* Clang compiles this to the same code as the iterator variant. gcc
         * produces a long sequence of permutations whose purpose is hard to
         * guess.
         */
        uint64_t n = pack[0];
        uint64_t e = pack[1];
        uint64_t s = pack[2];
        uint64_t w = pack[3];

        /* Walk the remaining elements in groups of four. */
        for (unsigned base = 4; base < pack.size(); base += 4) {
            n |= pack[base + 0];
            e |= pack[base + 1];
            s |= pack[base + 2];
            w |= pack[base + 3];
        }

        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    #include "immintrin.h"

    /**
     * Hand-written AVX2 reduction: the code the author expects
     * auto-vectorization to produce from iter_or() / index_or().
     *
     * ORs all 256-bit vectors of the global `pack` together and stores the
     * result in `board`. Without AVX2 support (__AVX2__ undefined) the
     * function does nothing.
     */
    __attribute__((noinline))
    static void expected_result()
    {
    #ifdef __AVX2__
        /* The loop below reads whole 4-lane vectors, so a length that is not
         * a multiple of 4 would leave a tail of `pack` out of the result.
         */
        static_assert(std::tuple_size<decltype(pack)>::value % 4 == 0,
                      "pack must hold a whole number of 256-bit vectors");

        /* View the arrays as 256-bit vectors. A cast is used instead of
         * writing one union member and reading another, which is undefined
         * behaviour in C++. Both `pack` and `board` are declared alignas(32),
         * which the aligned load/store intrinsics rely on.
         */
        const __m256i *const in = reinterpret_cast<const __m256i *>(pack.data());

        __m256i res = _mm256_load_si256(in);
        for (unsigned i = 1; i < pack.size() / 4; i++) {
            res = _mm256_or_si256(res, _mm256_load_si256(in + i));
        }
        _mm256_store_si256(reinterpret_cast<__m256i *>(board), res);
    #endif
    }

    /**
     * Runs both initializers and then all three reduction variants once.
     * The argument count minus one is used as the shift offset so that it is
     * not a compile-time constant.
     */
    int main(int argc, char **argv)
    {
        (void)argv;

        const uint64_t offset = static_cast<uint64_t>(argc - 1);
        expected_init(offset);
        init(offset);

        iter_or();
        index_or();
        expected_result();
        return 0;
    }

1 个答案:

答案 0 :(得分:1)

似乎 gcc 和 clang 都无法向量化位于循环外部的初始加载。如果把代码改成先将临时变量清零,然后从第一个元素开始做按位或运算,两个编译器都会做得更好。Clang 会生成质量不错的展开向量代码(瓶颈只在于仅使用了单个 ymm 寄存器,每条指令都依赖于前一条指令)。GCC 生成的代码更差:多了一条初始的 vpxor,并且循环每次迭代只执行一条 vpor,效率很低。

我还测试了一些替代实现,其中在微基准测试中表现最好的,是通过交替使用两个寄存器改进的展开代码。

/* only reduce (calling this function from a for loop):
 * ST 7.3 cycles (ST=single thread)
 * SMT 15.3 cycles (SMT=simultaneous multi threading aka hyper threading)
 * shuffle+reduce (calling Fisher-Yates shuffle and then this function):
 * ST 222 cycles
 * SMT 383 cycles 
 */
    "vmovaps 0x00(%0), %%ymm0\n"
    "vmovaps 0x20(%0), %%ymm1\n"
    "vpor 0x40(%0), %%ymm0, %%ymm0\n"
    "vpor 0x60(%0), %%ymm1, %%ymm1\n"
    "vpor 0x80(%0), %%ymm0, %%ymm0\n"
    "vpor 0xA0(%0), %%ymm1, %%ymm1\n"
    "vpor 0xC0(%0), %%ymm0, %%ymm0\n"
    "vpor 0xE0(%0), %%ymm1, %%ymm1\n"
    "vpor 0x100(%0), %%ymm0, %%ymm0\n"
    "vpor 0x120(%0), %%ymm1, %%ymm1\n"
    "vpor 0x140(%0), %%ymm0, %%ymm0\n"
    "vpor 0x160(%0), %%ymm1, %%ymm1\n"
    "vpor 0x180(%0), %%ymm0, %%ymm0\n"

    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%1)\n"

Clang 展开的循环的耗时如下:
/* only reduce:
 * ST 9.8 cycles
 * SMT 21.8 cycles
 * shuffle+reduce:
 * ST 223 cycles
 * SMT 385 cycles
 */

但是,SMT 使展开代码性能下降的那些数字看起来很可疑。我决定尝试把 GCC 的循环写得更好一些,但它仍然明显慢于展开版本。后来我决定使用两个寄存器并将循环展开一次,以打破指令之间的依赖。这样得到的 shuffle + reduce 代码比完全展开的版本还要稍快一些。

/* Element index of the first vector consumed through [cnt]; the two vectors
 * above it are loaded with fixed offsets before the loop. The offsets 0x180
 * and 0x160 presumably match the 52-element pack from the question - confirm
 * if the array size changes.
 */
size_t end = pack.size() - 3*4;
/* volatile: the only output operand is the loop counter, so without it the
 * compiler may delete the whole statement when `end` is not read afterwards.
 */
asm volatile (
/* The best SMT option outside micro optimization.
 * This allows executing two vpor instructions same time and
 * reduces loop count to half with single unroll
 *
 * only reduce:
 * ST 13.0 cycles
 * SMT 20.0 cycles
 * shuffle+reduce:
 * ST 221 cycles
 * SMT 380 cycles
 */
    "vmovaps 0x180(%[pack]), %%ymm0\n"
    "vmovaps 0x160(%[pack]), %%ymm1\n"
    "vpor 0x00(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "1:\n"
    /* Each iteration ORs the two vectors just below the current index, one
     * into each accumulator, then steps the index down by 8 elements.
     */
    "vpor -0x20(%[pack],%[cnt],8), %%ymm1, %%ymm1\n"
    "vpor -0x40(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "sub $8, %[cnt]\n"
    "jne 1b\n"

    /* Combine the two accumulators and write the result to [out]. */
    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%[out])\n"
    : [cnt]"+r"(end)
    : [pack]"r"(begin), [out]"r"(hands_)
    /* ymm0/ymm1 are overwritten, `sub` changes the flags, and memory is read
     * and written through pointer operands the compiler cannot see into.
     */
    : "ymm0", "ymm1", "memory", "cc");

但是,在 Fisher-Yates 洗牌之后再运行这些代码时,差异非常小。即使是在仅 reduce 的基准测试中明显更慢的 gcc 版本(16.4 / 38.8),在 shuffle + reduce 测试中的速度也几乎相同(228 / 387)。