Question

我正在研究用AVX矢量化的快速atan2近似值：

/*
 * vatan2 - approximate atan2 over an array of interleaved float pairs.
 *
 * out   : receives one float per pair (npair results in total).
 * in    : 2*npair floats read as consecutive pairs; the asm comments below
 *         label the first element of each pair x and the second y.
 * npair : number of pairs to process.
 *
 * Groups of 8 pairs are handled by the AVX inline-asm loop (64 bytes of input
 * consumed and 32 bytes of output produced per iteration). Whatever is left
 * over (npair - 8*iters pairs) is handled by the scalar fast_atan2(), which is
 * defined outside this block.
 *
 * NOTE(review): the asm loop tests its counter only at the bottom, so the body
 * runs once even when iters == 0 (i.e. npair < 8) -- confirm callers never do
 * that, or guard the asm statement.
 * NOTE(review): npair is signed (ssize_t) but is stored into size_t values; a
 * negative npair would wrap -- confirm it cannot occur.
 */
static inline void vatan2(float* __restrict__ out, const float* __restrict__ in, ssize_t npair) {
    // compute how many iterations to do and remainder of pairs left to do manually
    size_t iters = npair/8;
    size_t rem   = npair-iters*8;

    // constant vectors (each value replicated across all 8 lanes of a ymm register)
    static const uint32_t posnan[8]  = {  0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; // all bits except the sign bit: used as the abs() mask
    static const uint32_t negnan[8]  = {  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // all ones: XOR with it inverts a mask
    static const uint32_t signbit[8] = {  0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; // sign bit only: XOR with it negates a float
    static const float    ones[8]    = {  1,1,1,1,1,1,1,1 };
    static const float    mpi_2[8]   = {  1.57079637,    1.57079637,    1.57079637,    1.57079637,    1.57079637,    1.57079637,    1.57079637,    1.57079637   }; // pi/2
    static const float    mpi[8]     = {  3.14159274,    3.14159274,    3.14159274,    3.14159274,    3.14159274,    3.14159274,    3.14159274,    3.14159274   }; // pi
    // polynomial coefficients, applied below as a*(1 + a^2*(coefc + a^2*(coefb + a^2*coefa)))
    static const float    coefa[8]   = { -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733 };
    static const float    coefb[8]   = {  0.159314215,   0.159314215,   0.159314215,   0.159314215,   0.159314215,   0.159314215,   0.159314215,   0.159314215  };
    static const float    coefc[8]   = { -0.327622771,  -0.327622771,  -0.327622771,  -0.327622771,  -0.327622771,  -0.327622771,  -0.327622771,  -0.327622771  };

    // AT&T operand order throughout: sources first, destination last.
    __asm__(
        // load constants
        "    vxorps  %%ymm8, %%ymm8, %%ymm8\n\t" // ymm8 = 0
        "    vmovups (%[posnan]), %%ymm9\n\t"    // abs() mask
        "    vmovups (%[coefa]),  %%ymm15\n\t"
        "    vmovups (%[coefb]),  %%ymm14\n\t"
        "    vmovups (%[coefc]),  %%ymm13\n\t" 
        "    vmovups (%[ones]),   %%ymm12\n\t" 
        "    vmovups (%[mpi_2]),  %%ymm11\n\t" 
        "    vmovups (%[mpi]),    %%ymm10\n\t" 

        // setup indices, pointers
        "    mov %[in],  %%rax\n\t" // input pointer
        "    mov %[out], %%rcx\n\t" // output pointer
        "    xor %%r8d,  %%r8d\n\t" // r8 = 0

        ".p2align 4\n\t"
        "LOOP%=:\n\t"                           // %= gives each asm instance a unique label
        // load bottom part of ymm0 and ymm1
        "    vmovups     (%%rax), %%xmm0\n\t"
        "    vmovups 0x20(%%rax), %%xmm1\n\t"
        "    add     $0x01,  %%r8\n\t"  // r8  +=  1
        "    add     $0x40,  %%rax\n\t" // in  += 16
        "    add     $0x20,  %%rcx\n\t" // out +=  8 

        // load top part (negative offsets because rax was already advanced)
        "    vinsertf128 $0x1,-0x30(%%rax), %%ymm0, %%ymm0\n\t"
        "    vinsertf128 $0x1,-0x10(%%rax), %%ymm1, %%ymm1\n\t"

        // de-interleave x,y pairs into separate registers
        // (afterwards ymm2 holds the x values and ymm3 the y values, as used below)
        "    vshufps     $0x88, %%ymm1, %%ymm0, %%ymm3\n\t"
        "    vshufps     $0xdd, %%ymm1, %%ymm0, %%ymm0\n\t"
        "    vperm2f128  $0x03, %%ymm3, %%ymm3, %%ymm2\n\t"
        "    vperm2f128  $0x03, %%ymm0, %%ymm0, %%ymm1\n\t"
        "    vshufps     $0x44, %%ymm2, %%ymm3, %%ymm4\n\t"
        "    vshufps     $0xee, %%ymm2, %%ymm3, %%ymm2\n\t"
        "    vshufps     $0x44, %%ymm1, %%ymm0, %%ymm3\n\t"
        "    vshufps     $0xee, %%ymm1, %%ymm0, %%ymm1\n\t"
        "    vinsertf128 $0x01, %%xmm2, %%ymm4, %%ymm2\n\t"
        "    vinsertf128 $0x01, %%xmm1, %%ymm3, %%ymm3\n\t"

        // absolute values and zero check
        "    vandps      %%ymm9, %%ymm2, %%ymm4\n\t" // abs(x)
        "    vcmpeqps    %%ymm8, %%ymm2, %%ymm0\n\t" // x == 0?
        "    vandps      %%ymm9, %%ymm3, %%ymm6\n\t" // abs(y) 
        "    vcmpeqps    %%ymm8, %%ymm3, %%ymm1\n\t" // y == 0?

        // compute argument a to polynomial
        "    vmaxps      %%ymm4, %%ymm6, %%ymm5\n\t" // max(abs(x), abs(y))
        "    vandps      %%ymm0, %%ymm1, %%ymm1\n\t" // x == 0 && y == 0
        "    vminps      %%ymm4, %%ymm6, %%ymm0\n\t" // min(abs(x), abs(y))
        "    vcmpltps    %%ymm6, %%ymm4, %%ymm4\n\t" // abs(x) < abs(y)
        "    vrcpps      %%ymm5, %%ymm7        \n\t" // compute 1/max(abs(x), abs(y))
        "    vmulps      %%ymm5, %%ymm7, %%ymm5\n\t"  // max * r, where r is the vrcpps estimate
        "    vcmpltps    %%ymm8, %%ymm2, %%ymm2\n\t" // x < 0

        // compute polynomial
        // first refine the reciprocal estimate: r' = 2*r - max*r*r
        "    vmulps      %%ymm5, %%ymm7, %%ymm5\n\t" // max * r * r
        "    vaddps      %%ymm7, %%ymm7, %%ymm7\n\t" // 2 * r
        "    vsubps      %%ymm5, %%ymm7, %%ymm7\n\t" // r' = 2*r - max*r*r
        "    vmulps      %%ymm7, %%ymm0, %%ymm5\n\t" // a  = min * r'
        "    vmulps      %%ymm5, %%ymm5, %%ymm7\n\t" // a2 = a * a
        "    vmulps      %%ymm15,%%ymm7, %%ymm0\n\t" // coefa * a2
        "    vaddps      %%ymm14,%%ymm0, %%ymm0\n\t" // ... + coefb
        "    vmulps      %%ymm7, %%ymm0, %%ymm0\n\t" // ... * a2
        "    vaddps      %%ymm13,%%ymm0, %%ymm0\n\t" // ... + coefc
        "    vmulps      %%ymm7, %%ymm0, %%ymm0\n\t" // ... * a2

        // finish up
        "    vxorps      (%[negnan]),%%ymm1,%%ymm7\n\t" // ymm7 = !(x == 0 && y == 0)
        "    vaddps      %%ymm12,%%ymm0, %%ymm0\n\t"     // ... + 1
        "    vandps      %%ymm4, %%ymm7, %%ymm4\n\t"     // abs(x) < abs(y), excluding the both-zero lanes
        "    vandps      %%ymm2, %%ymm7, %%ymm2\n\t"     // x < 0, excluding the both-zero lanes
        "    vmulps      %%ymm5, %%ymm0, %%ymm0\n\t"     // r = a * polynomial
        "    vsubps      %%ymm0, %%ymm11,%%ymm5\n\t"     // pi/2 - r
        "    vblendvps   %%ymm4, %%ymm5, %%ymm0, %%ymm0\n\t" // use pi/2 - r where abs(x) < abs(y)
        "    vsubps      %%ymm0, %%ymm10,%%ymm5\n\t"     // pi - r
        "    vblendvps   %%ymm2, %%ymm5, %%ymm0, %%ymm0\n\t" // use pi - r where x < 0
        "    vcmpleps    %%ymm3, %%ymm8, %%ymm2\n\t"     // 0 <= y
        "    vxorps      (%[signbit]), %%ymm0, %%ymm4\n\t" // -r
        "    vcmpltps    %%ymm8, %%ymm3, %%ymm3\n\t"     // y < 0
        "    vandps      %%ymm2, %%ymm7, %%ymm2\n\t"     // 0 <= y, excluding the both-zero lanes
        "    vandps      %%ymm3, %%ymm7, %%ymm7\n\t"     // y < 0, excluding the both-zero lanes
        "    vblendvps   %%ymm1, %%ymm8, %%ymm4, %%ymm1\n\t" // both zero ? 0 : -r
        "    vblendvps   %%ymm7, %%ymm4, %%ymm1, %%ymm1\n\t" // -r where y < 0
        "    vblendvps   %%ymm2, %%ymm0, %%ymm1, %%ymm1\n\t" //  r where 0 <= y

        // store to result (negative offsets because rcx was already advanced)
        "    vmovups      %%xmm1,-0x20(%%rcx)\n\t"
        "    vextractf128 $0x1,%%ymm1,-0x10(%%rcx)\n\t"

        // are we done?
        "    cmp    %[iters],%%r8\n\t"
        "    jb     LOOP%=\n\t"
        "    vzeroupper\n\t"
        : // no output operands; results are written through memory (hence the "memory" clobber)
        // NOTE(review): "g" lets the compiler choose a register, memory or
        // immediate operand, but the template uses these operands as base
        // registers inside parentheses -- a register ("r") or memory ("m")
        // constraint looks like what is intended; confirm.
        : [posnan]  "g" (&posnan),  [negnan] "g" (&negnan), [coefa] "g" (&coefa), [coefb] "g"  (&coefb),
          [coefc]   "g" (&coefc),   [ones]   "g" (&ones),   [mpi_2] "g" (&mpi_2), [mpi]   "g"  (&mpi),
          [signbit] "g" (&signbit), [in]     "g" (in),      [out]   "g" (out),    [iters] "er" (iters)
        // MMREG is a macro defined outside this block; presumably it expands to
        // the clobber name of the given vector register -- verify.
        : MMREG(0), MMREG(1), MMREG(2),  MMREG(3),  MMREG(4),  MMREG(5),  MMREG(6),  MMREG(7),
          MMREG(8), MMREG(9), MMREG(10), MMREG(11), MMREG(12), MMREG(13), MMREG(14), MMREG(15),
          "rax", "rcx", "r8", "memory"
    );

    // finish remainder
    if (rem > 0) {
        // skip past the part already processed by the asm loop
        in  += iters*16;
        out += iters*8;

        for (size_t ii=0; ii < rem; ii++) {
            // second element of the pair is passed first, first element second
            out[ii] = fast_atan2(in[2*ii+1], in[2*ii+0]);
        }
    }
}

但是，当我编译时：

g++ -O3 -ffast-math -mavx -Wall -Wextra -I. test.cc -g3 -o test

我收到了未定义的符号错误：

./simd.h:169:30: note: loop vectorized
./simd.h:177:27: note: loop vectorized
./simd.h:177:27: note: loop versioned for vectorization because of possible aliasing
/tmp/ccgiCMgT.o: In function `vatan2(float*, float const*, long)':
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::posnan'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::coefa'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::coefb'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::coefc'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::ones'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::mpi_2'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::mpi'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::negnan'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::signbit'

有人知道这是为什么吗？

Answer 1

您使用约束的方式不正确。对于约束 "g"(foo)，gcc 可以生成 $foo，即立即数操作数 foo。于是您的操作数 (%[foo]) 就变成了 ($foo)，这是无效的语法，但汇编器却把它当作对符号 $foo 的引用，而该符号并不存在（实际的符号是 foo，不是 $foo）。要解决此问题，请将约束改为 "m"，直接生成内存操作数，并去掉模板中的括号：

"    vmovups %[posnan], %%ymm9\n\t"

然后把对应的输入约束写成：

"m"(posnan)

内联汇编导致gcc中找不到符号错误

1 个答案: