我正在研究用AVX矢量化的快速atan2近似值:
static inline void vatan2(float* __restrict__ out, const float* __restrict__ in, ssize_t npair) {
// compute how many iterations to do and remainder of pairs left to do manually
size_t iters = npair/8;
size_t rem = npair-iters*8;
// constant vectors
static const uint32_t posnan[8] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static const uint32_t negnan[8] = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };
static const uint32_t signbit[8] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
static const float ones[8] = { 1,1,1,1,1,1,1,1 };
static const float mpi_2[8] = { 1.57079637, 1.57079637, 1.57079637, 1.57079637, 1.57079637, 1.57079637, 1.57079637, 1.57079637 };
static const float mpi[8] = { 3.14159274, 3.14159274, 3.14159274, 3.14159274, 3.14159274, 3.14159274, 3.14159274, 3.14159274 };
static const float coefa[8] = { -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733, -0.0464964733 };
static const float coefb[8] = { 0.159314215, 0.159314215, 0.159314215, 0.159314215, 0.159314215, 0.159314215, 0.159314215, 0.159314215 };
static const float coefc[8] = { -0.327622771, -0.327622771, -0.327622771, -0.327622771, -0.327622771, -0.327622771, -0.327622771, -0.327622771 };
__asm__(
// load constants
" vxorps %%ymm8, %%ymm8, %%ymm8\n\t" // ymm8 = 0
" vmovups (%[posnan]), %%ymm9\n\t" // abs() mask
" vmovups (%[coefa]), %%ymm15\n\t"
" vmovups (%[coefb]), %%ymm14\n\t"
" vmovups (%[coefc]), %%ymm13\n\t"
" vmovups (%[ones]), %%ymm12\n\t"
" vmovups (%[mpi_2]), %%ymm11\n\t"
" vmovups (%[mpi]), %%ymm10\n\t"
// setup indices, pointers
" mov %[in], %%rax\n\t" // input pointer
" mov %[out], %%rcx\n\t" // output pointer
" xor %%r8d, %%r8d\n\t" // r8 = 0
".p2align 4\n\t"
"LOOP%=:\n\t"
// load bottom part of ymm0 and ymm1
" vmovups (%%rax), %%xmm0\n\t"
" vmovups 0x20(%%rax), %%xmm1\n\t"
" add $0x01, %%r8\n\t" // r8 += 1
" add $0x40, %%rax\n\t" // in += 16
" add $0x20, %%rcx\n\t" // out += 8
// load top part
" vinsertf128 $0x1,-0x30(%%rax), %%ymm0, %%ymm0\n\t"
" vinsertf128 $0x1,-0x10(%%rax), %%ymm1, %%ymm1\n\t"
// de-interleave x,y pairs into separate registers
" vshufps $0x88, %%ymm1, %%ymm0, %%ymm3\n\t"
" vshufps $0xdd, %%ymm1, %%ymm0, %%ymm0\n\t"
" vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm2\n\t"
" vperm2f128 $0x03, %%ymm0, %%ymm0, %%ymm1\n\t"
" vshufps $0x44, %%ymm2, %%ymm3, %%ymm4\n\t"
" vshufps $0xee, %%ymm2, %%ymm3, %%ymm2\n\t"
" vshufps $0x44, %%ymm1, %%ymm0, %%ymm3\n\t"
" vshufps $0xee, %%ymm1, %%ymm0, %%ymm1\n\t"
" vinsertf128 $0x01, %%xmm2, %%ymm4, %%ymm2\n\t"
" vinsertf128 $0x01, %%xmm1, %%ymm3, %%ymm3\n\t"
// absolute values and zero check
" vandps %%ymm9, %%ymm2, %%ymm4\n\t" // abs(x)
" vcmpeqps %%ymm8, %%ymm2, %%ymm0\n\t" // x == 0?
" vandps %%ymm9, %%ymm3, %%ymm6\n\t" // abs(y)
" vcmpeqps %%ymm8, %%ymm3, %%ymm1\n\t" // y == 0?
// compute argument a to polynomial
" vmaxps %%ymm4, %%ymm6, %%ymm5\n\t" // max(abs(x), abs(y))
" vandps %%ymm0, %%ymm1, %%ymm1\n\t" // x == 0 && y == 0
" vminps %%ymm4, %%ymm6, %%ymm0\n\t" // min(abs(x), abs(y))
" vcmpltps %%ymm6, %%ymm4, %%ymm4\n\t" // abs(x) < abs(y)
" vrcpps %%ymm5, %%ymm7 \n\t" // compute 1/max(abs(x), abs(y))
" vmulps %%ymm5, %%ymm7, %%ymm5\n\t"
" vcmpltps %%ymm8, %%ymm2, %%ymm2\n\t" // x < 0
// compute polynomial
" vmulps %%ymm5, %%ymm7, %%ymm5\n\t"
" vaddps %%ymm7, %%ymm7, %%ymm7\n\t"
" vsubps %%ymm5, %%ymm7, %%ymm7\n\t"
" vmulps %%ymm7, %%ymm0, %%ymm5\n\t"
" vmulps %%ymm5, %%ymm5, %%ymm7\n\t"
" vmulps %%ymm15,%%ymm7, %%ymm0\n\t"
" vaddps %%ymm14,%%ymm0, %%ymm0\n\t"
" vmulps %%ymm7, %%ymm0, %%ymm0\n\t"
" vaddps %%ymm13,%%ymm0, %%ymm0\n\t"
" vmulps %%ymm7, %%ymm0, %%ymm0\n\t"
// finish up
" vxorps (%[negnan]),%%ymm1,%%ymm7\n\t"
" vaddps %%ymm12,%%ymm0, %%ymm0\n\t"
" vandps %%ymm4, %%ymm7, %%ymm4\n\t"
" vandps %%ymm2, %%ymm7, %%ymm2\n\t"
" vmulps %%ymm5, %%ymm0, %%ymm0\n\t"
" vsubps %%ymm0, %%ymm11,%%ymm5\n\t"
" vblendvps %%ymm4, %%ymm5, %%ymm0, %%ymm0\n\t"
" vsubps %%ymm0, %%ymm10,%%ymm5\n\t"
" vblendvps %%ymm2, %%ymm5, %%ymm0, %%ymm0\n\t"
" vcmpleps %%ymm3, %%ymm8, %%ymm2\n\t"
" vxorps (%[signbit]), %%ymm0, %%ymm4\n\t"
" vcmpltps %%ymm8, %%ymm3, %%ymm3\n\t"
" vandps %%ymm2, %%ymm7, %%ymm2\n\t"
" vandps %%ymm3, %%ymm7, %%ymm7\n\t"
" vblendvps %%ymm1, %%ymm8, %%ymm4, %%ymm1\n\t"
" vblendvps %%ymm7, %%ymm4, %%ymm1, %%ymm1\n\t"
" vblendvps %%ymm2, %%ymm0, %%ymm1, %%ymm1\n\t"
// store to result
" vmovups %%xmm1,-0x20(%%rcx)\n\t"
" vextractf128 $0x1,%%ymm1,-0x10(%%rcx)\n\t"
// are we done?
" cmp %[iters],%%r8\n\t"
" jb LOOP%=\n\t"
" vzeroupper\n\t"
:
: [posnan] "g" (&posnan), [negnan] "g" (&negnan), [coefa] "g" (&coefa), [coefb] "g" (&coefb),
[coefc] "g" (&coefc), [ones] "g" (&ones), [mpi_2] "g" (&mpi_2), [mpi] "g" (&mpi),
[signbit] "g" (&signbit), [in] "g" (in), [out] "g" (out), [iters] "er" (iters)
: MMREG(0), MMREG(1), MMREG(2), MMREG(3), MMREG(4), MMREG(5), MMREG(6), MMREG(7),
MMREG(8), MMREG(9), MMREG(10), MMREG(11), MMREG(12), MMREG(13), MMREG(14), MMREG(15),
"rax", "rcx", "r8", "memory"
);
// finish remainder
if (rem > 0) {
in += iters*16;
out += iters*8;
for (size_t ii=0; ii < rem; ii++) {
out[ii] = fast_atan2(in[2*ii+1], in[2*ii+0]);
}
}
}
但是,当我编译时:
g ++ -O3 -ffast-math -mavx -Wall -Wextra -I。 test.cc -g3 -o test
我收到了未定义的符号错误:
./simd.h:169:30: note: loop vectorized
./simd.h:177:27: note: loop vectorized
./simd.h:177:27: note: loop versioned for vectorization because of possible aliasing
/tmp/ccgiCMgT.o: In function `vatan2(float*, float const*, long)':
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::posnan'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::coefa'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::coefb'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::coefc'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::ones'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::mpi_2'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::mpi'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::negnan'
tmp/fastatan/./simd.h:162: undefined reference to `$vatan2(float*, float const*, long)::signbit'
任何想法为什么?
答案 0 :(得分:3)
您正在错误地使用约束。根据约束"g"(foo)
,gcc可以生成$foo
,即立即操作数foo
。您的操作数(%[foo])
变为($foo)
,这是无效的语法,但汇编程序将其选为对不存在的符号$foo
的引用(符号为foo
,不是$foo
)。要解决此问题,请将约束更改为m
以立即生成内存操作数:
" vmovups %[posnan], %%ymm9\n\t"
然后再
"m"(posnan)