我正在排查 GCC 与 ICC 之间的一些数值差异,期间发现了一个令人困惑的现象。编译下面这个函数(计算自然对数的近似值)时:
// Convenience macros for building constant vectors with the same value in
// every 32-bit lane.
//
// Each macro expands to a single parenthesized expression with no trailing
// semicolon, so it can be used both as an initializer (the call site supplies
// the `;`) and inside a larger expression.
//
// MM256_ICONST broadcasts a 32-bit integer bit pattern and reinterprets the
// result as packed floats; the cast intrinsic only changes the type, it does
// not convert the value.
#define MM256_ICONST(val) (_mm256_castsi256_ps(_mm256_set1_epi32(val)))
// MM256_FCONST broadcasts a single-precision value to all lanes.
#define MM256_FCONST(val) (_mm256_set1_ps(val))
// Approximate natural logarithm of each of the eight floats in xx.
//
// Every lane is split into its raw exponent field and a mantissa rebuilt as a
// float in [1, 2). A degree-5 polynomial in that mantissa is evaluated and
// added to exponent * ln(2). Lanes where xx <= 0 are returned with every bit
// set (a NaN bit pattern).
__m256 _mm256_logf_app(__m256 xx) {
    // bit patterns used to take the float apart
    const __m256 frac_bits = MM256_ICONST(0x007FFFFF); // low 23 bits: mantissa field
    const __m256 one_bits = MM256_ICONST(0x3F800000);  // bit pattern of 1.0f (exponent field = 127)
    const __m256 ln2 = MM256_FCONST(M_LN2);            // log(2)

    // polynomial coefficients: c0 + c1*m + c2*m^2 + c3*m^3 + c4*m^4 + c5*m^5
    // NOTE(review): the exponent field is used without removing its bias, so
    // c0 appears to absorb the -127*ln(2) correction -- confirm against the fit.
    const __m256 c0 = MM256_FCONST(-89.970756366f);
    const __m256 c1 = MM256_FCONST(+3.529304993f);
    const __m256 c2 = MM256_FCONST(-2.461222105f);
    const __m256 c3 = MM256_FCONST(+1.130626167f);
    const __m256 c4 = MM256_FCONST(-0.288739945f);
    const __m256 c5 = MM256_FCONST(+3.110401639e-2f);

    // all-ones in every lane whose input is <= 0 (ordered, non-signaling compare)
    const __m256 nonpositive = _mm256_cmp_ps(xx, _mm256_setzero_ps(), _CMP_LE_OQ);

    // shift each 32-bit lane right by 23 and convert the integer result to float
    // (avx2_mm256_srli_epi32 is defined elsewhere; presumably a per-lane logical
    // right shift -- verify against its definition)
    const __m256i shifted = avx2_mm256_srli_epi32(_mm256_castps_si256(xx), 23);
    const __m256 exponent = _mm256_cvtepi32_ps(shifted);

    // keep only the mantissa bits, then set the exponent field to 127
    const __m256 frac = _mm256_and_ps(xx, frac_bits);
    const __m256 m = _mm256_or_ps(frac, one_bits);

    // Horner evaluation, highest-order coefficient first
    __m256 poly = c5;
    poly = _mm256_mul_ps(poly, m);
    poly = _mm256_add_ps(c4, poly);
    poly = _mm256_mul_ps(poly, m);
    poly = _mm256_add_ps(c3, poly);
    poly = _mm256_mul_ps(poly, m);
    poly = _mm256_add_ps(c2, poly);
    poly = _mm256_mul_ps(poly, m);
    poly = _mm256_add_ps(c1, poly);
    poly = _mm256_mul_ps(poly, m);
    poly = _mm256_add_ps(c0, poly);

    // add the exponent's contribution
    const __m256 exp_term = _mm256_mul_ps(exponent, ln2);
    const __m256 result = _mm256_add_ps(poly, exp_term);

    // force every bit on in the non-positive lanes
    return _mm256_or_ps(result, nonpositive);
}
GCC 生成了如下汇编代码:
0x00000000006f0f10 <+0>: lea 0x8(%rsp),%r10
0x00000000006f0f15 <+5>: and $0xffffffffffffffe0,%rsp
0x00000000006f0f19 <+9>: pushq -0x8(%r10)
0x00000000006f0f1d <+13>: push %rbp
0x00000000006f0f1e <+14>: mov %rsp,%rbp
0x00000000006f0f21 <+17>: push %r10
0x00000000006f0f23 <+19>: sub $0x68,%rsp
0x00000000006f0f27 <+23>: vmovaps %ymm0,-0x70(%rbp)
0x00000000006f0f2c <+28>: vxorps %xmm0,%xmm0,%xmm0
0x00000000006f0f30 <+32>: vmovaps -0x70(%rbp),%ymm4
0x00000000006f0f35 <+37>: vmovdqa -0x70(%rbp),%xmm1
0x00000000006f0f3a <+42>: vandps 0x3dade(%rip),%ymm4,%ymm8
0x00000000006f0f42 <+50>: vcmple_oqps %ymm0,%ymm4,%ymm3
0x00000000006f0f47 <+55>: vmovdqa -0x60(%rbp),%xmm5
0x00000000006f0f4c <+60>: vpsrld $0x17,%xmm1,%xmm2
0x00000000006f0f51 <+65>: mov %fs:0x28,%rax
0x00000000006f0f5a <+74>: mov %rax,-0x18(%rbp)
0x00000000006f0f5e <+78>: xor %eax,%eax
0x00000000006f0f60 <+80>: vpsrld $0x17,%xmm5,%xmm6
0x00000000006f0f65 <+85>: vmovaps %xmm2,-0x50(%rbp)
0x00000000006f0f6a <+90>: mov -0x18(%rbp),%rax
0x00000000006f0f6e <+94>: xor %fs:0x28,%rax
0x00000000006f0f77 <+103>: vmovaps %xmm6,-0x40(%rbp)
0x00000000006f0f7c <+108>: vcvtdq2ps -0x50(%rbp),%ymm7
0x00000000006f0f81 <+113>: vorps 0x3dab7(%rip),%ymm8,%ymm9
0x00000000006f0f89 <+121>: vmulps 0x3db6f(%rip),%ymm7,%ymm1
0x00000000006f0f91 <+129>: vmulps 0x3dac7(%rip),%ymm9,%ymm10
0x00000000006f0f99 <+137>: vaddps 0x3db7f(%rip),%ymm1,%ymm5
0x00000000006f0fa1 <+145>: vaddps 0x3dad7(%rip),%ymm10,%ymm11
0x00000000006f0fa9 <+153>: vmulps %ymm9,%ymm11,%ymm12
0x00000000006f0fae <+158>: vaddps 0x3daea(%rip),%ymm12,%ymm13
0x00000000006f0fb6 <+166>: vmulps %ymm9,%ymm13,%ymm14
0x00000000006f0fbb <+171>: vaddps 0x3dafd(%rip),%ymm14,%ymm15
0x00000000006f0fc3 <+179>: vmulps %ymm9,%ymm15,%ymm0
0x00000000006f0fc8 <+184>: vaddps 0x3db10(%rip),%ymm0,%ymm4
0x00000000006f0fd0 <+192>: vmulps %ymm9,%ymm4,%ymm2
0x00000000006f0fd5 <+197>: vaddps %ymm5,%ymm2,%ymm6
0x00000000006f0fd9 <+201>: vorps %ymm3,%ymm6,%ymm0
0x00000000006f0fdd <+205>: jne 0x6f0fea <_mm256_logf_app(float __vector(8))+218>
0x00000000006f0fdf <+207>: add $0x68,%rsp
0x00000000006f0fe3 <+211>: pop %rax
0x00000000006f0fe4 <+212>: pop %rbp
0x00000000006f0fe5 <+213>: lea -0x8(%rax),%rsp
0x00000000006f0fe9 <+217>: retq
令人费解的是下面这一段:
0x00000000006f0f51 <+65>: mov %fs:0x28,%rax
0x00000000006f0f5a <+74>: mov %rax,-0x18(%rbp)
0x00000000006f0f5e <+78>: xor %eax,%eax
0x00000000006f0f60 <+80>: vpsrld $0x17,%xmm5,%xmm6
0x00000000006f0f65 <+85>: vmovaps %xmm2,-0x50(%rbp)
0x00000000006f0f6a <+90>: mov -0x18(%rbp),%rax
0x00000000006f0f6e <+94>: xor %fs:0x28,%rax
在我看来,这像是在访问线程本地存储(通过 fs 段寄存器;我使用的是 x64 Linux)。它先从 TLS 中读取一个值并保存到栈上,随后又把它读回来,与 TLS 中的内容做异或运算?为什么要这样做?英特尔编译器(ICC)生成的代码中没有这一段。我使用的是 GCC 7.3。
编译命令如下:
> g++ -std=c++11 -O3 -g3 -Wall -Wextra -fno-omit-frame-pointer -fopenmp -pthread -Idep/opt/include -Iinc/ -ffast-math -fPIC -mavx -funroll-loops -c lib/simd_avx.cc -o lib/simd_avx.o