SSE2具有在单精度浮点数和32位整数之间转换矢量的指令。
_mm_cvtps_epi32()
_mm_cvtepi32_ps()
但是没有双精度和64位整数的等价物。换句话说,他们失踪了:
_mm_cvtpd_epi64()
_mm_cvtepi64_pd()
似乎AVX也没有它们。
模拟这些内在函数的最有效方法是什么?
答案 0 :(得分:24)
如果您愿意偷工减料,double <-> int64
转换只能在两个说明中完成:
NaN
。double <-> int64_t
,您只关心[-2^51, 2^51]
范围内的值。double <-> uint64_t
,您只关心[0, 2^52)
范围内的值。double - &gt; uint64_t中强>
// Only works for inputs in the range: [0, 2^52)
__m128i double_to_uint64(__m128d x){
x = _mm_add_pd(x, _mm_set1_pd(0x0010000000000000));
return _mm_xor_si128(
_mm_castpd_si128(x),
_mm_castpd_si128(_mm_set1_pd(0x0010000000000000))
);
}
double - &gt;的int64_t 强>
// Only works for inputs in the range: [-2^51, 2^51]
__m128i double_to_int64(__m128d x){
x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
return _mm_sub_epi64(
_mm_castpd_si128(x),
_mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
);
}
uint64_t - &gt;双强>
// Only works for inputs in the range: [0, 2^52)
__m128d uint64_to_double(__m128i x){
x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
}
int64_t - &gt;双强>
// Only works for inputs in the range: [-2^51, 2^51]
__m128d int64_to_double(__m128i x){
x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
}
舍入行为:
double -> uint64_t
转换,舍入在当前舍入模式后正常工作。 (通常是圆形到偶数)double -> int64_t
转换,对于除截断之外的所有模式,舍入将遵循当前舍入模式。如果当前的舍入模式是截断(向零舍入),它实际上将向负无穷大舍入。它是如何运作的?
尽管这个技巧只有2个指令,但它并不完全是不言自明的。
关键是要认识到对于双精度浮点,[2^52, 2^53)
范围内的值具有&#34;二进制位置&#34;就在尾数的最低位之下。换句话说,如果将指数和符号位置零,则尾数将精确地成为整数表示。
要从x
转换double -> uint64_t
,请添加幻数M
,这是2^52
的浮点值。这会将x
放入&#34;标准化&#34;范围[2^52, 2^53)
,方便地舍去小数部分位。
现在剩下的就是删除高12位。通过屏蔽它可以轻松完成。最快的方法是识别那些高12位与M
的高位相同。因此,我们可以通过M
简单地减去或XOR,而不是引入额外的掩码常数。 XOR具有更高的吞吐量。
从uint64_t -> double
转换只是与此过程相反。您将M
的指数位加回。然后通过在浮点数中减去M
来取消规范化数字。
签名的整数转换稍微复杂一些,因为您需要处理2的补码符号扩展。我将这些作为练习留给读者。
相关: A fast method to round a double to a 32-bit int explained
答案 1 :(得分:11)
这个答案是关于64位整数到双倍转换,没有偷工减料。 我们假设整数输入和双输出都是256位宽的AVX寄存器。 考虑两种方法:
int64_to_double_based_on_cvtsi2sd()
:正如对问题的评论中所建议的那样,使用cvtsi2sd
4次以及一些数据改组。
不幸的是,cvtsi2sd
和数据混洗指令都需要执行端口5.这限制了这种方法的性能。
int64_to_double_full_range()
:我们可以使用两次Mysticial的快速转换方法
完整的64位整数范围的精确转换。 64位整数分为32位低位和32位高位
,与此问题的答案类似:How to perform uint32/float conversion with SSE?。
这些部分中的每一个都适合于Mysticial的整数到双重转换。
最后,高部分乘以2 ^ 32并添加到低部分。
签名转换比无符号转换(uint64_to_double_full_range()
)更复杂一些,
因为srai_epi64()
不存在。
代码:
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>
/*
gcc -O3 -Wall -m64 -mfma -mavx2 -march=broadwell cvt_int_64_double.c
./a.out A
time ./a.out B
time ./a.out C
etc.
*/
inline __m256d uint64_to_double256(__m256i x){ /* Mysticial's fast uint64_to_double. Works for inputs in the range: [0, 2^52) */
x = _mm256_or_si256(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)));
return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
}
inline __m256d int64_to_double256(__m256i x){ /* Mysticial's fast int64_to_double. Works for inputs in the range: (-2^51, 2^51) */
x = _mm256_add_epi64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
}
__m256d int64_to_double_full_range(const __m256i v)
{
__m256i msk_lo =_mm256_set1_epi64x(0xFFFFFFFF);
__m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0); /* 2^32 */
__m256i v_lo = _mm256_and_si256(v,msk_lo); /* extract the 32 lowest significant bits of v */
__m256i v_hi = _mm256_srli_epi64(v,32); /* 32 most significant bits of v. srai_epi64 doesn't exist */
__m256i v_sign = _mm256_srai_epi32(v,32); /* broadcast sign bit to the 32 most significant bits */
v_hi = _mm256_blend_epi32(v_hi,v_sign,0b10101010); /* restore the correct sign of v_hi */
__m256d v_lo_dbl = int64_to_double256(v_lo); /* v_lo is within specified range of int64_to_double */
__m256d v_hi_dbl = int64_to_double256(v_hi); /* v_hi is within specified range of int64_to_double */
v_hi_dbl = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl); /* _mm256_mul_pd and _mm256_add_pd may compile to a single fma instruction */
return _mm256_add_pd(v_hi_dbl,v_lo_dbl); /* rounding occurs if the integer doesn't exist as a double */
}
__m256d int64_to_double_based_on_cvtsi2sd(const __m256i v)
{ __m128d zero = _mm_setzero_pd(); /* to avoid uninitialized variables in_mm_cvtsi64_sd */
__m128i v_lo = _mm256_castsi256_si128(v);
__m128i v_hi = _mm256_extracti128_si256(v,1);
__m128d v_0 = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_lo));
__m128d v_2 = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_hi));
__m128d v_1 = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_lo,1));
__m128d v_3 = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_hi,1));
__m128d v_01 = _mm_unpacklo_pd(v_0,v_1);
__m128d v_23 = _mm_unpacklo_pd(v_2,v_3);
__m256d v_dbl = _mm256_castpd128_pd256(v_01);
v_dbl = _mm256_insertf128_pd(v_dbl,v_23,1);
return v_dbl;
}
__m256d uint64_to_double_full_range(const __m256i v)
{
__m256i msk_lo =_mm256_set1_epi64x(0xFFFFFFFF);
__m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0); /* 2^32 */
__m256i v_lo = _mm256_and_si256(v,msk_lo); /* extract the 32 lowest significant bits of v */
__m256i v_hi = _mm256_srli_epi64(v,32); /* 32 most significant bits of v */
__m256d v_lo_dbl = uint64_to_double256(v_lo); /* v_lo is within specified range of uint64_to_double */
__m256d v_hi_dbl = uint64_to_double256(v_hi); /* v_hi is within specified range of uint64_to_double */
v_hi_dbl = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl);
return _mm256_add_pd(v_hi_dbl,v_lo_dbl); /* rounding may occur for inputs >2^52 */
}
int main(int argc, char **argv){
int i;
uint64_t j;
__m256i j_4, j_inc;
__m256d v, v_acc;
double x[4];
char test = argv[1][0];
if (test=='A'){ /* test the conversions for several integer values */
j = 1ull;
printf("\nint64_to_double_full_range\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
v = int64_to_double_full_range(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21li v =%23.1f -v=%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[1],x[2],x[3]);
j = j*7ull;
}
j = 1ull;
printf("\nint64_to_double_based_on_cvtsi2sd\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
v = int64_to_double_based_on_cvtsi2sd(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21li v =%23.1f -v=%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[1],x[2],x[3]);
j = j*7ull;
}
j = 1ull;
printf("\nuint64_to_double_full_range\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,j,j);
v = uint64_to_double_full_range(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21lu v =%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[2],x[3]);
j = j*7ull;
}
}
else{
j_4 = _mm256_set_epi64x(-123,-4004,-312313,-23412731);
j_inc = _mm256_set_epi64x(1,1,1,1);
v_acc = _mm256_setzero_pd();
switch(test){
case 'B' :{
printf("\nLatency int64_to_double_cvtsi2sd()\n"); /* simple test to get a rough idea of the latency of int64_to_double_cvtsi2sd() */
for (i = 0; i<1000000000; i++){
v =int64_to_double_based_on_cvtsi2sd(j_4);
j_4= _mm256_castpd_si256(v); /* cast without conversion, use output as an input in the next step */
}
_mm256_storeu_pd(x,v);
}
break;
case 'C' :{
printf("\nLatency int64_to_double_full_range()\n"); /* simple test to get a rough idea of the latency of int64_to_double_full_range() */
for (i = 0; i<1000000000; i++){
v = int64_to_double_full_range(j_4);
j_4= _mm256_castpd_si256(v);
}
_mm256_storeu_pd(x,v);
}
break;
case 'D' :{
printf("\nThroughput int64_to_double_cvtsi2sd()\n"); /* simple test to get a rough idea of the throughput of int64_to_double_cvtsi2sd() */
for (i = 0; i<1000000000; i++){
j_4 = _mm256_add_epi64(j_4,j_inc); /* each step a different input */
v = int64_to_double_based_on_cvtsi2sd(j_4);
v_acc = _mm256_xor_pd(v,v_acc); /* use somehow the results */
}
_mm256_storeu_pd(x,v_acc);
}
break;
case 'E' :{
printf("\nThroughput int64_to_double_full_range()\n"); /* simple test to get a rough idea of the throughput of int64_to_double_full_range() */
for (i = 0; i<1000000000; i++){
j_4 = _mm256_add_epi64(j_4,j_inc);
v = int64_to_double_full_range(j_4);
v_acc = _mm256_xor_pd(v,v_acc);
}
_mm256_storeu_pd(x,v_acc);
}
break;
default : {}
}
printf("v =%23.1f -v =%23.1f v =%23.1f -v =%23.1f \n",x[0],x[1],x[2],x[3]);
}
return 0;
}
这些功能的实际性能可能取决于周围的代码和cpu的生成。
在intel skylake i5 6500系统上,上述代码中的简单测试B,C,D和E的1e9转换(256位宽)的时序结果:
Latency experiment int64_to_double_based_on_cvtsi2sd() (test B) 5.02 sec.
Latency experiment int64_to_double_full_range() (test C) 3.77 sec.
Throughput experiment int64_to_double_based_on_cvtsi2sd() (test D) 2.82 sec.
Throughput experiment int64_to_double_full_range() (test E) 1.07 sec.
int64_to_double_full_range()
和int64_to_double_based_on_cvtsi2sd()
之间的吞吐量差异大于我的预期。
答案 2 :(得分:1)
感谢@mysticial 和@wim 提供全系列的 i64->f64。我为 Highway SIMD 包装器想出了一个全范围截断 f64->i64。
first version 尝试更改舍入模式,但 Clang 重新排序它们并忽略 asm volatile、memory/cc clobbers,甚至原子栅栏。我不清楚如何确保安全。 NOINLINE 有效,但会导致大量溢出。
第二个版本 (Compiler Explorer link) 模拟 FP 重整化,并且根据 llvm-mca 证明速度更快(8-10 个周期 rthroughput/total)。
// Full-range F64 -> I64 conversion
#include <hwy/highway.h>
namespace hwy {
namespace HWY_NAMESPACE {
HWY_API Vec256<int64_t> I64FromF64(Full256<int64_t> di, const Vec256<double> v) {
const RebindToFloat<decltype(di)> dd;
using VD = decltype(v);
using VI = decltype(Zero(di));
const VI k0 = Zero(di);
const VI k1 = Set(di, 1);
const VI k51 = Set(di, 51);
// Exponent indicates whether the number can be represented as int64_t.
const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
const VI exp = biased_exp - Set(di, 0x3FF);
const auto in_range = exp < Set(di, 63);
// If we were to cap the exponent at 51 and add 2^52, the number would be in
// [2^52, 2^53) and mantissa bits could be read out directly. We need to
// round-to-0 (truncate), but changing rounding mode in MXCSR hits a
// compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
// manually shift the mantissa into place (we already have many of the
// inputs anyway).
const VI shift_mnt = Max(k51 - exp, k0);
const VI shift_int = Max(exp - k51, k0);
const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
// Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
// For inputs larger than 2^52, insert zeros at the bottom.
const VI shifted = int52 << shift_int;
// Restore the one bit lost when shifting in the implicit 1-bit.
const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
// Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
const VI sign_mask = BroadcastSignBit(BitCast(di, v));
const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
const VI magnitude = IfThenElse(in_range, restored, limit);
// If the input was negative, negate the integer (two's complement).
return (magnitude ^ sign_mask) - sign_mask;
}
void Test(const double* pd, int64_t* pi) {
Full256<int64_t> di;
Full256<double> dd;
for (size_t i = 0; i < 256; i += Lanes(di)) {
Store(I64FromF64(di, Load(dd, pd + i)), di, pi + i);
}
}
}
}
如果有人看到任何简化算法的潜力,请发表评论。