How to efficiently perform double/int64 conversions with SSE/AVX?

Asked: 2016-12-14 14:09:27

Tags: c++ performance floating-point sse avx

SSE2 has instructions for converting vectors between single-precision floats and 32-bit integers:

  • _mm_cvtps_epi32()
  • _mm_cvtepi32_ps()

But there are no equivalents for double precision and 64-bit integers. In other words, these are missing:

  • _mm_cvtpd_epi64()
  • _mm_cvtepi64_pd()

AVX doesn't seem to have them either.

What is the most efficient way to emulate these intrinsics?
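For context, the straightforward fallback is to convert one lane at a time with the scalar conversions. A minimal sketch (the helper names are my own; the answers below aim to beat this):

#include <immintrin.h>
#include <stdint.h>

//  Naive per-lane fallback using the scalar SSE2/SSE4.1 conversions (x86-64).
//  Handles the full range, but serializes into scalar instructions.
__m128i cvtpd_epi64_scalar(__m128d x){
    int64_t lo = _mm_cvtsd_si64(x);                       //  lane 0, current rounding mode
    int64_t hi = _mm_cvtsd_si64(_mm_unpackhi_pd(x, x));   //  lane 1
    return _mm_set_epi64x(hi, lo);
}

__m128d cvtepi64_pd_scalar(__m128i x){
    double lo = (double)_mm_cvtsi128_si64(x);             //  lane 0
    double hi = (double)_mm_extract_epi64(x, 1);          //  lane 1 (SSE4.1)
    return _mm_set_pd(hi, lo);
}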

3 answers:

Answer 0 (score: 24):

If you're willing to cut corners, double <-> int64 conversions can be done in just two instructions:

  • If you don't care about infinities or NaN.
  • For double <-> int64_t, you only care about values in the range [-2^51, 2^51].
  • For double <-> uint64_t, you only care about values in the range [0, 2^52).

double -> uint64_t

//  Only works for inputs in the range: [0, 2^52)
__m128i double_to_uint64(__m128d x){
    x = _mm_add_pd(x, _mm_set1_pd(0x0010000000000000));  //  add M = 2^52 (the literal converts to the double value 2^52)
    return _mm_xor_si128(
        _mm_castpd_si128(x),
        _mm_castpd_si128(_mm_set1_pd(0x0010000000000000))
    );
}

double -> int64_t

//  Only works for inputs in the range: [-2^51, 2^51]
__m128i double_to_int64(__m128d x){
    x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));  //  add M = 2^52 + 2^51
    return _mm_sub_epi64(
        _mm_castpd_si128(x),
        _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
    );
}

uint64_t -> double

//  Only works for inputs in the range: [0, 2^52)
__m128d uint64_to_double(__m128i x){
    x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));  //  OR in the exponent bits of M = 2^52
    return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
}

int64_t -> double

//  Only works for inputs in the range: [-2^51, 2^51]
__m128d int64_to_double(__m128i x){
    x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));  //  add the bit pattern of M = 2^52 + 2^51
    return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
}

Rounding behavior:

  • For the double -> uint64_t conversion, rounding works correctly following the current rounding mode. (Usually round-to-even.)
  • For the double -> int64_t conversion, rounding follows the current rounding mode for all modes except truncation. If the current rounding mode is truncation (round towards zero), it will actually round towards negative infinity.
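
A quick way to see the round-half-to-even behavior (a small usage sketch of my own, assuming the default rounding mode and the double_to_int64() defined above):

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

//  Both inputs are exactly halfway between integers;
//  round-to-nearest-even picks the even one in each case.
int main(void){
    __m128d v = _mm_set_pd(-2.5, 2.5);                        //  lane 1 = -2.5, lane 0 = 2.5
    int64_t r[2];
    _mm_storeu_si128((__m128i*)r, double_to_int64(v));
    printf("%lld %lld\n", (long long)r[0], (long long)r[1]);  //  prints: 2 -2
    return 0;
}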

How does it work?

Despite being only 2 instructions, this trick is not entirely self-explanatory.

The key is to realize that for double-precision floating point, values in the range [2^52, 2^53) have the "binary place" just below the lowest bit of the mantissa. In other words, if you zero out the exponent and sign bits, the mantissa becomes exactly the integer representation.

To convert x from double -> uint64_t, add the magic number M, which is the floating-point value of 2^52. This puts x into the "normalized" range [2^52, 2^53) and conveniently rounds away the fractional-part bits.

Now all that's left is to remove the upper 12 bits. This is easily done by masking them out. The fastest way is to recognize that those upper 12 bits are identical to those of M. So rather than introducing an additional mask constant, we can simply subtract or XOR by M. The XOR has higher throughput.

The uint64_t -> double conversion is simply the reverse of this process. You add back the exponent bits of M. Then un-normalize the number by subtracting M in floating point.

The signed integer conversions are slightly trickier since you need to deal with the two's-complement sign extension. I'll leave those as an exercise for the reader.
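
To make the bit manipulation concrete, here is a scalar rendering of the unsigned direction (my own illustrative sketch, not part of the original answer):

#include <stdint.h>
#include <string.h>

//  Scalar equivalent of double_to_uint64(). Valid for x in [0, 2^52).
uint64_t scalar_double_to_uint64(double x){
    const double M = 0x1p52;                //  magic number: 2^52
    double shifted = x + M;                 //  now in [2^52, 2^53): the mantissa holds the integer
    uint64_t bits, magic;
    memcpy(&bits,  &shifted, sizeof bits);  //  reinterpret the double's bit pattern
    memcpy(&magic, &M,       sizeof magic); //  0x4330000000000000: the exponent bits of M
    return bits ^ magic;                    //  strip the top 12 bits, leaving the integer
}

//  The reverse direction, valid for the same range.
double scalar_uint64_to_double(uint64_t x){
    const double M = 0x1p52;
    uint64_t magic, bits;
    double d;
    memcpy(&magic, &M, sizeof magic);
    bits = x | magic;                       //  glue the exponent bits of M back on
    memcpy(&d, &bits, sizeof d);
    return d - M;                           //  un-normalize by subtracting M
}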

Related: A fast method to round a double to a 32-bit int explained

Answer 1 (score: 11):

This answer is about 64-bit integer to double conversion, without cutting corners. We assume that both the integer input and the double output are in 256-bit-wide AVX registers. Two approaches are considered:

  1. int64_to_double_based_on_cvtsi2sd(): as suggested in the comments on the question, use cvtsi2sd 4 times together with some data shuffling. Unfortunately, both cvtsi2sd and the data-shuffling instructions need execution port 5, which limits the performance of this approach.

  2. int64_to_double_full_range(): we can use Mysticial's fast conversion method twice in order to get an accurate conversion for the full 64-bit integer range. The 64-bit integer is split into a 32-bit low part and a 32-bit high part, similarly to the answers to this question: How to perform uint32/float conversion with SSE?. Each of these pieces is suitable for Mysticial's integer-to-double conversion. Finally, the high part is multiplied by 2^32 and added to the low part. The signed conversion is a bit more complicated than the unsigned conversion (uint64_to_double_full_range()) because srai_epi64() doesn't exist.

Code:

    #include <stdio.h>
    #include <immintrin.h>
    #include <stdint.h>
    
    /* 
    gcc -O3 -Wall -m64 -mfma -mavx2 -march=broadwell cvt_int_64_double.c
    ./a.out A
    time ./a.out B
    time ./a.out C
    etc.
    */
    
    
    inline __m256d uint64_to_double256(__m256i x){                  /*  Mysticial's fast uint64_to_double. Works for inputs in the range: [0, 2^52)     */
        x = _mm256_or_si256(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)));
        return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
    }
    
    inline __m256d int64_to_double256(__m256i x){                   /*  Mysticial's fast int64_to_double. Works for inputs in the range: (-2^51, 2^51)  */
        x = _mm256_add_epi64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
        return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
    }
    
    
    __m256d int64_to_double_full_range(const __m256i v)
    {
        __m256i msk_lo       =_mm256_set1_epi64x(0xFFFFFFFF);
        __m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0);                 /* 2^32                                                                    */
    
        __m256i v_lo         = _mm256_and_si256(v,msk_lo);                  /* extract the 32 lowest significant bits of v                             */
        __m256i v_hi         = _mm256_srli_epi64(v,32);                     /* 32 most significant bits of v. srai_epi64 doesn't exist                 */
        __m256i v_sign       = _mm256_srai_epi32(v,32);                     /* broadcast sign bit to the 32 most significant bits                      */
                v_hi         = _mm256_blend_epi32(v_hi,v_sign,0b10101010);  /* restore the correct sign of v_hi                                        */
        __m256d v_lo_dbl     = int64_to_double256(v_lo);                    /* v_lo is within specified range of int64_to_double                       */ 
        __m256d v_hi_dbl     = int64_to_double256(v_hi);                    /* v_hi is within specified range of int64_to_double                       */ 
                v_hi_dbl     = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl);        /* _mm256_mul_pd and _mm256_add_pd may compile to a single fma instruction */
        return _mm256_add_pd(v_hi_dbl,v_lo_dbl);                            /* rounding occurs if the integer doesn't exist as a double                */   
    }
    
    
    __m256d int64_to_double_based_on_cvtsi2sd(const __m256i v)
    {   __m128d zero         = _mm_setzero_pd();                            /* to avoid uninitialized variables in _mm_cvtsi64_sd                      */
        __m128i v_lo         = _mm256_castsi256_si128(v);
        __m128i v_hi         = _mm256_extracti128_si256(v,1);
        __m128d v_0          = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_lo));
        __m128d v_2          = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_hi));
        __m128d v_1          = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_lo,1));
        __m128d v_3          = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_hi,1));
        __m128d v_01         = _mm_unpacklo_pd(v_0,v_1);
        __m128d v_23         = _mm_unpacklo_pd(v_2,v_3);
        __m256d v_dbl        = _mm256_castpd128_pd256(v_01);
                v_dbl        = _mm256_insertf128_pd(v_dbl,v_23,1);
        return v_dbl;
    }
    
    
    __m256d uint64_to_double_full_range(const __m256i v)                    
    {
        __m256i msk_lo       =_mm256_set1_epi64x(0xFFFFFFFF);
        __m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0);                 /* 2^32                                                                    */
    
        __m256i v_lo         = _mm256_and_si256(v,msk_lo);                  /* extract the 32 lowest significant bits of v                             */
        __m256i v_hi         = _mm256_srli_epi64(v,32);                     /* 32 most significant bits of v                                           */
        __m256d v_lo_dbl     = uint64_to_double256(v_lo);                   /* v_lo is within specified range of uint64_to_double                      */ 
        __m256d v_hi_dbl     = uint64_to_double256(v_hi);                   /* v_hi is within specified range of uint64_to_double                      */ 
                v_hi_dbl     = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl);        
        return _mm256_add_pd(v_hi_dbl,v_lo_dbl);                            /* rounding may occur for inputs >2^52                                     */ 
    }
    
    
    
    int main(int argc, char **argv){
      int i;
      uint64_t j;
      __m256i j_4, j_inc;
      __m256d v, v_acc;
      double x[4];
      char test = argv[1][0];
    
      if (test=='A'){               /* test the conversions for several integer values                                       */
        j = 1ull;
        printf("\nint64_to_double_full_range\n");
        for (i = 0; i<30; i++){
          j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
          v  = int64_to_double_full_range(j_4);
          _mm256_storeu_pd(x,v);
          printf("j =%21li    v =%23.1f    -v=%23.1f    v+3=%23.1f    v-3=%23.1f  \n",j,x[0],x[1],x[2],x[3]);
          j  = j*7ull;
        }
    
        j = 1ull;
        printf("\nint64_to_double_based_on_cvtsi2sd\n");
        for (i = 0; i<30; i++){
          j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
          v  = int64_to_double_based_on_cvtsi2sd(j_4);
          _mm256_storeu_pd(x,v);
          printf("j =%21li    v =%23.1f    -v=%23.1f    v+3=%23.1f    v-3=%23.1f  \n",j,x[0],x[1],x[2],x[3]);
          j  = j*7ull;
        }
    
        j = 1ull;                       
        printf("\nuint64_to_double_full_range\n");
        for (i = 0; i<30; i++){
          j_4= _mm256_set_epi64x(j-3,j+3,j,j);
          v  = uint64_to_double_full_range(j_4);
          _mm256_storeu_pd(x,v);
          printf("j =%21lu    v =%23.1f   v+3=%23.1f    v-3=%23.1f \n",j,x[0],x[2],x[3]);
          j  = j*7ull;    
        }
      }
      else{
        j_4   = _mm256_set_epi64x(-123,-4004,-312313,-23412731);  
        j_inc = _mm256_set_epi64x(1,1,1,1);  
        v_acc = _mm256_setzero_pd();
        switch(test){
    
          case 'B' :{                  
            printf("\nLatency int64_to_double_cvtsi2sd()\n");      /* simple test to get a rough idea of the latency of int64_to_double_cvtsi2sd()     */
            for (i = 0; i<1000000000; i++){
              v  =int64_to_double_based_on_cvtsi2sd(j_4);
              j_4= _mm256_castpd_si256(v);                         /* cast without conversion, use output as an input in the next step                 */
            }
            _mm256_storeu_pd(x,v);
          }
          break;
    
          case 'C' :{                  
            printf("\nLatency int64_to_double_full_range()\n");    /* simple test to get a rough idea of the latency of int64_to_double_full_range()    */
            for (i = 0; i<1000000000; i++){
              v  = int64_to_double_full_range(j_4);
              j_4= _mm256_castpd_si256(v);
            }
            _mm256_storeu_pd(x,v);
          }
          break;
    
          case 'D' :{                  
            printf("\nThroughput int64_to_double_cvtsi2sd()\n");   /* simple test to get a rough idea of the throughput of int64_to_double_cvtsi2sd()   */
            for (i = 0; i<1000000000; i++){
              j_4   = _mm256_add_epi64(j_4,j_inc);                 /* each step a different input                                                       */
              v     = int64_to_double_based_on_cvtsi2sd(j_4);
              v_acc = _mm256_xor_pd(v,v_acc);                      /* use somehow the results                                                           */
            }
            _mm256_storeu_pd(x,v_acc);
          }
          break;
    
          case 'E' :{                  
            printf("\nThroughput int64_to_double_full_range()\n"); /* simple test to get a rough idea of the throughput of int64_to_double_full_range() */
            for (i = 0; i<1000000000; i++){
              j_4   = _mm256_add_epi64(j_4,j_inc);  
              v     = int64_to_double_full_range(j_4);
              v_acc = _mm256_xor_pd(v,v_acc);           
            }    
            _mm256_storeu_pd(x,v_acc);
          }
          break;
    
          default : {}
        }  
        printf("v =%23.1f    -v =%23.1f    v =%23.1f    -v =%23.1f  \n",x[0],x[1],x[2],x[3]);
      }
    
      return 0;
    }
    

    The actual performance of these functions may depend on the surrounding code and the CPU generation.

    Timing results for 1e9 conversions (256 bits wide), using the simple tests B, C, D, and E from the code above, on an Intel Skylake i5-6500 system:

    Latency experiment int64_to_double_based_on_cvtsi2sd()      (test B)  5.02 sec.
    Latency experiment int64_to_double_full_range()             (test C)  3.77 sec.
    Throughput experiment int64_to_double_based_on_cvtsi2sd()   (test D)  2.82 sec.
    Throughput experiment int64_to_double_full_range()          (test E)  1.07 sec.
    

    The difference in throughput between int64_to_double_full_range() and int64_to_double_based_on_cvtsi2sd() is larger than I expected.

Answer 2 (score: 1):

Thanks to @mysticial and @wim for the full-range i64->f64. I came up with a full-range truncating f64->i64 for the Highway SIMD wrapper.

The first version tried changing the rounding mode, but Clang reorders those changes and ignores asm volatile, memory/cc clobbers, and even atomic fences. It's not clear to me how to make that safe. NOINLINE works, but causes lots of spilling.

The second version (Compiler Explorer link) emulates FP renormalization, and turns out to be faster according to llvm-mca (8-10 cycles rthroughput/total).

// Full-range F64 -> I64 conversion
#include <hwy/highway.h>

namespace hwy {
namespace HWY_NAMESPACE {

HWY_API Vec256<int64_t> I64FromF64(Full256<int64_t> di, const Vec256<double> v) {
  const RebindToFloat<decltype(di)> dd;
  using VD = decltype(v);
  using VI = decltype(Zero(di));

  const VI k0 = Zero(di);
  const VI k1 = Set(di, 1);
  const VI k51 = Set(di, 51);

  // Exponent indicates whether the number can be represented as int64_t.
  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
  const VI exp = biased_exp - Set(di, 0x3FF);
  const auto in_range = exp < Set(di, 63);

  // If we were to cap the exponent at 51 and add 2^52, the number would be in
  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
  // manually shift the mantissa into place (we already have many of the
  // inputs anyway).
  const VI shift_mnt = Max(k51 - exp, k0);
  const VI shift_int = Max(exp - k51, k0);
  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
  // For inputs larger than 2^52, insert zeros at the bottom.
  const VI shifted = int52 << shift_int;
  // Restore the one bit lost when shifting in the implicit 1-bit.
  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));

  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
  const VI magnitude = IfThenElse(in_range, restored, limit);

  // If the input was negative, negate the integer (two's complement).
  return (magnitude ^ sign_mask) - sign_mask;
}

void Test(const double* pd, int64_t* pi) {
    Full256<int64_t> di;
    Full256<double> dd;
    for (size_t i = 0; i < 256; i += Lanes(di)) {
      Store(I64FromF64(di, Load(dd, pd + i)), di, pi + i);
    }
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy

If anyone sees any potential for simplifying the algorithm, please leave a comment.