请考虑随机生成的 __m256i 向量。与除以 float(1ull<<32) 相比,是否有更快的精确方法,可以将其转换为取值在 0(含)到 1(不含)之间的 __m256 浮点向量?
这是我到目前为止尝试过的,其中iRand
是输入,ans
是输出:
// Maps each signed 32-bit lane of iRand to a float computed as lane * 2^-32 + 0.5.
// The scaling is done with a multiplication by 2^-32 instead of a division by float(1ull<<32):
// both are exact power-of-two scalings of an integer-valued float, so the results are
// bit-identical, while vmulps is considerably cheaper than vdivps.
// NOTE(review): under the default round-to-nearest mode, lanes close to INT32_MAX convert to 2^31,
// so the result can be exactly 1.0f - confirm whether the upper bound has to be exclusive.
const __m256 fRand = _mm256_cvtepi32_ps(iRand);
const __m256 normalized = _mm256_mul_ps(fRand, _mm256_set1_ps(0x1p-32f));
const __m256 ans = _mm256_add_ps(normalized, _mm256_set1_ps(0.5f));
答案 0 :(得分:7)
与使用 _mm256_div_ps 的初始版本相比,下面的版本应该更快。vdivps 相当慢,例如在我的 Haswell Xeon 上,它的延迟为 18-21 个周期,吞吐量为 14 个周期。顺便说一下,较新的 CPU 性能更好:在 Skylake 上为 11/5,在 Ryzen 上为 10/6。
我的实现也不理想,它不会在输出间隔中输出所有可能的值,跳过许多可表示的浮点数,尤其是接近0。但是至少分布非常均匀。
// Turns 8 lanes of random bits into 8 floats in [0..1) with a uniform step of 2^-23.
// Only the low 23 bits of every lane are consumed; the upper 9 bits are discarded.
__m256 __vectorcall randomFloats( __m256i randomBits )
{
    // Bit pattern of 1.0f: sign = 0, exponent = 2^0, mantissa = 0.
    const __m256 unit = _mm256_set1_ps( 1.0f );
    // Selects the 23 mantissa bits of a float. The mask is a constant, so building it with
    // set1_epi32 is only a convenience; the masking itself is done with a float-domain AND.
    const __m256 keepMantissa = _mm256_castsi256_ps( _mm256_set1_epi32( 0x007FFFFF ) );

    // Random mantissa with cleared sign and exponent bits.
    const __m256 mantissa = _mm256_and_ps( _mm256_castsi256_ps( randomBits ), keepMantissa );
    // Attach the sign and exponent of 1.0, which yields floats in [1..2).
    const __m256 inOneTwo = _mm256_or_ps( mantissa, unit );

    // Shift [1..2) down to [0..1). Producing [0..1) directly with bit tricks is not an option,
    // because the resulting values would be distributed very unevenly.
    return _mm256_sub_ps( inOneTwo, unit );
}
更新:如果要提高精度,请使用以下版本。但它不再是“最快的”。
// Turns 8 lanes of 32 random bits into 8 floats in [0..1), consuming all 32 bits of every lane.
// The low 23 bits produce a uniform value with step 2^-23; the high 9 bits are then added below
// that step (down to 2^-32). Most floats in the range cannot store that many bits, so the extra
// bits only survive for small enough values.
// The MXCSR rounding mode is switched to round-down for the final fmadd and restored afterwards.
__m256 __vectorcall randomFloats_32( __m256i randomBits )
{
    // Convert to random float bits
    __m256 result = _mm256_castsi256_ps( randomBits );
    // Zero out exponent bits, leave random bits in mantissa.
    const __m256 mantissaMask = _mm256_castsi256_ps( _mm256_set1_epi32( 0x007FFFFF ) );
    result = _mm256_and_ps( result, mantissaMask );
    // Set sign + exponent bits to that of 1.0, which is sign=0, exponent = 2^0.
    const __m256 one = _mm256_set1_ps( 1.0f );
    result = _mm256_or_ps( result, one );
    // Subtract 1.0. The above algorithm generates floats in range [1..2).
    result = _mm256_sub_ps( result, one );

    // Use the 9 random bits that did not go into the mantissa (bits 23..31) to add extra randomness
    // to the lower bits of the values. The mask must select the bits NOT covered by mantissaMask;
    // masking with mantissaMask itself would only reuse bits that are already in the mantissa.
    // If you want uniformly distributed floats with 2^-24 precision, replace unusedBitsMask with
    // _mm256_set1_epi32( 0x80000000 ). In this case you don't need to set rounding mode bits in MXCSR.
    const __m256i unusedBitsMask = _mm256_set1_epi32( static_cast<int>( 0xFF800000u ) );
    __m256i extraBits = _mm256_and_si256( randomBits, unusedBitsMask );
    // Move the selected bits to the top of the mantissa field (bits 14..22).
    extraBits = _mm256_srli_epi32( extraBits, 9 );
    // Same [1..2) - 1.0 trick as above: extra is in [0..1) with step 2^-9.
    __m256 extra = _mm256_castsi256_ps( extraBits );
    extra = _mm256_or_ps( extra, one );
    extra = _mm256_sub_ps( extra, one );

    // Round down so that the largest sums (just below 1.0) are not rounded up to exactly 1.0.
    // The caller's rounding mode is saved and restored so the MXCSR change does not leak out.
    const unsigned int savedRounding = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE( _MM_ROUND_DOWN );
    constexpr float mul = 0x1p-23f; // The initial part of the algorithm has generated uniform distribution with the step 2^-23.
    result = _mm256_fmadd_ps( extra, _mm256_set1_ps( mul ), result );
    _MM_SET_ROUNDING_MODE( savedRounding );
    return result;
}
答案 1 :(得分:2)
首先,不要做除法,改用乘法。虽然 @Soonts 的答案对您来说可能已经足够,但我必须指出:由于它使用了到 [1, 2) 区间的映射,它产生的是形如 $k \cdot 2^{-23}$ 的均匀二进有理数,这只是可能生成的数值的一半。我更喜欢 S. Vigna 给出的方法(在底部),其中所有形如 $k \cdot 2^{-24}$ 的二进有理数出现的可能性相同。
代码,VC ++ 2019,x64,Win10,英特尔i7 Skylake
#include <cstdint>
#include <cstdio>
#include <random>
#include "immintrin.h"
// Debug helper: prints the eight 32-bit lanes of `in` as unsigned decimals on one line.
auto p256_dec_u32(__m256i in) -> void {
    // The aligned store below needs a destination with the alignment of __m256i.
    alignas(__m256i) uint32_t lanes[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), in);
    printf("v8_u32: %u %u %u %u %u %u %u %u\n",
           lanes[0], lanes[1], lanes[2], lanes[3],
           lanes[4], lanes[5], lanes[6], lanes[7]);
}
// Debug helper: prints the eight float lanes of `in` in scientific notation on one line.
auto p256_dec_f32(__m256 in) -> void {
    // The aligned store below needs a destination with the alignment of __m256.
    alignas(__m256) float lanes[8];
    _mm256_store_ps(lanes, in);
    printf("v8_float: %e %e %e %e %e %e %e %e\n",
           lanes[0], lanes[1], lanes[2], lanes[3],
           lanes[4], lanes[5], lanes[6], lanes[7]);
}
// Sanity check for the "shift right by 8, multiply by 2^-24" conversion: generates N vectors of
// random 32-bit integers, converts every lane to a float in [0..1), and prints the per-lane
// average, which is expected to be close to 0.5.
auto main() -> int {
    const float c = 0x1.0p-24f; // or (1.0f / (uint32_t(1) << 24));
    const int N = 1000000;
    std::mt19937 rng{ 987654321ULL };
    __m256 sum = _mm256_set1_ps(0.0f);
    for (int k = 0; k != N; ++k) {
        // std::mt19937::result_type is uint_fast32_t, which may be wider than 32 bits; the explicit
        // casts avoid a narrowing conversion inside the braced initializer. Elements of a braced
        // list are evaluated left to right, so the draw order is preserved.
        alignas(alignof(__m256i)) uint32_t rnd[8] = {
            static_cast<uint32_t>(rng()), static_cast<uint32_t>(rng()),
            static_cast<uint32_t>(rng()), static_cast<uint32_t>(rng()),
            static_cast<uint32_t>(rng()), static_cast<uint32_t>(rng()),
            static_cast<uint32_t>(rng()), static_cast<uint32_t>(rng()) };
        __m256i r = _mm256_load_si256(reinterpret_cast<const __m256i*>(rnd));
        // Keep the top 24 bits of every lane (non-negative, exactly representable as float)
        // and scale by 2^-24, giving values of the form k * 2^-24 in [0..1).
        __m256 q = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_srli_epi32(r, 8)), _mm256_set1_ps(c));
        sum = _mm256_add_ps(sum, q);
    }
    sum = _mm256_div_ps(sum, _mm256_set1_ps(static_cast<float>(N))); // computing average
    p256_dec_f32(sum);
    return 0;
}
有输出
5.002970e-01 4.997833e-01 4.996118e-01 5.004955e-01 5.002163e-01 4.997193e-01 4.996586e-01 5.001499e-01