Question

我有一个带符号的短数组，我希望除以2048并得到一个浮点数组。

我发现SSE: convert short integer to float允许将 unsigned 短片转换为浮点数，但我也想处理已签名的短片。

以下代码有效但仅适用于正面短片。

// We want to divide some signed short by 2048 and get a float.
const auto floatScale = _mm256_set1_ps(2048);

short* shortsInput = /* values from somewhere */;
float* floatsOutput = /* initialized */;

__m128i* m128iInput = (__m128i*)&shortsInput[0];

// Converts the short vectors to 2 float vectors. This works, but only for positive shorts.
__m128i m128iLow = _mm_unpacklo_epi16(m128iInput[0], _mm_setzero_si128());
__m128i m128iHigh = _mm_unpackhi_epi16(m128iInput[0], _mm_setzero_si128());
__m128 m128Low = _mm_cvtepi32_ps(m128iLow);
__m128 m128High = _mm_cvtepi32_ps(m128iHigh);

// Puts the 2 __m128 vectors into 1 __m256.
__m256 singleComplete = _mm256_castps128_ps256(m128Low);
singleComplete = _mm256_insertf128_ps(singleComplete, m128High, 1);

// Finally do the math
__m256 scaledVect = _mm256_div_ps(singleComplete, floatScale);

// and puts the result where needed.
_mm256_storeu_ps(floatsOutput[0], scaledVect);

如何将已签名的短片转换为浮动？或者也许有更好的方法来解决这个问题？

编辑：我尝试了与非SIMD算法相比的不同答案，在2048阵列上进行10M次，在AMD Ryzen 7 2700上进行~3.2GHz。我使用Visual 15.7.3主要是默认配置：

/permissive- /Yu"stdafx.h" /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl 
/Fd"x64\Release\vc141.pdb" /Zc:inline /fp:precise /D "NDEBUG" /D "_CONSOLE"
/D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope
/arch:AVX2 /Gd /Oi /MD /openmp /FC /Fa"x64\Release\" /EHsc /nologo
/Fo"x64\Release\" /Fp"x64\Release\test.pch" /diagnostics:classic

请注意，我是SIMD的新手，并且多年没有使用过C ++。这是我得到的（我分别重新测试每个测试，而不是一个接一个，并得到更好的结果）：

没有SIMD：7300ms
wim的回答：2300ms
chtz的SSE2回答：1650ms
chtz的AVX2答案：2100ms

所以我通过使用SIMD得到了一个很好的加速，而chtz的SSE2答案虽然更加冗长和复杂，但速度更快。（至少在使用AVX编译时编译，因此它避免了使用3操作数VEX编码指令复制寄存器的额外指令。在Intel CPU上，AVX2版本应该比128位版本快得多。）

这是我的测试代码：

const int size = 2048;
const int loopSize = (int)1e7;

float* noSimd(short* shortsInput) {
    float* floatsOutput = new float[size];

    auto startTime = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < loopSize; i++) {
        for (int j = 0; j < size; j++) {
            floatsOutput[j] = shortsInput[j] / 2048.0f;
        }
    }

    auto stopTime = std::chrono::high_resolution_clock::now();
    long long totalTime = (stopTime - startTime).count();

    printf("%lld noSimd\n", totalTime);

    return floatsOutput;
}

float* wimMethod(short* shortsInput) {
    const auto floatScale = _mm256_set1_ps(1.0f / 2048.0f);
    float* floatsOutput = new float[size];

    auto startTime = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < loopSize; i++) {
        for (int j = 0; j < size; j += 8) {
            __m128i short_vec = _mm_loadu_si128((__m128i*)&shortsInput[j]);
            __m256i int_vec = _mm256_cvtepi16_epi32(short_vec);
            __m256  singleComplete = _mm256_cvtepi32_ps(int_vec);

            // Finally do the math
            __m256 scaledVect = _mm256_mul_ps(singleComplete, floatScale);

            // and puts the result where needed.
            _mm256_storeu_ps(&floatsOutput[j], scaledVect);
        }
    }

    auto stopTime = std::chrono::high_resolution_clock::now();
    long long totalTime = (stopTime - startTime).count();

    printf("%lld wimMethod\n", totalTime);

    return floatsOutput;
}

float* chtzMethodSSE2(short* shortsInput) {
    float* floatsOutput = new float[size];

    auto startTime = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < loopSize; i++) {
        for (int j = 0; j < size; j += 8) {
            // get input:
            __m128i val = _mm_loadu_si128((__m128i*)&shortsInput[j]);
            // add 0x8000 to wrap to unsigned short domain:
            val = _mm_add_epi16(val, const0x8000);
            // interleave with upper part of float(1<<23)/2048.f:
            __m128i lo = _mm_unpacklo_epi16(val, const0x4580);
            __m128i hi = _mm_unpackhi_epi16(val, const0x4580);
            // interpret as float and subtract float((1<<23) + (0x8000))/2048.f
            __m128 lo_f = _mm_sub_ps(_mm_castsi128_ps(lo), constFloat);
            __m128 hi_f = _mm_sub_ps(_mm_castsi128_ps(hi), constFloat);
            // store:
            _mm_storeu_ps(&floatsOutput[j], lo_f);
            _mm_storeu_ps(&floatsOutput[j] + 4, hi_f);
        }
    }

    auto stopTime = std::chrono::high_resolution_clock::now();
    long long totalTime = (stopTime - startTime).count();

    printf("%lld chtzMethod\n", totalTime);

    return floatsOutput;
}

float* chtzMethodAVX2(short* shortsInput) {
    const auto floatScale = _mm256_set1_ps(1.0f / 2048.0f);
    float* floatsOutput = new float[size];

    auto startTime = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < loopSize; i++) {
        for (int j = 0; j < size; j += 8) {

            // get input:
            __m128i val = _mm_loadu_si128((__m128i*)&shortsInput[j]);
            // interleave with 0x0000
            __m256i val_unpacked = _mm256_cvtepu16_epi32(val);

            // 0x4580'8000
            const __m256 magic = _mm256_set1_ps(float((1 << 23) + (1 << 15)) / 2048.f);
            const __m256i magic_i = _mm256_castps_si256(magic);

            /// convert by xor-ing and subtracting magic value:
            // VPXOR avoids port5 bottlenecks on Intel CPUs before SKL
            __m256 val_f = _mm256_castsi256_ps(_mm256_xor_si256(val_unpacked, magic_i));
            __m256 converted = _mm256_sub_ps(val_f, magic);
            // store:
            _mm256_storeu_ps(&floatsOutput[j], converted);
        }
    }

    auto stopTime = std::chrono::high_resolution_clock::now();
    long long totalTime = (stopTime - startTime).count();

    printf("%lld chtzMethod2\n", totalTime);

    return floatsOutput;
}

Answer 1

您可以通过手动编写浮点数来替换执行转换epi16-＆gt; epi32-＆gt;浮点数并乘以1.f/2048.f的标准方法。

这是因为除数是2的幂，所以手动组成浮点只意味着不同的指数。

感谢@PeterCordes，这里是这个想法的优化AVX2版本，使用XOR设置32位浮点数的高位字节，同时翻转整数值的符号位。 FP SUB将尾数的低位转换为正确的FP值：

// get input:
__m128i val = _mm_loadu_si128((__m128i*)input);
// interleave with 0x0000
__m256i val_unpacked = _mm256_cvtepu16_epi32(val);

// 0x4580'8000
const __m256 magic = _mm256_set1_ps(float((1<<23) + (1<<15))/2048.f);
const __m256i magic_i = _mm256_castps_si256(magic);

/// convert by xor-ing and subtracting magic value:
// VPXOR avoids port5 bottlenecks on Intel CPUs before SKL
__m256 val_f = _mm256_castsi256_ps(_mm256_xor_si256(val_unpacked, magic_i));
__m256 converted = _mm256_sub_ps(val_f, magic);
// store:
_mm256_storeu_ps(output, converted);

见on the Godbolt compiler explorer with gcc and clang;在Skylake i7-6700k上，一个2048元素的循环，在高速缓存中需要大约360个时钟周期，速度（在测量误差范围内）与执行标准符号扩展/转换的@ wim版本相同/ multiply（具有相似数量的循环展开）。由Linux perf的@PeterCordes测试。但在Ryzen上，这可能会明显加快，因为我们避免使用_mm256_cvtepi32_ps（Ryzen每

vcvtdq2ps ymm

http://agner.org/optimize/每2个时钟吞吐量为1个。）

0x8000的xor与下半部相当于加/减0x8000，因为忽略了溢出/进位。巧合的是，这允许使用相同的魔术常数进行异或和减法。

奇怪的是，gcc和clang更喜欢用-magic添加替换减法，这不会重复使用常量...他们更喜欢使用add，因为它是可交换的，但是在这种情况下没有任何好处，因为他们没有使用内存操作数。

这里是一个SSE2版本，它执行有符号/无符号翻转，与设置32位FP位模式的高2字节分开。

我们使用一个_mm_add_epi16，两个_mm_unpackXX_epi16和两个_mm_sub_ps获取8个值（_mm_castsi128_ps是无操作，{{1}将缓存在寄存器中）：

_mm_set

用法演示： https://ideone.com/b8BfJd

如果您的输入未签名，则不需要// get input: __m128i val = _mm_loadu_si128((__m128i*)input); // add 0x8000 to wrap to unsigned short domain: // val = _mm_add_epi16(val, _mm_set1_epi16(0x8000)); val = _mm_xor_si128(val, _mm_set1_epi16(0x8000)); // PXOR runs on more ports, avoids competing with FP add/sub or unpack on Sandybridge/Haswell. // interleave with upper part of float(1<<23)/2048.f: __m128i lo = _mm_unpacklo_epi16(val, _mm_set1_epi16(0x4580)); __m128i hi = _mm_unpackhi_epi16(val, _mm_set1_epi16(0x4580)); // interpret as float and subtract float((1<<23) + (0x8000))/2048.f __m128 lo_f = _mm_sub_ps(_mm_castsi128_ps(lo), _mm_set_ps1(float((1<<23) + (1<<15))/2048.f)); __m128 hi_f = _mm_sub_ps(_mm_castsi128_ps(hi), _mm_set_ps1(float((1<<23) + (1<<15))/2048.f)); // store: _mm_storeu_ps(output, lo_f); _mm_storeu_ps(output+4, hi_f);（并且_mm_add_epi16中的1<<15将被删除，当然）。那么你就可以在SSE: convert short integer to float上找到Marat的答案。

这可以轻松移植到AVX2，每次迭代转换两次，但是必须注意输出元素的顺序（感谢@wim指出这一点）。

此外，对于纯SSE解决方案，可以简单地使用_mm_sub_ps，但这是英特尔库函数。没有单一指令可以做到这一点。

_mm_cvtpi16_ps

我没有对任何解决方案进行基准测试（也没有检查理论吞吐量或延迟）

Answer 2

使用AVX2时，无需单独转换高低音：

const auto floatScale = _mm256_set1_ps(1.0f/2048.0f);

short* shortsInput = /* values from somewhere */;
float* floatsOutput = /* initialized */;

__m128i short_vec = _mm_loadu_si128((__m128i*)shortsInput);
__m256i int_vec =  _mm256_cvtepi16_epi32 (short_vec);
__m256  singleComplete = _mm256_cvtepi32_ps (int_vec);

// Finally do the math
__m256 scaledVect = _mm256_mul_ps(singleComplete, floatScale);

// and puts the result where needed.
_mm256_storeu_ps(floatsOutput, scaledVect);

这很好地编译on the Godbolt compiler explorer，并且在L1d高速缓存和对齐的输入/输出数组中输入/输出热，在Skylake i7-6700k上在~360个时钟周期内转换2048个元素的数组（在重复循环中测试））。每个元素大约0.18个周期，或每个时钟周期大约5.7个转换。或每个载体约1.4个循环，包括商店。它主要是前端吞吐量（每时钟3.75个融合域uop）的瓶颈，即使clang的循环展开也是如此，因为转换是5 uops。

请注意，即使在Haswell / Skylake上使用简单的寻址模式，vpmovsxwd ymm, [mem]也不能微融合到单个uop中，所以在这种情况下，最近gcc / clang变换指针增量到索引寻址是好的单循环计数器。对于大多数内存源向量指令（如vpmovsxwd xmm, [mem]），这将花费额外的uop：Micro fusion and addressing modes。

只需一个加载和一个商店，就可以在Haswell / Skylake的port7商店AGU上运行商店，它只处理非索引的寻址模式。

Intel CPU上的最大吞吐量需要循环展开（如果没有内存瓶颈），因为load + convert + store已经是4 uops。与@ chtz的答案相同。

如果您只需要读取浮点值几次，理想情况下立即使用矢量结果进行进一步计算。它只有3个指令（但是对于隐藏的无序exec确实有一些延迟）。在需要时重做转换可能比使用更大的缓存占用空间来将两倍大的float[]结果存储在内存中更好;这取决于你的用例和硬件。

在C ++ SIMD中将signed short转换为float

2 个答案: