Question

我一直试图用SSE2内在函数（intrinsics）实现逐元素的向量移位，但从实验结果和Intel Intrinsics Guide来看，移位指令似乎只使用了计数向量中的最低有效部分。

换一种方式表述我的问题：给定一个向量{v1，v2，...，vn}和一组移位计数{s1，s2，...，sn}，我该如何计算结果{r1，r2，...，rn}，使得：

r1 = v1 << s1
r2 = v2 << s2
...
rn = vn << sn

因为看起来_mm_sll_epi*执行的是这样的操作：

r1 = v1 << s1
r2 = v2 << s1
...
rn = vn << s1

提前致谢。

编辑：

这是我的代码：

#include <cstdint>
#include <iostream>
#include <stdexcept>

#include <mmintrin.h>
#include <emmintrin.h>

namespace SIMD {

    /// Four unsigned 64-bit lanes stored in two SSE2 registers:
    /// lanes 0-1 live in `low`, lanes 2-3 live in `high`.
    class SSE2 {
    public:
        /// Builds the vector {a, b, c, d}, with `a` as lane 0.
        /// _mm_set_epi64x takes the upper element first, hence the swapped argument order.
        SSE2(std::uint64_t a, std::uint64_t b, std::uint64_t c, std::uint64_t d)
            : low(_mm_set_epi64x(static_cast<long long>(b), static_cast<long long>(a))),
              high(_mm_set_epi64x(static_cast<long long>(d), static_cast<long long>(c))),
              result{}
        {
        }

        /// Reads lane `idx` (0..3).
        ///
        /// The returned reference points into a scratch buffer owned by this object:
        /// it is overwritten by the next call to operator[], and writing through it
        /// does not modify the vector registers.
        ///
        /// @throws std::out_of_range if `idx` is not in [0, 3].
        std::uint64_t& operator[](int idx)
        {
            if (idx < 0 || idx > 3) {
                throw std::out_of_range("SIMD::SSE2::operator[]: index must be in [0, 3]");
            }

            // Lanes 0-1 come from `low`, lanes 2-3 from `high`; spill the whole
            // register and pick the element inside it.
            const __m128i half = (idx < 2) ? low : high;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(result), half);
            return result[idx & 1];
        }

        /// Shifts the lanes left by counts taken from `rhs`.
        ///
        /// NOTE: _mm_sll_epi64 reads a single shift count from the low 64 bits of
        /// its second operand and applies it to both 64-bit lanes. Lanes 0 and 1
        /// are therefore both shifted by rhs lane 0, and lanes 2 and 3 by rhs
        /// lane 2; rhs lanes 1 and 3 are ignored.
        SSE2& operator<<=(const SSE2& rhs)
        {
            low  = _mm_sll_epi64(low,  rhs.getlow());
            high = _mm_sll_epi64(high, rhs.gethigh());

            return *this;
        }

        /// Writes the four lanes to std::cout in hexadecimal, each followed by a
        /// space, then switches the stream back to decimal.
        void print() const
        {
            // Unaligned stores: a plain local array is not guaranteed to be
            // 16-byte aligned, which _mm_store_si128 would require.
            std::uint64_t lanes[4];
            _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), low);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes + 2), high);

            std::cout << std::hex;
            for (const std::uint64_t lane : lanes) {
                std::cout << lane << ' ';
            }
            std::cout << std::dec;
        }

        /// Returns the register holding lanes 0-1.
        __m128i getlow() const
        {
            return low;
        }

        /// Returns the register holding lanes 2-3.
        __m128i gethigh() const
        {
            return high;
        }
    private:
        __m128i low, high;
        std::uint64_t result[2];  // scratch buffer backing the reference returned by operator[]
    };
}

// Self-test: shifts x left by y and compares the lanes against the per-lane
// expectation (7<<4, 8<<5, 15<<6, 10<<7), printing PASSED or FAILED.
int main()
{
    // std:: qualification is required here: the using-directive inside
    // namespace SIMD does not make cout/endl visible at global scope.
    std::cout << "operator<<= test: vector << vector: ";
    {
        auto x = SIMD::SSE2(7, 8, 15, 10);
        auto y = SIMD::SSE2(4, 5,  6,  7);

        x.print();
        y.print();

        x <<= y;

        if (x[0] != 112 || x[1] != 256 || x[2] != 960 || x[3] != 1280) {
            std::cout << "FAILED: ";
            x.print();
            std::cout << std::endl;
        } else {
            std::cout << "PASSED" << std::endl;
        }
    }

    return 0;
}

预期结果应为{7 << 4 = 112, 8 << 5 = 256, 15 << 6 = 960, 10 << 7 = 1280}。而实际结果似乎是{7 << 4 = 112, 8 << 4 = 128, 15 << 6 = 960, 10 << 6 = 640}，这不是我想要的。

希望这有帮助，Jens。

Answer 1

如果AVX2可用，并且您的元素是32位或64位，则您的操作只需要一条可变移位（variable-shift）指令：vpsrlvq（__m128i _mm_srlv_epi64 (__m128i a, __m128i count)）；对应的左移版本是vpsllvq（_mm_sllv_epi64）。

对于SSE4.1的32位元素，请参阅“Shifting 4 integers right by different values SIMD”。根据对延迟与吞吐量的要求，你可以分别进行移位然后混合（blend），或者使用乘法（乘以特殊构造的2的幂向量）来获得可变计数的左移，然后再对所有元素执行相同计数的右移。

对于您的情况，具有运行时变量移位计数的64位元素：

每个SSE向量只有两个元素，所以我们只需要两次移位，然后合并结果：可以使用pblendw，或者使用浮点指令movsd（在某些CPU上可能带来额外的旁路延迟），或者使用两次shuffle，或者做两次AND和一次OR。

/// SSE2-only emulation of AVX2 _mm_srlv_epi64 (vpsrlvq): logically shifts each
/// 64-bit element of `a` right by the count held in the matching 64-bit
/// element of `count`.
///
/// _mm_srl_epi64 applies the count in the low 64 bits of its second operand to
/// both elements, so the shift is done twice (once per count) and the valid
/// half of each result is merged into the return value.
///
/// @param a      two packed 64-bit integers to shift
/// @param count  two packed 64-bit shift counts, one per element of `a`
/// @return       { a[0] >> count[0], a[1] >> count[1] }
__m128i SSE2_emulated_srlv_epi64(__m128i a, __m128i count)
{
    __m128i shift_low = _mm_srl_epi64(a, count);          // low 64 is correct, high 64 is garbage
    __m128i count_high = _mm_unpackhi_epi64(count,count); // broadcast the high element
    __m128i shift_high = _mm_srl_epi64(a, count_high);    // high 64 is correct, low 64 is garbage
    // SSE4.1: mask bits 4-7 select the upper four 16-bit words (the high 64 bits) from shift_high:
    // return _mm_blend_epi16(shift_low, shift_high, 0xF0);

#if 1   // use movsd to blend
    // movsd keeps the high 64 of its first operand and takes the low 64 from the second.
    // Faster than multiple instructions on most CPUs, but probably bad on Nehalem.
    __m128d blended = _mm_move_sd( _mm_castsi128_pd(shift_high), _mm_castsi128_pd(shift_low) );
    return _mm_castpd_si128(blended);
#else  // SSE2 without using FP instructions:
    // if we're going to do it this way, we could have shuffled the input before shifting.  Probably not helpful though.
    shift_high = _mm_unpackhi_epi64(shift_high, shift_high);       // broadcast the high64
    // unpacklo puts the low 64 of the FIRST operand in the low half of the result,
    // so shift_low must come first to keep the elements in order.
    return       _mm_unpacklo_epi64(shift_low, shift_high);        // combine
#endif
}

pshufd或psrldq之类的其他shuffle也可以工作，但是punpckhqdq不需要立即数字节就能完成工作，所以它短一个字节。SSSE3的palignr可以把一个寄存器的高元素和另一个寄存器的低元素放进同一个向量，但它们的顺序是反的（所以还需要pshufd来交换高低两半）。shufpd也可以用来混合，但相比movsd没有优势。

有关在两条整数指令之间使用FP指令可能带来的旁路延迟的详细信息，请参阅Agner Fog的microarch guide。在Intel SnB系列CPU上这大概没有问题，因为其他FP shuffle也没有问题。（是的，movsd xmm1, xmm0在port5的shuffle单元上运行。如果不需要合并行为，即使是标量，也应使用movaps或movapd进行寄存器到寄存器的移动。）

这段代码（在Godbolt上使用gcc5.3 -O3）编译为：

    movdqa  xmm2, xmm0  # tmp97, a
    psrlq   xmm2, xmm1    # tmp97, count
    punpckhqdq      xmm1, xmm1  # tmp99, count
    psrlq   xmm0, xmm1    # tmp100, tmp99
    movsd   xmm0, xmm2    # tmp102, tmp97
    ret

SSE2由向量移位

1 个答案:

对于您的情况，具有运行时变量移位计数的64位元素：