在SSE 16位向量中去交织图像信道

时间:2016-03-09 16:53:52

标签: x86 sse simd intrinsics sse2

我有一幅32 bpp的图像。我需要把R、G、B颜色通道去交织到不同的16位向量中。我正在使用以下代码来完成此操作(参见“how to deinterleave image channel in SSE”):

  // Deinterleave the R, G, B and A channels of eight 32-bit pixels into 16-bit vectors.
  // Results are written to vr, vg, vb and va, which are declared outside this snippet.
  {
     // Keeps only the low byte of every 32-bit lane.
     const __m128i lowByte = _mm_set1_epi32(0xff);

     // Pixels 0..3: bring each channel down to the low byte of its 32-bit lane.
     const __m128i px0 = _mm_loadu_si128((__m128i *)(pSrc));
     const __m128i r0 = _mm_and_si128(px0, lowByte);
     const __m128i g0 = _mm_and_si128(_mm_srli_epi32(px0, 8), lowByte);
     const __m128i b0 = _mm_and_si128(_mm_srli_epi32(px0, 16), lowByte);
     const __m128i a0 = _mm_srli_epi32(px0, 24);  // logical shift already zeroes the upper bits

     // Pixels 4..7 (per the original note, pSrc is a uint32_t pointer, so +4 skips four pixels).
     const __m128i px1 = _mm_loadu_si128((__m128i *)(pSrc + 4));
     const __m128i r1 = _mm_and_si128(px1, lowByte);
     const __m128i g1 = _mm_and_si128(_mm_srli_epi32(px1, 8), lowByte);
     const __m128i b1 = _mm_and_si128(_mm_srli_epi32(px1, 16), lowByte);
     const __m128i a1 = _mm_srli_epi32(px1, 24);

     // Narrow each pair of 4x32-bit vectors into one 8x16-bit vector.
     // Every lane is at most 255, so the signed saturation of the pack never triggers.
     vr = _mm_packs_epi32(r0, r1);
     vg = _mm_packs_epi32(g0, g1);
     vb = _mm_packs_epi32(b0, b1);
     va = _mm_packs_epi32(a0, a1);
  }

我们可以让它更高效吗?下面是不对通道做去交织的高斯滤波代码。我发现它非常低效:

    // Zero-extend the sixteen 8-bit lanes of v0 to 16 bits each.
    //   v1 <- bytes 0..7 of v0, widened
    //   v2 <- bytes 8..15 of v0, widened
    static inline void ConvertTo16Bits(__m128i& v1, __m128i& v2, const __m128i& v0)
    {
        // Interleaving with an all-zero vector puts a zero byte above every
        // source byte, which is exactly an unsigned 8 -> 16 bit widening.
        v1 = _mm_unpacklo_epi8(v0, _mm_setzero_si128());
        v2 = _mm_unpackhi_epi8(v0, _mm_setzero_si128());
    }

    // Unsigned full-width multiply of eight 16-bit lanes: v0[i] * v1[i] -> 32 bits.
    //   vh <- the four 32-bit products of lanes 0..3 (despite the name, the LOW half)
    //   vl <- the four 32-bit products of lanes 4..7
    static inline void mul32bits(
        __m128i &vh, __m128i &vl,             // output - 2x4xint32_t
        const __m128i& v0, const __m128i& v1) // input  - 2x8xint16_t
    {
        // SSE2 has no 16x16 -> 32 multiply, so compute the low and high 16 bits
        // of every product separately and interleave them back into 32-bit lanes.
        const __m128i lowWords = _mm_mullo_epi16(v0, v1);
        const __m128i highWords = _mm_mulhi_epu16(v0, v1);
        vh = _mm_unpacklo_epi16(lowWords, highWords);
        vl = _mm_unpackhi_epi16(lowWords, highWords);
    }

    // One pixel as four 8-bit channels, stored in memory in the order r, g, b, a.
    struct Pixel
    {
        unsigned char r, g, b, a;
    };

    /**
     * Compute one output pixel as the weighted sum of `count` input pixels.
     *
     * Each input pixel is a packed 32-bit value whose bytes are read in memory
     * order as r, g, b, a. Pixel i is multiplied by gaussArray[i]; the
     * per-channel sums are accumulated in 32 bits, shifted right by 15 and
     * truncated to 8 bits.
     * NOTE(review): the >> 15 suggests the weights are fixed-point values that
     * sum to about 1 << 15 - confirm with the caller.
     *
     * @param pixelArray  `count` packed pixels (no alignment requirement).
     * @param count       number of pixels/weights; values <= 0 yield an all-zero result.
     * @param gaussArray  `count` 16-bit unsigned weights, one per pixel.
     * @param out         receives the resulting r, g, b, a channels.
     */
    void computePixelvalue(unsigned int * pixelArray, int count, unsigned short * gaussArray, Pixel& out)
    {
        __m128i sumRGBA = _mm_setzero_si128();
        // Turns { s1s1, s2s2, s1s1, s2s2 } (32-bit lanes) into { s1s1, s1s1, s2s2, s2s2 },
        // i.e. four 16-bit copies of s1 followed by four copies of s2.
        constexpr int shuffle = _MM_SHUFFLE(3, 1, 0, 0);

        // Vector path: four pixels (one 128-bit load) per iteration.
        while (count >= 4)
        {
            __m128i vrgba = _mm_loadu_si128((__m128i *)(pixelArray));
            __m128i rgba12, rgba34;

            // Widen the 16 channel bytes to 16 bits: pixels 1-2 and pixels 3-4.
            ConvertTo16Bits(rgba12, rgba34, vrgba);

            // Weights for pixels 1 and 2, each replicated across that pixel's 4 channels.
            unsigned short s1 = *gaussArray++;
            unsigned short s2 = *gaussArray++;
            __m128i gaussVector = _mm_shuffle_epi32(
                _mm_unpacklo_epi32(_mm_set1_epi16(s1), _mm_set1_epi16(s2)), shuffle);

            // Each result vector holds one pixel's four 32-bit channel products.
            __m128i multl, multh;
            mul32bits(multl, multh, rgba12, gaussVector);
            sumRGBA = _mm_add_epi32(sumRGBA, multl);
            sumRGBA = _mm_add_epi32(sumRGBA, multh);

            // Same again for pixels 3 and 4.
            s1 = *gaussArray++;
            s2 = *gaussArray++;
            gaussVector = _mm_shuffle_epi32(
                _mm_unpacklo_epi32(_mm_set1_epi16(s1), _mm_set1_epi16(s2)), shuffle);

            mul32bits(multl, multh, rgba34, gaussVector);
            sumRGBA = _mm_add_epi32(sumRGBA, multl);
            sumRGBA = _mm_add_epi32(sumRGBA, multh);

            count -= 4;
            pixelArray += 4;
        }

        // Extract the four 32-bit lanes through a store; the .m128i_u32 union
        // member is MSVC-specific and does not compile with GCC/Clang.
        unsigned int sums[4];
        _mm_storeu_si128(reinterpret_cast<__m128i *>(sums), sumRGBA);
        unsigned int r = sums[0];
        unsigned int g = sums[1];
        unsigned int b = sums[2];
        unsigned int a = sums[3];

        // Scalar tail for the remaining 0..3 pixels. The byte cursor is created
        // once and advanced by one pixel per iteration, so every leftover pixel
        // is read (previously the first leftover pixel was re-read each time).
        // A negative count leaves the loop untouched instead of wrapping around.
        const unsigned char *tailBytes = reinterpret_cast<const unsigned char *>(pixelArray);
        for (; count > 0; --count)
        {
            const unsigned short k = *gaussArray++;
            r += tailBytes[0] * k;
            g += tailBytes[1] * k;
            b += tailBytes[2] * k;
            a += tailBytes[3] * k;
            tailBytes += 4;
        }

        // Drop the 15 fractional bits and truncate to 8 bits per channel.
        out.r = static_cast<unsigned char>(r >> 15);
        out.g = static_cast<unsigned char>(g >> 15);
        out.b = static_cast<unsigned char>(b >> 15);
        out.a = static_cast<unsigned char>(a >> 15);
    }

1 个答案:

答案 0 :(得分:3)

使用 `pshufb` 将 `{ a b g r ... }` 形式的向量转换为 `{ a a a a b b b b g g g g r r r r }` 形式的向量(每个源向量一条 `pshufb`)。

在两个混洗后的源向量之间执行 `punpckldq`,得到 `{ g2g2g2g2 g1g1g1g1 r2r2r2r2 r1r1r1r1 }`。对低半部分使用 `pmovzxbw`,并用零解包高半部分,得到只含 g 和只含 r 的向量。

同样地,对相同的两个源向量执行 `punpckhdq`,得到 `{ a2a2a2a2 a1a1a1a1 b2b2b2b2 b1b1b1b1 }`。

所以每4个输入向量(产生8个输出向量),那就是:

  • 4x pshufb(全部使用相同的控制掩码)
  • 2x punpckh / l dq
  • 4x punpckh / l bw(或用pmovzxbw替换其中2个)

总共10条ALU指令,这还不包括为避免破坏仍需使用的数据而进行的任何寄存器复制。

这与掩码/移位/pack方法所需的32条指令相比非常有利。(而且在没有AVX的情况下,那种方法要以4种不同的方式对同一个向量做掩码/移位,还会涉及相当多的复制。)那32条指令中有8条是 `pack` 这类shuffle指令,因此它对shuffle端口的压力稍小一些,但代价是总指令数多得多。

Haswell只能在一个执行端口上执行shuffle,而且这个端口与移位指令使用的端口不同。(`_mm_and` 则可以在三个向量执行端口中的任何一个上运行。)尽管如此,我非常有信心10条shuffle指令的方法会有相当大的优势,因为可以有更多其他计算与之重叠执行。

`shufps` 可以作为一种从两个源向量取数据的shuffle,也许有用,但它只有32位粒度,所以我看不出它在这里的用途。在Intel SnB系列和AMD Bulldozer系列上,在整数向量指令之间使用它不会有任何惩罚。

另一个想法:

    __m128i rgba1 = _mm_loadu_si128((__m128i *)(pSrc));   // { a1.4 b1.4 g1.4 r1.4 ... a1.1 b1.1 g1.1 r1.1 }
    __m128i rgba2 = _mm_loadu_si128((__m128i *)(pSrc+4)); // { a2.4 b2.4 ... g2.1 r2.1 }
    __m128i rg1 = _mm_and_si128 (rgba1, _mm_set1_epi32(0xffff));
    __m128i rg2 = _mm_slli_epi32(rgba2, 16);
    __m128i rg_interleaved = _mm_or_si128(rg2, rg1);      // { g2.4 r2.4 g1.4 r1.4 ... g2.1 r2.1 g1.1 r1.1 }

然后再用一次 `_mm_and_si128` 和一次16位右移,把 `rg_interleaved` 分离成零扩展的16位 r 向量和 g 向量。