I'm trying to pack 10-bit pixels into a continuous byte stream using SIMD instructions. The code below works "in principle", but the SIMD version is slower than the scalar version.
The problem seems to be that I can't find good gather/scatter operations to load the registers efficiently.
Any suggestions on how to improve this?
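For reference, this is the byte layout one group of four samples is supposed to produce (hand-picked example values):
// 10-bit samples: a=0x3FF, b=0x155, c=0x2AA, d=0x0F0
// byte0 = a & 0xFF                       = 0xFF
// byte1 = (a >> 8) | ((b & 0x3F) << 2)   = 0x57
// byte2 = (b >> 6) | ((c & 0x0F) << 4)   = 0xA5
// byte3 = (c >> 4) | ((d & 0x03) << 6)   = 0x2A
// byte4 = d >> 2                         = 0x3C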
// SIMD_test.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>
// reference non-SIMD implementation that "works"
// 4 uint16 at a time as input, and 5 uint8 as output per loop iteration
void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4)
    {
        streamBuffer[0] = (uint8_t)(ptr[0]);
        streamBuffer[1] = (uint8_t)(((ptr[0]&0x3FF)>>8) | ((ptr[1]&0x3F) <<2));
        streamBuffer[2] = (uint8_t)(((ptr[1]&0x3FF)>>6) | ((ptr[2]&0x0F) <<4));
        streamBuffer[3] = (uint8_t)(((ptr[2]&0x3FF)>>4) | ((ptr[3]&0x03) <<6));
        streamBuffer[4] = (uint8_t)((ptr[3]&0x3FF)>>2) ;
        streamBuffer += 5;
        ptr += 4;
    }
}
// poorly written SIMD implementation. Attempts to do the same
// as the packSlow, but 8 iterations at a time
void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
    const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F);
    const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F);
    const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03);

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        _mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
    }

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        // this "fetch" stage is costly. Each term takes 2 cycles
        __m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
        __m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
        __m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
        __m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);

        // I think this part is fairly well optimized
        __m128i streamBuffer0 = ptr0;
        __m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr0 , maska), _mm_set_epi32(0, 0, 0,8)) , _mm_sll_epi16 (_mm_and_si128 (ptr1 , maskb) , _mm_set_epi32(0, 0, 0,2)));
        __m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr1 , maska), _mm_set_epi32(0, 0, 0,6)) , _mm_sll_epi16 (_mm_and_si128 (ptr2 , maskc) , _mm_set_epi32(0, 0, 0,4)));
        __m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr2 , maska), _mm_set_epi32(0, 0, 0,4)) , _mm_sll_epi16 (_mm_and_si128 (ptr3 , maskd) , _mm_set_epi32(0, 0, 0,6)));
        __m128i streamBuffer4 = _mm_srl_epi16 (_mm_and_si128 (ptr3 , maska), _mm_set_epi32(0, 0, 0,2)) ;

        // this again is terribly slow. ~2 cycles per byte output
        for(int j=15;j>=0;j-=2)
        {
            streamBuffer[0] = streamBuffer0.m128i_u8[j];
            streamBuffer[1] = streamBuffer1.m128i_u8[j];
            streamBuffer[2] = streamBuffer2.m128i_u8[j];
            streamBuffer[3] = streamBuffer3.m128i_u8[j];
            streamBuffer[4] = streamBuffer4.m128i_u8[j];
            streamBuffer += 5;
        }
        ptr += 32;
    }
}
int _tmain(int argc, _TCHAR* argv[])
{
    uint16_t pixels[512];
    uint8_t packed1[512*10/8];
    uint8_t packed2[512*10/8];
    for(int i=0;i<512;i++)
    {
        pixels[i] = i;
    }

    LARGE_INTEGER t0,t1,t2;
    QueryPerformanceCounter(&t0);
    for(int k=0;k<1000;k++) packSlow(pixels,packed1,512);
    QueryPerformanceCounter(&t1);
    for(int k=0;k<1000;k++) packFast(pixels,packed2,512);
    QueryPerformanceCounter(&t2);
    printf("%lld %lld\n",t1.QuadPart-t0.QuadPart,t2.QuadPart-t1.QuadPart);

    if (memcmp(packed1,packed2,sizeof(packed1)))
    {
        printf("failed\n");
    }
    return 0;
}
Answer 0 (score: 3)
On re-reading your code, it looks like you are almost certainly murdering your load/store unit, and even the new AVX2 VGATHER[D/Q]P[D/S] instruction family would not fully relieve that. Even Haswell's implementation still needs one uop per loaded element, each of which hits the L1D TLB and cache regardless of locality, and that does not improve before Skylake, ca. 2016 at the earliest.
Your best bet right now is to do 16B register reads and build the streamBuffer values by hand using register copies, _mm_shuffle_epi8() calls and _mm_or_si128() calls, followed by ordinary finishing stores.
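A minimal sketch of that building block (my own illustration, not a complete 10-bit packer): two plain 16B loads feed the registers, and _mm_shuffle_epi8() with zeroing control bytes (0x80) places the bytes you want so that a single _mm_or_si128() merges them.
#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8
#include <stdint.h>

// Illustration only: gather the low byte of each 16-bit lane of 'lo' into the
// low half of the result and the low byte of each lane of 'hi' into the high
// half. A control byte with its top bit set (0x80) zeroes that output byte,
// so the two shuffles can simply be OR-ed together.
static inline __m128i gather_low_bytes(__m128i lo, __m128i hi)
{
    const __m128i pick_lo = _mm_set_epi8(
        (char)0x80, (char)0x80, (char)0x80, (char)0x80,
        (char)0x80, (char)0x80, (char)0x80, (char)0x80,
        14, 12, 10, 8, 6, 4, 2, 0);
    const __m128i pick_hi = _mm_set_epi8(
        14, 12, 10, 8, 6, 4, 2, 0,
        (char)0x80, (char)0x80, (char)0x80, (char)0x80,
        (char)0x80, (char)0x80, (char)0x80, (char)0x80);
    return _mm_or_si128(_mm_shuffle_epi8(lo, pick_lo),
                        _mm_shuffle_epi8(hi, pick_hi));
}
The real packer still needs the shift/mask stage on top of this, but the load side becomes two contiguous 16B reads instead of the per-element _mm_set_epi16() gathers, and the store side becomes one 16B store instead of a byte-by-byte loop.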
In the near future, AVX2 will provide (and on newer desktops already provides) the VPS[LL/RL/RA]V[D/Q] instruction family, which allows variable per-element shifts; combined with a horizontal add, that could do this packing very quickly. In that case you could load the values with plain MOVDQU instructions, since you would be processing contiguous uint16_t input values in a single xmm register.
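For reference, the per-element shift itself looks like this (AVX2 sketch with arbitrary demo counts; the real packing would still need masking and a merge step, and a field shifted across a 32-bit lane boundary needs the 64-bit VPSLLVQ form instead):
#include <immintrin.h>  // AVX2

// Sketch only: VPSLLVD shifts every 32-bit lane of 'v' by its own count taken
// from the matching lane of 'counts'. One instruction replaces the chain of
// mask + fixed-shift + OR steps that an SSE2 version needs for unequal shifts.
static inline __m256i variable_shift_demo(__m256i v)
{
    const __m256i counts = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);  // demo values
    return _mm256_sllv_epi32(v, counts);
}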
Also, consider reworking the prefetching. Your j-over-NCOL loop processes 64B, i.e. one cache line, per iteration, so you should probably issue a single prefetch for ptr + 32 at the start of the second loop's body. You might even consider omitting it altogether, since this is a simple forward scan that the hardware prefetcher will detect and handle automatically after very few iterations.
Answer 1 (score: 3)
I have no dedicated SSE experience, but I would try to optimize the code as follows.
// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.
void packSlow1(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4*4)
    {
        uint64_t *dst;
        uint64_t src[4][4];

        // __m128i s01 = _mm_set_epi64(ptr[0], ptr[1]);
        // __m128i s23 = _mm_set_epi64(ptr[2], ptr[3]);
        // ---- or ----
        // __m128i s0123 = _mm_load_si128(ptr[0])
        // __m128i s01 = _?????_(s0123) // some instruction to extract s01 from s0123
        // __m128i s23 = _?????_(s0123) // some instruction to extract s23

        src[0][0] = ptr[0] & 0x3ff;
        src[0][1] = ptr[1] & 0x3ff;
        src[0][2] = ptr[2] & 0x3ff;
        src[0][3] = ptr[3] & 0x3ff;

        src[1][0] = ptr[4] & 0x3ff;
        src[1][1] = ptr[5] & 0x3ff;
        src[1][2] = ptr[6] & 0x3ff;
        src[1][3] = ptr[7] & 0x3ff;

        src[2][0] = ptr[8] & 0x3ff;
        src[2][1] = ptr[9] & 0x3ff;
        src[2][2] = ptr[10] & 0x3ff;
        src[2][3] = ptr[11] & 0x3ff;

        src[3][0] = ptr[12] & 0x3ff;
        src[3][1] = ptr[13] & 0x3ff;
        src[3][2] = ptr[14] & 0x3ff;
        src[3][3] = ptr[15] & 0x3ff;

        // looks like _mm_maskmoveu_si128 can store result efficiently
        dst = (uint64_t*)streamBuffer;
        dst[0] = src[0][0] | (src[0][1] << 10) | (src[0][2] << 20) | (src[0][3] << 30);

        dst = (uint64_t*)(streamBuffer + 5);
        dst[0] = src[1][0] | (src[1][1] << 10) | (src[1][2] << 20) | (src[1][3] << 30);

        dst = (uint64_t*)(streamBuffer + 10);
        dst[0] = src[2][0] | (src[2][1] << 10) | (src[2][2] << 20) | (src[2][3] << 30);

        dst = (uint64_t*)(streamBuffer + 15);
        dst[0] = src[3][0] | (src[3][1] << 10) | (src[3][2] << 20) | (src[3][3] << 30);

        streamBuffer += 5 * 4;
        ptr += 4 * 4;
    }
}
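To see why the overlapping 5-byte-stride stores above are safe: each group of four 10-bit samples fills only the low 40 bits of the 64-bit word, the top three bytes are written as zeros, and the next store at +5 overwrites exactly those three bytes. With hand-picked example values:
// src = {0x3FF, 0x155, 0x2AA, 0x0F0}
// dst[0] = 0x3FF | (0x155 << 10) | (0x2AA << 20) | (0x0F0 << 30)
//        = 0x0000003C2AA557FF
// bytes in memory (little-endian): FF 57 A5 2A 3C 00 00 00
// The first five bytes are packed payload; the three trailing zero bytes are
// scratch and get overwritten by the next 64-bit store at streamBuffer+5.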
Update:
Benchmarks:
Ubuntu 12.04, x86_64 GNU/Linux, gcc v4.6.3 (Virtual Box)
Intel Core i7 (Macbook pro)
compiled with -O3
5717633386 (1X): packSlow
3868744491 (1.4X): packSlow1 (version from the post)
4471858853 (1.2X): packFast2 (from Mark Lakata's post)
1820784764 (3.1X): packFast3 (version from the post)
Windows 8.1, x64, VS2012 Express
Intel Core i5 (Asus)
compiled with standard 'Release' options and SSE2 enabled
00413185 (1X) packSlow
00782005 (0.5X) packSlow1
00236639 (1.7X) packFast2
00148906 (2.8X) packFast3
I see completely different results on an Asus laptop with Windows 8.1 and VS Express 2012 (code compiled with -O2). packSlow1 is 2x slower than the original packSlow, while packFast2 is only 1.7x faster than packSlow (not 2.9x). After digging into it I understood the reason: the VC compiler could not keep all of packFast2's constants in XMM registers, so it inserted extra memory accesses into the loop (see the generated assembly). Those slow memory accesses explain the performance drop.
To get more stable results I increased the pixel buffer to 256x512 samples and raised the loop counter from 1000 to 10000000/256.
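Roughly like this (a sketch of the adjusted harness; the names are mine and the timing calls are omitted):
#include <vector>
#include <stdint.h>

void packFast3(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL);  // defined below

// Sketch of the adjusted benchmark: 256*512 samples, 10000000/256 iterations.
void benchmark_packFast3()
{
    const uint32_t NSAMPLES = 256 * 512;
    std::vector<uint16_t> pixels(NSAMPLES);
    std::vector<uint8_t>  packed(NSAMPLES * 10 / 8 + 6);  // +6 spare bytes for the final 16B store
    for (uint32_t i = 0; i < NSAMPLES; i++) pixels[i] = (uint16_t)(i & 0x3FF);
    for (int k = 0; k < 10000000 / 256; k++)
        packFast3(&pixels[0], &packed[0], NSAMPLES);
}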
Here is my SSE-optimized version of the function.
// warning. This routine requires streamBuffer to have at least 6 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+6 bytes of allocated memory if
// 512 10-bit samples are output.
void packFast3(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i m0 = _mm_set_epi16(0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF);
    const __m128i m1 = _mm_set_epi16(0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0);
    const __m128i m2 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
    const __m128i m3 = _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
    const __m128i m4 = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
    const __m128i m5 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0);

    __m128i s0, t0, r0, x0, x1;

    // unrolled and normal loop gives the same result
    for(uint32_t j=0;j<NCOL;j+=8)
    {
        // load 8 samples into s0
        s0 = _mm_loadu_si128((__m128i*)ptr);            // s0=00070006_00050004_00030002_00010000

        // join 16-bit samples into 32-bit words
        x0 = _mm_and_si128(s0, m0);                     // x0=00000006_00000004_00000002_00000000
        x1 = _mm_and_si128(s0, m1);                     // x1=00070000_00050000_00030000_00010000
        t0 = _mm_or_si128(x0, _mm_srli_epi32(x1, 6));   // t0=00001c06_00001404_00000c02_00000400

        // join 32-bit words into 64-bit dwords
        x0 = _mm_and_si128(t0, m2);                     // x0=00000000_00001404_00000000_00000400
        x1 = _mm_and_si128(t0, m3);                     // x1=00001c06_00000000_00000c02_00000000
        t0 = _mm_or_si128(x0, _mm_srli_epi64(x1, 12));  // t0=00000001_c0601404_00000000_c0200400

        // join 64-bit dwords
        x0 = _mm_and_si128(t0, m4);                     // x0=00000000_00000000_00000000_c0200400
        x1 = _mm_and_si128(t0, m5);                     // x1=00000001_c0601404_00000000_00000000
        r0 = _mm_or_si128(x0, _mm_srli_si128(x1, 3));   // r0=00000000_000001c0_60140400_c0200400

        // and store result
        _mm_storeu_si128((__m128i*)streamBuffer, r0);

        streamBuffer += 10;
        ptr += 8;
    }
}
Answer 2 (score: 2)
I came up with a "better" solution using SIMD, but it doesn't exploit parallelization, just more efficient loads and stores (I think).
I'm posting it here for reference, not necessarily as the best answer.
The benchmarks are (in arbitrary ticks):
gcc4.8.1 -O3   VS2012 /O2   Implementation
------------------------------------------
369 (1X)       3394 (1X)    packSlow (original code)
212 (1.7X)     2010 (1.7X)  packSlow (from @alexander)
147 (2.5X)     1178 (2.9X)  packFast2 (below)
Here is the code. It is basically @alexander's code, except it uses 128-bit registers instead of 64-bit ones and is unrolled 2x instead of 4x.
void packFast2(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
    const __m128i mask0 = _mm_set_epi16(0,0,0,0,0,0,0,0x3FF);
    const __m128i mask1 = _mm_set_epi16(0,0,0,0,0,0,0x3FF,0);
    const __m128i mask2 = _mm_set_epi16(0,0,0,0,0,0x3FF,0,0);
    const __m128i mask3 = _mm_set_epi16(0,0,0,0,0x3FF,0,0,0);
    const __m128i mask4 = _mm_set_epi16(0,0,0,0x3FF,0,0,0,0);
    const __m128i mask5 = _mm_set_epi16(0,0,0x3FF,0,0,0,0,0);
    const __m128i mask6 = _mm_set_epi16(0,0x3FF,0,0,0,0,0,0);
    const __m128i mask7 = _mm_set_epi16(0x3FF,0,0,0,0,0,0,0);

    for(uint32_t j=0;j<NCOL;j+=16)
    {
        __m128i s  = _mm_load_si128((__m128i*)ptr);      // load 8 16 bit values
        __m128i s2 = _mm_load_si128((__m128i*)(ptr+8));  // load 8 16 bit values

        __m128i a = _mm_and_si128(s,mask0);
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask1),6));
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask2),12));
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask3),18));
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_and_si128(s, mask4),24/8)); // special shift 24 bits to the right, straddling the middle. luckily it needs just one whole-byte 128-bit shift (24/8 = 3 bytes)
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask5),6),24/8)); // special: shift a net 30 bits (first 6 bits, then 3 bytes).
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask6),4),32/8)); // special: shift a net 36 bits (first 4 bits, then 4 bytes = 32 bits).
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask7),42));
        _mm_storeu_si128((__m128i*)streamBuffer, a);

        __m128i a2 = _mm_and_si128(s2,mask0);
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask1),6));
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask2),12));
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask3),18));
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_and_si128(s2, mask4),24/8)); // special shift 24 bits to the right, straddling the middle. luckily it needs just one whole-byte 128-bit shift (24/8 = 3 bytes)
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask5),6),24/8)); // special: shift a net 30 bits (first 6 bits, then 3 bytes).
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask6),4),32/8)); // special: shift a net 36 bits (first 4 bits, then 4 bytes = 32 bits).
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask7),42));
        _mm_storeu_si128((__m128i*)(streamBuffer+10), a2);

        streamBuffer += 20 ;
        ptr += 16 ;
    }
}
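A quick way to sanity-check it against the scalar reference (a sketch reusing the functions above; packFast2 uses the aligned _mm_load_si128, so the input is allocated 16-byte aligned, and its last unaligned 16B store runs 6 bytes past the 640 payload bytes, hence the padded output buffer):
#include <emmintrin.h>   // _mm_malloc / _mm_free (MSVC may also need <malloc.h>)
#include <string.h>
#include <stdint.h>

void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL);   // from the question
void packFast2(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL);  // from this answer

// Returns 1 if the SIMD output matches the scalar reference, 0 otherwise.
int check_packFast2(void)
{
    uint16_t* pixels = (uint16_t*)_mm_malloc(512 * sizeof(uint16_t), 16);
    uint8_t packed_ref[512 * 10 / 8];
    uint8_t packed_simd[512 * 10 / 8 + 6];
    for (int i = 0; i < 512; i++) pixels[i] = (uint16_t)i;
    packSlow(pixels, packed_ref, 512);
    packFast2(pixels, packed_simd, 512);
    int ok = memcmp(packed_ref, packed_simd, sizeof(packed_ref)) == 0;
    _mm_free(pixels);
    return ok;
}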