What would be an efficient way to optimize the following code with SSE?
/* Scalar reference: left-pack ("compress") the 16 source bytes whose
 * corresponding bit in the 16-bit mask `change1` is set.  Bit N of
 * change1 selects pSrc[N]; selected bytes are written contiguously to
 * pDest, which ends up advanced by popcount(change1).
 * ("..." placeholders are from the original question.) */
uint16_t change1= ... ;
uint8_t* pSrc = ... ;
uint8_t* pDest = ... ;
if(change1 & 0x0001) *pDest++ = pSrc[0];
if(change1 & 0x0002) *pDest++ = pSrc[1];
if(change1 & 0x0004) *pDest++ = pSrc[2];
if(change1 & 0x0008) *pDest++ = pSrc[3];
if(change1 & 0x0010) *pDest++ = pSrc[4];
if(change1 & 0x0020) *pDest++ = pSrc[5];
if(change1 & 0x0040) *pDest++ = pSrc[6];
if(change1 & 0x0080) *pDest++ = pSrc[7];
if(change1 & 0x0100) *pDest++ = pSrc[8];
if(change1 & 0x0200) *pDest++ = pSrc[9];
if(change1 & 0x0400) *pDest++ = pSrc[10];
if(change1 & 0x0800) *pDest++ = pSrc[11];
if(change1 & 0x1000) *pDest++ = pSrc[12];
if(change1 & 0x2000) *pDest++ = pSrc[13];
if(change1 & 0x4000) *pDest++ = pSrc[14];
if(change1 & 0x8000) *pDest++ = pSrc[15];
So far I am using a quite big lookup table for it, but I really want to get rid of it:
/* Current SSE solution: a 64K-entry LUT indexed by the full 16-bit mask.
 * Each entry holds a precomputed pshufb control mask plus the count of
 * selected bytes (the pointer advance).  NOTE: this is C++ (reference,
 * scoped names) despite the .c filename hint. */
SSE3Shuffle::Entry& e0 = SSE3Shuffle::g_Shuffle.m_Entries[change1];
// NOTE(review): *(__m128i*)pSrc is an aligned access -- presumably pSrc
// is 16-byte aligned here; otherwise _mm_loadu_si128 is needed.
_mm_storeu_si128((__m128i*)pDest, _mm_shuffle_epi8(*(__m128i*)pSrc, e0.mask));
pDest += e0.offset;
答案 0（得分：3）：
假设:
change1 = _mm_movemask_epi8(bytemask); // one bit per byte, taken from a vector compare result
offset = popcnt(change1);              // number of bytes actually written out
似乎1KiB表和两个shuffle仅比1MiB表和1个shuffle慢约10%。
*edited
/* 1 KiB-table variant: process the 16 bytes as two independent 8-byte
 * halves, each shuffled with a 64-bit pshufb control loaded from a
 * 128-entry table.  ("..." elides the remaining initializers; this is
 * not compilable as shown.) */
static const uint64_t table[128] __attribute__((aligned(64))) = {
0x0706050403020100, 0x0007060504030201, ..., 0x0605040302010700, 0x0605040302010007
};
const __m128i mask_01 = _mm_set1_epi8( 0x01 );
__m128i vector0 = _mm_loadu_si128((__m128i*)src);
/* bring source bytes 8..15 into the low half for the second shuffle */
__m128i vector1 = _mm_shuffle_epi32( vector0, 0x0E );
/* "???" is the original answer's placeholder for the value being
 * stripped (e.g. _mm_set1_epi8(' ') in a despacer). */
__m128i bytemask0 = _mm_cmpeq_epi8( ???, vector0); // detect bytes to omit
/* keep only 7 bits per half: the 128-entry table is indexed by the omit
 * pattern of 7 bytes.  NOTE(review): presumably each half's 8th byte is
 * handled implicitly by the table layout -- confirm against the repo. */
uint32_t bitmask0 = _mm_movemask_epi8(bytemask0) & 0x7F7F;
/* compare bytes are 0xFF (omit) / 0x00 (keep); adding 1 wraps omitted
 * bytes to 0, so SAD against zero yields the per-half kept-byte counts
 * in 64-bit lanes 0 and 1. */
__m128i hsum = _mm_sad_epu8(_mm_add_epi8(bytemask0, mask_01), _mm_setzero_si128());
vector0 = _mm_shuffle_epi8(vector0, _mm_loadl_epi64((__m128i*) &table[(uint8_t)bitmask0]));
_mm_storel_epi64((__m128i*)dst, vector0);
dst += (uint32_t)_mm_cvtsi128_si32(hsum); // advance by low-half kept count
vector1 = _mm_shuffle_epi8(vector1, _mm_loadl_epi64((__m128i*) &table[bitmask0 >> 8]));
_mm_storel_epi64((__m128i*)dst, vector1);
/* the high-half count is in the upper 64-bit lane; move it down first */
dst += (uint32_t)_mm_cvtsi128_si32(_mm_unpackhi_epi64(hsum, hsum));
通过前缀sums和bit twiddling生成shuffle掩码似乎大约是基于表格的方法速度的一半。
使用 AVX2，生成 shuffle 掩码的速度几乎与 LUT 方法相同。
/* AVX2 variant: derive the pshufb controls arithmetically from a small
 * in-register LUT (vpermd) instead of a memory table.  Processes 32
 * source bytes per iteration as four 8-byte groups.  NOTE(review):
 * sketch code from the answer; "???" is again the value being matched. */
const __m256i mask_index = _mm256_set1_epi64x( 0x80A0908884828180 );
const __m256i mask_shift = _mm256_set1_epi64x( 0x0000000008080808 );
const __m256i mask_invert = _mm256_set1_epi64x( 0x0020100800000000 );
const __m256i mask_fixup = _mm256_set_epi32(
0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707,
0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707
);
/* 8-entry dword LUT selected via _mm256_permutevar8x32_epi32; the
 * trailing comments show the packed patterns each entry expands to. */
const __m256i lut = _mm256_set_epi32(
0x04050607, // 0x03020100', 0x000000'07
0x04050704, // 0x030200'00, 0x0000'0704
0x04060705, // 0x030100'00, 0x0000'0705
0x04070504, // 0x0300'0000, 0x00'070504
0x05060706, // 0x020100'00, 0x0000'0706
0x05070604, // 0x0200'0000, 0x00'070604
0x06070605, // 0x0100'0000, 0x00'070605
0x07060504 // 0x00'000000, 0x'07060504
);
__m256i r0,r1,r2,r3,r4;
r0 = _mm256_loadu_si256(src++);
r1 = _mm256_cmpeq_epi8( ???, r0); // detect changes
r2 = _mm256_andnot_si256(r1, mask_index);
r1 = _mm256_and_si256(r1, mask_shift);
r2 = _mm256_sad_epu8(r2, mask_invert); // bitmap[0:5], popcnt[7:15]
r1 = _mm256_sad_epu8(r1, _mm256_setzero_si256()); // shift amount
r3 = _mm256_slli_epi64(r2, 29); // move hi index to 2nd dword
r4 = _mm256_srli_epi64(r2, 7); // popcnt
r2 = _mm256_or_si256(r2, r3);
r2 = _mm256_permutevar8x32_epi32(lut, r2);
r2 = _mm256_xor_si256(r2, mask_fixup);
r2 = _mm256_srlv_epi64(r2, r1);
r0 = _mm256_shuffle_epi8(r0, r2);
/* four overlapping 8-byte stores; each advances dst by that group's
 * kept-byte count from r4.  NOTE(review): _mm256_extract_epi64 requires
 * a compile-time-constant index and an x86-64 target. */
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 0);
dst += _mm256_extract_epi64(r4, 0);
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 1);
dst += _mm256_extract_epi64(r4, 1);
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 2);
dst += _mm256_extract_epi64(r4, 2);
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 3);
dst += _mm256_extract_epi64(r4, 3);
基准&代码:https://github.com/aqrit/despacer
答案 1（得分：2）：
如果愿意在 Haswell 及以后的平台上使用 BMI2，可以先用 pdep
从 uint64_t 中压缩掉不需要的半字节（nibble），然后再用 pext
将结果展开为 shuffle 掩码。
// Step 1 -- replicate mask to nibbles
uint64_t change4 = pdep(change1, 0x1111111111111111ULL) * 0x0F;
// Step 2 -- extract index from array of nibbles
uint64_t indices = pext(0xfedcba09876543210, change4);
// Step 3 -- interleave nibbles to octects
uint64_t high = pdep(indices >> 32ULL,0x0F0F0F0F0F0F0F0F);
uint64_t low = pdep(indices, 0x0F0F0F0F0F0F0F0FULL);
// Step 4 -- use these two masks to compress pSrc
__m128i compressed = _mm_shuffle_epi8(pSrc, _mm_set_epi64(high, low));
// Step 5 -- store 16 bytes unaligned
_mm_storeu_si128(pDst, compressed);
// Step 6 -- increment target pointer
pDst += __mm_popcnt(change1);
其他变体（基于累积和，或从 XX23456789abXXef 中排除 'X'（或零位））首先需要某种技术，把 uint16_t 中的各个位均匀地展开到 __m128i 中（即 movemask_epi8 的逆操作）。
然而,64k条目LUT可以分为顶部和底部:
/* Alternative: split the 64K LUT into two 256-entry halves and splice
 * the two 8-byte shuffle controls together at run time.  LUT256 and
 * addlut9 are not shown.  NOTE(review): sketch code -- see the flagged
 * issues below before using. */
int c = change1 & 0xff;
int p = __popcount(c); // NOTE(review): not a standard name -- presumably __builtin_popcount
uint64_t a = LUT256[c]; // low part of index
uint64_t b = LUT256[change1 >> 8]; // top part of index
b += addlut9[p]; // 0x0101010101010101 * p
// NOTE(review): top-half shuffle indices address source bytes 8..15, so
// a fixed +8 per byte (0x0808...) would be expected rather than a
// p-dependent offset -- confirm against the real addlut9 table.
// Then must concatenate b|a at pth position of 'a'
if (p < 8)
{
// NOTE(review): when p == 0 the shift below is by 64 bits -- undefined
// behavior in C; the shift amounts also look swapped (expected
// b << (8*p) into the low half and b >>= 8*(8-p)) -- verify.
a |= b << (8*(8-p));
b >>= 8*p;
}
// NOTE(review): _mm_set1_epi64 takes a single __m64 argument; the
// intended intrinsic is _mm_set_epi64x(b, a).  _mm_loadu_si128 also
// needs a (const __m128i*) cast of pSrc.
__m128i d = _mm_shuffle_epi8(_mm_loadu_si128(pSrc),_mm_set1_epi64(b,a));
// and continue with steps 5 and 6 as before