我正在尝试编写一个启用SSE的alpha合成器,这就是我想出的。首先是将两个各含4个像素的矢量进行混合的代码:
// alpha blend two 128-bit (16 byte) SSE vectors containing 4 pre-multiplied ARGB values each
//
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i here with _mm_set_si128 was _very_ slow, compiler doesn't seem
// to know it can store this as a const, so it had guard variables and did real static initialization,
// stick with arrays.
//
static const uint64_t allo[2] __attribute__((aligned(16))) = { 0x03ff03ff03ff03ff, 0x07ff07ff07ff07ff };
static const uint64_t alhi[2] __attribute__((aligned(16))) = { 0x0bff0bff0bff0bff, 0x0fff0fff0fff0fff };
static const uint64_t m255[2] __attribute__((aligned(16))) = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };
// replicate top two pixels from under
__m128i underhi = (__m128i)_mm_movehl_ps(
(__m128)under,
(__m128)under
);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_cvtepu8_epi16(underhi);
__m128i al8_0 = _mm_shuffle_epi8 (over, *(__m128i*)&allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, *(__m128i*)&alhi);
__m128i mal_0 = _mm_sub_epi8 (*(__m128i*)&m255, al8_0); // compute 255-alpha
__m128i mal_1 = _mm_sub_epi8 (*(__m128i*)&m255, al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
其次是包装器,它展开多次像素操作,并把加载/存储集中到一起。每次迭代约32像素似乎是最佳选择:
// Blend N vectors (4 pixels each) in a load-all / blend-all / store-all pattern,
// so independent operations can overlap. N is a template parameter to encourage
// the compiler to fully unroll the three phases. When `single` is true, the first
// vector of `pover` is broadcast over every blend (constant-color fill).
//
template <ssize_t N>
__attribute__((always_inline, optimize("unroll-loops")))
static inline void blendN(__m128i *dst, const __m128i *punder, const __m128i *pover, bool single=false) {
    __m128i src[N];   // the "under" vectors
    __m128i top[N];   // the "over" vectors
    const __m128i broadcast = _mm_loadu_si128(pover);
    // phase 1: pull every input into registers before doing any work
    for (ssize_t k = 0; k < N; ++k) {
        src[k] = _mm_loadu_si128(punder + k);
        top[k] = single ? broadcast : _mm_loadu_si128(pover + k);
    }
    // phase 2: run all N blends back to back
    for (ssize_t k = 0; k < N; ++k) {
        src[k] = blend4(src[k], top[k]);
    }
    // phase 3: write all results out
    for (ssize_t k = 0; k < N; ++k) {
        _mm_storeu_si128(dst + k, src[k]);
    }
}
这样调用:
// blend 32/16/8/4 pixels at a time
//
// Each pass halves the block size. The `ii *= 2` in each for-init rescales the
// index from the previous pass's (twice as large) block units into this pass's
// units, so every pass resumes exactly where the wider one stopped. The first
// `ii *= 2` is a no-op (ii == 0), kept for symmetry.
// NOTE(review): assumes vdst/vunder/vover are __m128i* views of the same pixel
// buffers as pdst/punder/pover and `len` is the total pixel count — confirm
// against the surrounding code.
ssize_t ii=0;
for (ii *= 2; ii < len/32; ii++) { blendN<8>(vdst+8*ii, vunder+8*ii, vover+8*ii); }
for (ii *= 2; ii < len/16; ii++) { blendN<4>(vdst+4*ii, vunder+4*ii, vover+4*ii); }
for (ii *= 2; ii < len/8; ii++) { blendN<2>(vdst+2*ii, vunder+2*ii, vover+2*ii); }
for (ii *= 2; ii < len/4; ii++) { blendN<1>(vdst+1*ii, vunder+1*ii, vover+1*ii); }
// handle remainder
// rescale ii from 4-pixel vector units to single pixels; `blend` is the
// scalar one-pixel blend defined elsewhere in the file
ii *= 4;
for (; ii < len; ii++) {
*(pdst+ii) = blend(*(punder+ii), *(pover+ii));
}
使用它,我可以在i7-2600K上获得约2.5条指令/周期的吞吐量。好奇是否有人能就我的SIMD代码提出改进建议。
编辑:这是与Peter Cordes交谈后的一些更新代码。
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i is _very_ slow, compiler doesn't seem to know it can store
// this as a const, so it had guard variables and did real static initialization. Stick with
// just const
//
const __m128i allo = (__m128i)_mm_setr_epi32(0x03ff03ff, 0x03ff03ff, 0x07ff07ff, 0x07ff07ff);
const __m128i alhi = (__m128i)_mm_setr_epi32(0x0bff0bff, 0x0bff0bff, 0x0fff0fff, 0x0fff0fff);
const __m128i zero = (__m128i)_mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128 m255 = (__m128 )_mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_unpackhi_epi8(under, zero);
__m128i al8_0 = _mm_shuffle_epi8 (over, allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, alhi);
__m128i mal_0 = (__m128i)_mm_xor_ps(m255, (__m128)al8_0); // compute 255-alpha
__m128i mal_1 = (__m128i)_mm_xor_ps(m255, (__m128)al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
最大的变化是使用unpackhi而不是cvtepu8来把under像素的高8个字节扩展为16位,然后使用xor而不是减法来计算255-alpha:xor可以在更多的端口上运行,而减法不能。在我的i7-2600K上,这可以达到约22亿像素/秒的混合速度,看起来已经足够了。
答案 0 :(得分:0)
不是您问题的直接答案,但这对于发表评论来说太长了,也许对某人有用。
把alpha排列(shuffle)到每个16位通道上半部分的技巧非常巧妙:它让你可以用 _mm_mulhi_epu16 使乘积恰好落在通道的低位。我的问题略有不同,因为我的alpha不是预乘的,而且我需要能够为整个纹理指定一个不透明度。我把代码扩展成了以下内容:
__m128i blend4(__m128i under, __m128i over, float opacity) {
const __m128i alpha16 = _mm_set1_epi16(alpha * 255);
const __m128i allo = (__m128i) _mm_setr_epi32(0xff03ff03, 0xff03ff03, 0xff07ff07, 0x0ff7ff07);
const __m128i alhi = (__m128i) _mm_setr_epi32(0xff0bff0b, 0xff0bff0b, 0xff0fff0f, 0x0fffff0f);
const __m128i zero = (__m128i) _mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128i i255 = (__m128i) _mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i under0 = _mm_cvtepu8_epi16(under);
__m128i under1 = _mm_unpackhi_epi8(under, zero);
__m128i over0 = _mm_cvtepu8_epi16(over);
__m128i over1 = _mm_unpackhi_epi8(over, zero);
__m128i alpha0 = _mm_mullo_epi16(_mm_shuffle_epi8(over, allo), alpha16);
__m128i alpha1 = _mm_mullo_epi16(_mm_shuffle_epi8(over, alhi), alpha16);
__m128i invAlpha0 = _mm_xor_si128(i255, alpha0);
__m128i invAlpha1 = _mm_xor_si128(i255, alpha1);
__m128i underMul0 = _mm_mulhi_epu16(under0, invAlpha0);
__m128i underMul1 = _mm_mulhi_epu16(under1, invAlpha1);
__m128i overMul0 = _mm_mulhi_epu16(over0, alpha0);
__m128i overMul1 = _mm_mulhi_epu16(over1, alpha1);
__m128i underFinal = _mm_packus_epi16(underMul0, underMul1);
__m128i overFinal = _mm_packus_epi16(overMul0, overMul1);
return _mm_adds_epu8(overFinal, underFinal);
}
我一开始把alpha排列到每个通道的下半部分,这样在与 alpha16 相乘之后,乘积的高位正好落在每个通道的上半部分,于是 _mm_mulhi_epu16 的技巧照常起作用。其余部分只是普通的alpha混合运算。