我正在尝试编写一个启用SSE的alpha合成器,这就是我想出的。首先是将两个各含4个像素的矢量进行混合的代码:
// alpha blend two 128-bit (16 byte) SSE vectors containing 4 pre-multiplied ARGB values each
//
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i here with _mm_set_si128 was _very_ slow, compiler doesn't seem
// to know it can store this as a const, so it had guard variables and did real static initialization,
// stick with arrays.
//
static const uint64_t allo[2] __attribute__((aligned(16))) = { 0x03ff03ff03ff03ff, 0x07ff07ff07ff07ff };
static const uint64_t alhi[2] __attribute__((aligned(16))) = { 0x0bff0bff0bff0bff, 0x0fff0fff0fff0fff };
static const uint64_t m255[2] __attribute__((aligned(16))) = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };
// replicate top two pixels from under
__m128i underhi = (__m128i)_mm_movehl_ps(
(__m128)under,
(__m128)under
);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_cvtepu8_epi16(underhi);
__m128i al8_0 = _mm_shuffle_epi8 (over, *(__m128i*)&allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, *(__m128i*)&alhi);
__m128i mal_0 = _mm_sub_epi8 (*(__m128i*)&m255, al8_0); // compute 255-alpha
__m128i mal_1 = _mm_sub_epi8 (*(__m128i*)&m255, al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
其次是包装器,它展开多次像素操作,并把加载/存储集中到一起。每次迭代约32像素似乎是最佳选择:
// Blend N vectors (4 pixels each) in a load-all / blend-all / store-all pattern,
// so independent operations can overlap. N is a template parameter to encourage
// the compiler to fully unroll the three phases. When `single` is true, the first
// vector of `pover` is broadcast over every blend (constant-color fill).
//
template <ssize_t N>
__attribute__((always_inline, optimize("unroll-loops")))
static inline void blendN(__m128i *dst, const __m128i *punder, const __m128i *pover, bool single=false) {
    __m128i src[N];   // the "under" vectors
    __m128i top[N];   // the "over" vectors
    const __m128i broadcast = _mm_loadu_si128(pover);
    // phase 1: pull every input into registers before doing any work
    for (ssize_t k = 0; k < N; ++k) {
        src[k] = _mm_loadu_si128(punder + k);
        top[k] = single ? broadcast : _mm_loadu_si128(pover + k);
    }
    // phase 2: run all N blends back to back
    for (ssize_t k = 0; k < N; ++k) {
        src[k] = blend4(src[k], top[k]);
    }
    // phase 3: write all results out
    for (ssize_t k = 0; k < N; ++k) {
        _mm_storeu_si128(dst + k, src[k]);
    }
}
这样调用:
// blend 32/16/8/4 pixels at a time
//
// Each pass halves the block size. The `ii *= 2` in each for-init rescales the
// index from the previous pass's (twice as large) block units into this pass's
// units, so every pass resumes exactly where the wider one stopped. The first
// `ii *= 2` is a no-op (ii == 0), kept for symmetry.
// NOTE(review): assumes vdst/vunder/vover are __m128i* views of the same pixel
// buffers as pdst/punder/pover and `len` is the total pixel count — confirm
// against the surrounding code.
ssize_t ii=0;
for (ii *= 2; ii < len/32; ii++) { blendN<8>(vdst+8*ii, vunder+8*ii, vover+8*ii); }
for (ii *= 2; ii < len/16; ii++) { blendN<4>(vdst+4*ii, vunder+4*ii, vover+4*ii); }
for (ii *= 2; ii < len/8; ii++) { blendN<2>(vdst+2*ii, vunder+2*ii, vover+2*ii); }
for (ii *= 2; ii < len/4; ii++) { blendN<1>(vdst+1*ii, vunder+1*ii, vover+1*ii); }
// handle remainder
// rescale ii from 4-pixel vector units to single pixels; `blend` is the
// scalar one-pixel blend defined elsewhere in the file
ii *= 4;
for (; ii < len; ii++) {
*(pdst+ii) = blend(*(punder+ii), *(pover+ii));
}
使用它,我可以在i7-2600K上获得约2.5条指令/周期的吞吐量。好奇是否有人能就我的SIMD代码提出改进建议。
编辑:这是与Peter Cordes交谈后的一些更新代码。
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
// shuffle masks for alpha and 255 vector for 255-alpha
//
// NOTE: storing static __m128i is _very_ slow, compiler doesn't seem to know it can store
// this as a const, so it had guard variables and did real static initialization. Stick with
// just const
//
const __m128i allo = (__m128i)_mm_setr_epi32(0x03ff03ff, 0x03ff03ff, 0x07ff07ff, 0x07ff07ff);
const __m128i alhi = (__m128i)_mm_setr_epi32(0x0bff0bff, 0x0bff0bff, 0x0fff0fff, 0x0fff0fff);
const __m128i zero = (__m128i)_mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128 m255 = (__m128 )_mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i u16_0 = _mm_cvtepu8_epi16(under); // convert 8-bit fields to 16-bit with zero extension
__m128i u16_1 = _mm_unpackhi_epi8(under, zero);
__m128i al8_0 = _mm_shuffle_epi8 (over, allo); // replicate (alpha << 8) to each field
__m128i al8_1 = _mm_shuffle_epi8 (over, alhi);
__m128i mal_0 = (__m128i)_mm_xor_ps(m255, (__m128)al8_0); // compute 255-alpha
__m128i mal_1 = (__m128i)_mm_xor_ps(m255, (__m128)al8_1);
__m128i mul_0 = _mm_mulhi_epu16 (u16_0, mal_0); // under*(255-over.alpha)
__m128i mul_1 = _mm_mulhi_epu16 (u16_1, mal_1);
__m128i pixel = _mm_packus_epi16 (mul_0, mul_1);
// add to background pixel with saturation
return _mm_adds_epi8(over, pixel);
}
最大的变化是使用unpackhi而不是cvtepu8来把under像素的高8个字节扩展为16位,然后使用xor而不是减法来计算255-alpha:xor可以在更多的端口上运行,而减法不能。在我的i7-2600K上,这可以达到约22亿像素/秒的混合速度,看起来已经足够了。
答案 0 :(得分:0)
不是您问题的直接答案,但这对于发表评论来说太长了,也许对某人有用。
把alpha排列(shuffle)到每个16位通道上半部分的技巧非常巧妙:它让你可以用 _mm_mulhi_epu16 使乘积恰好落在通道的低位。我的问题略有不同,因为我的alpha不是预乘的,而且我需要能够为整个纹理指定一个不透明度。我把代码扩展成了以下内容:
__m128i blend4(__m128i under, __m128i over, float opacity) {
const __m128i alpha16 = _mm_set1_epi16(alpha * 255);
const __m128i allo = (__m128i) _mm_setr_epi32(0xff03ff03, 0xff03ff03, 0xff07ff07, 0x0ff7ff07);
const __m128i alhi = (__m128i) _mm_setr_epi32(0xff0bff0b, 0xff0bff0b, 0xff0fff0f, 0x0fffff0f);
const __m128i zero = (__m128i) _mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m128i i255 = (__m128i) _mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);
__m128i under0 = _mm_cvtepu8_epi16(under);
__m128i under1 = _mm_unpackhi_epi8(under, zero);
__m128i over0 = _mm_cvtepu8_epi16(over);
__m128i over1 = _mm_unpackhi_epi8(over, zero);
__m128i alpha0 = _mm_mullo_epi16(_mm_shuffle_epi8(over, allo), alpha16);
__m128i alpha1 = _mm_mullo_epi16(_mm_shuffle_epi8(over, alhi), alpha16);
__m128i invAlpha0 = _mm_xor_si128(i255, alpha0);
__m128i invAlpha1 = _mm_xor_si128(i255, alpha1);
__m128i underMul0 = _mm_mulhi_epu16(under0, invAlpha0);
__m128i underMul1 = _mm_mulhi_epu16(under1, invAlpha1);
__m128i overMul0 = _mm_mulhi_epu16(over0, alpha0);
__m128i overMul1 = _mm_mulhi_epu16(over1, alpha1);
__m128i underFinal = _mm_packus_epi16(underMul0, underMul1);
__m128i overFinal = _mm_packus_epi16(overMul0, overMul1);
return _mm_adds_epu8(overFinal, underFinal);
}
我一开始把alpha排列到每个通道的下半部分,这样在与 alpha16 相乘之后,乘积的高位正好落在每个通道的上半部分,于是 _mm_mulhi_epu16 的技巧照常起作用。其余部分只是普通的alpha混合运算。