Question

在关于将RGB转换为RGBA和ARGB转换为BGR之前的一些问题的后续内容中，我想通过 SSE 加快 RGB到BGRA 的转换。假设有32位机器，并且想使用内在函数。我很难将源缓冲区和目标缓冲区对齐以使用128位寄存器，并寻求其他精明的矢量化解决方案。

要矢量化的例程如下......

    void RGB8ToBGRX8(int w, const void *in, void *out)
    {
        int i;
        int width = w;
        const unsigned char *src= (const unsigned char*) in;
        unsigned int *dst= (unsigned int*) out;
        unsigned int invalue, outvalue;

        for (i=0; i<width; i++, src+=3, dst++)
        {
                invalue = src[0];
                outvalue = (invalue<<16);
                invalue = src[1];
                outvalue |= (invalue<<8);
                invalue = src[2];
                outvalue |= (invalue);
                *dst = outvalue | 0xff000000;
        }
      }

这个例程主要用于大纹理（512KB），所以如果我可以并行化一些操作，那么一次处理更多像素可能是有益的。当然，我需要介绍一下。：）

编辑：

我的编译论据......

gcc -O2 main.c

Answer 1

这是使用SSSE3内在函数执行请求的操作的示例。输入和输出指针必须是16字节对齐的，并且它一次在16个像素的块上运行。

#include <tmmintrin.h>

/* in and out must be 16-byte aligned */
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    const __m128i *in_vec = in;
    __m128i *out_vec = out;

    w /= 16;

    while (w-- > 0) {
        /*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
         * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        __m128i in1, in2, in3;
        __m128i out;

        in1 = in_vec[0];

        out = _mm_shuffle_epi8(in1,
            _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 5, 0xff, 0, 1, 2));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[0] = out;

        in2 = in_vec[1];

        in1 = _mm_and_si128(in1,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in2,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in1);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 5, 6, 7, 0xff, 2, 3, 4, 0xff, 15, 0, 1, 0xff, 12, 13, 14));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[1] = out;

        in3 = in_vec[2];
        in_vec += 3;

        in2 = _mm_and_si128(in2,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in3,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in2);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 1, 2, 3, 0xff, 14, 15, 0, 0xff, 11, 12, 13, 0xff, 8, 9, 10));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[2] = out;

        out = _mm_shuffle_epi8(in3,
            _mm_set_epi8(0xff, 13, 14, 15, 0xff, 10, 11, 12, 0xff, 7, 8, 9, 0xff, 4, 5, 6));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[3] = out;

        out_vec += 4;
    }
}

Answer 2

我对你所要求的内容并不完全了解，我迫不及待地等待对你的问题作出适当的回应。与此同时，我提出的实施平均速度大约快8到10％。我正在运行Win7 64bit，使用VS2010，使用C ++进行编译以使用快速选项进行发布。

#pragma pack(push, 1)
    struct RGB {
        unsigned char r, g, b;
    };

    struct BGRA {
        unsigned char b, g, r, a;
    };
#pragma pack(pop)

    void RGB8ToBGRX8(int width, const void* in, void* out)
    {
        const RGB* src = (const RGB*)in;
        BGRA* dst = (BGRA*)out; 
        do {        
            dst->r = src->r;
            dst->g = src->g;
            dst->b = src->b;
            dst->a = 0xFF;
            src++;
            dst++;
        } while (--width);
    }

这可能会有所帮助，也可能没有帮助，但我希望如此。如果不这样做，请不要投票给我，我只是想把它移开。

我使用结构的动机是允许编译器尽可能有效地推进指针src和dst。另一个动机是限制算术运算的数量。

Answer 3

我个人发现实施以下内容给了我将BGR-24转换为ARGB-32的最佳结果。

此代码在图像上运行大约8.8ms，而上面提供的128位向量化代码以每个图像14.5ms运行。

void PixelFix(u_int32_t *buff,unsigned char *diskmem)
{
    int i,j;
    int picptr, srcptr;
    int w = 1920;
    int h = 1080;

    for (j=0; j<h; j++) {
        for (i=0; i<w; i++) {
            buff[picptr++]=(diskmem[srcptr]<<24) | (diskmem[srcptr+1]<<16) | diskmem[srcptr+2]<<8 | 0xff;
            srcptr+=3;
        }
    }
}

以前，我一直在使用这个例程（每张图片约13.2毫秒）。这里，buff是一个unsigned char *。

for (j=0; j<h; j++) {
    int srcptr = (h-j-1)*w*3;  // remove if you don't want vertical flipping
    for (i=0; i<w; i++) {
        buff[picptr+3]=diskmem[srcptr++]; // b
        buff[picptr+2]=diskmem[srcptr++]; // g
        buff[picptr+1]=diskmem[srcptr++]; // r
        buff[picptr+0]=255;               // a
        picptr+=4;
    }
}

运行2012 MacMini 2.6ghz / i7。

Answer 4

嗯...使用vImageConvert_RGB888toARGB8888非常快（15倍加速）。

高于PixelFix代码（每张图片大约6毫秒，现在在更新的硬件上）

6.373520 ms
6.383363 ms
6.413560 ms
6.278606 ms
6.293607 ms
6.368118 ms
6.338904 ms
6.389385 ms
6.365495 ms

使用vImageConvert_RGB888toARGB888，线程化（在较新的硬件上）

0.563649 ms
0.400387 ms
0.375198 ms
0.360898 ms
0.391278 ms
0.396797 ms
0.405534 ms
0.386495 ms
0.367621 ms

需要我说更多吗？

从RGB到BGRA的快速矢量化转换

4 个答案: