Question

延迟是这里最关心的问题。我发现尝试通过OpenGL渲染3个带有RGBA叠加的1920x1080视频输入到各个窗口有限制。我可以渲染两个带覆盖的窗口或3个没有覆盖的窗口，但是当引入第三个窗口时，渲染停顿是显而易见的。我认为这个问题是由于在RGB视频纹理上过度使用glAlphaFunc（）到基于叠加和RGBA的纹理。为了减少过度使用，我的想法是将一些叠加功能移动到CPU中（因为我有很多CPU - 双十六进制Xeon）。这样做的理想位置是将源RGB图像复制到映射的PBO并用RGBA覆盖中的RGB值替换RGB值，其中A> 1。 0.

我尝试使用英特尔IPP方法，但没有可用的方法不涉及多个调用，导致延迟太多。我尝试过直接的C代码，但这比我允许的33毫秒要长。我需要帮助创建一个优化的程序集或基于SSE的例程，以提供最小的延迟。

使用＆gt;编译以下代码g ++ -fopenmp -O2 -mtune = native

为了清晰起见，

基本C功能：

void copyAndOverlay(const uint8_t* aSourceRGB, const uint8_t* aOverlayRGBA, uint8_t* aDestinationRGB, int aWidth, int aHeight) {
    int i;
#pragma omp parallel for
    for (i=0; i<aWidth*aHeight; ++i) {
        if (0 == aOverlayRGBA[i*4+3]) {
           aDestinationRGB[i*3] = aSourceRGB[i*3]; // R
           aDestinationRGB[i*3+1] = aSourceRGB[i*3+1]; // G
           aDestinationRGB[i*3+2] = aSourceRGB[i*3+2]; // B
        } else {
           aDestinationRGB[i*3] = aOverlayRGBA[i*4]; // R
           aDestinationRGB[i*3+1] = aOverlayRGBA[i*4+1]; // G
           aDestinationRGB[i*3+2] = aOverlayRGBA[i*4+2]; // B
        }
    }
}

uint64_t getTime() {
  struct timeval tNow;
  gettimeofday(&tNow, NULL);
  return (uint64_t)tNow.tv_sec * 1000000 + (uint64_t)tNow.tv_usec;
}

int main(int argc, char **argv) {
  int pixels = _WIDTH_ * _HEIGHT_ * 3;
  uint8_t *rgba = new uint8_t[_WIDTH_ * _HEIGHT_ * 4];
  uint8_t *src = new uint8_t[pixels];
  uint8_t *dst = new uint8_t[pixels];

  uint64_t tStart = getTime();

  for (int t=0; t<1000; ++t) {
    copyAndOverlay(src, rgba, dst, _WIDTH_, _HEIGHT_);
  }

  printf("delta: %lu\n", (getTime() - tStart) / 1000);

  delete [] rgba;
  delete [] src;
  delete [] dst; 

  return 0;
}

Answer 1

这是一个SSE4实现，比你在问题中发布的代码快了不到5倍（没有循环的并行化）。如上所述，它仅适用于16字节对齐并且大小为64的倍数的RGBA缓冲区，以及16字节对齐并且大小为48的倍数的RGB缓冲区。尺寸将要求与您的1920x1080分辨率完美匹配，并且您可能需要添加代码以确保缓冲区是16字节对齐的。

void copyAndOverlay(const uint8_t* aSourceRGB, const uint8_t* aOverlayRGBA, uint8_t* aDestinationRGB, int aWidth, int aHeight) {
  __m128i const ocmp     = _mm_setzero_si128();
  __m128i const omskshf1 = _mm_set_epi32(0x00000000, 0x0F0F0F0B, 0x0B0B0707, 0x07030303);
  __m128i const omskshf2 = _mm_set_epi32(0x07030303, 0x00000000, 0x0F0F0F0B, 0x0B0B0707);
  __m128i const omskshf3 = _mm_set_epi32(0x0B0B0707, 0x07030303, 0x00000000, 0x0F0F0F0B);
  __m128i const omskshf4 = _mm_set_epi32(0x0F0F0F0B, 0x0B0B0707, 0x07030303, 0x00000000);
  __m128i const ovalshf1 = _mm_set_epi32(0x00000000, 0x0E0D0C0A, 0x09080605, 0x04020100);
  __m128i const ovalshf2 = _mm_set_epi32(0x04020100, 0x00000000, 0x0E0D0C0A, 0x09080605);
  __m128i const ovalshf3 = _mm_set_epi32(0x09080605, 0x04020100, 0x00000000, 0x0E0D0C0A);
  __m128i const ovalshf4 = _mm_set_epi32(0x0E0D0C0A, 0x09080605, 0x04020100, 0x00000000);
  __m128i const blndmsk1 = _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000);
  __m128i const blndmsk2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
  __m128i const blndmsk3 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
  __m128i       a, b, c, x, y, z, w, p, q, r, s;
  uint8_t const *const aSourceRGBPast = aSourceRGB + 3 * aWidth * aHeight;

  while (aSourceRGB != aSourceRGBPast) {
    // source:
    //  aaabbbcccdddeeef
    //  ffggghhhiiijjjkk
    //  klllmmmnnnoooppp
    //
    // overlay:
    //  aaaabbbbccccdddd
    //  eeeeffffgggghhhh
    //  iiiijjjjkkkkllll
    //  mmmmnnnnoooopppp

    // load source
    a = _mm_load_si128((__m128i const*)(aSourceRGB       ));
    b = _mm_load_si128((__m128i const*)(aSourceRGB   + 16));
    c = _mm_load_si128((__m128i const*)(aSourceRGB   + 32));

    // load overlay
    x = _mm_load_si128((__m128i const*)(aOverlayRGBA     ));
    y = _mm_load_si128((__m128i const*)(aOverlayRGBA + 16));
    z = _mm_load_si128((__m128i const*)(aOverlayRGBA + 32));
    w = _mm_load_si128((__m128i const*)(aOverlayRGBA + 48));

    // compute blend mask, put 0xFF in bytes equal to zero
    p = _mm_cmpeq_epi8(x, ocmp);
    q = _mm_cmpeq_epi8(y, ocmp);
    r = _mm_cmpeq_epi8(z, ocmp);
    s = _mm_cmpeq_epi8(w, ocmp);

    // align overlay to be condensed to 3-byte color
    x = _mm_shuffle_epi8(x, ovalshf1);
    y = _mm_shuffle_epi8(y, ovalshf2);
    z = _mm_shuffle_epi8(z, ovalshf3);
    w = _mm_shuffle_epi8(w, ovalshf4);

    // condense overlay to 3-btye color
    x = _mm_blendv_epi8(x, y, blndmsk1);
    y = _mm_blendv_epi8(y, z, blndmsk2);
    z = _mm_blendv_epi8(z, w, blndmsk3);

    // align blend mask to be condensed to 3-byte color
    p = _mm_shuffle_epi8(p, omskshf1);
    q = _mm_shuffle_epi8(q, omskshf2);
    r = _mm_shuffle_epi8(r, omskshf3);
    s = _mm_shuffle_epi8(s, omskshf4);

    // condense blend mask to 3-btye color
    p = _mm_blendv_epi8(p, q, blndmsk1);
    q = _mm_blendv_epi8(q, r, blndmsk2);
    r = _mm_blendv_epi8(r, s, blndmsk3);

    // select from overlay and source based on blend mask
    x = _mm_blendv_epi8(x, a, p);
    y = _mm_blendv_epi8(y, b, q);
    z = _mm_blendv_epi8(z, c, r);

    // write colors to destination
    _mm_store_si128((__m128i*)(aDestinationRGB     ), x);
    _mm_store_si128((__m128i*)(aDestinationRGB + 16), y);
    _mm_store_si128((__m128i*)(aDestinationRGB + 32), z);

    // update poniters
    aSourceRGB      += 48;
    aOverlayRGBA    += 64;
    aDestinationRGB += 48;
  }
}

使用RGB源和RGBA覆盖实现近乎实时的CPU功能，如glAlphaFunc（GL_GREATER）

1 个答案: