I am trying to optimize a routine used in VLC that converts NV12 frames into YV12 frames.
For background: NV12 is identical to YV12 except that the U and V chroma planes are interleaved. So converting one format into the other is just a matter of deinterleaving one plane, e.g. UVUVUVUVUVUV becomes UUUUUU VVVVVV.
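For reference, the per-row deinterleave in plain C looks roughly like this (a minimal sketch with illustrative names, not the actual VLC code):

#include <stdint.h>
#include <stddef.h>

/* Split one interleaved UV row (UVUVUV...) into separate U and V rows. */
static void split_uv_row(uint8_t *dstu, uint8_t *dstv,
                         const uint8_t *src, size_t width)
{
    for (size_t x = 0; x < width; x++) {
        dstu[x] = src[2 * x + 0]; /* even bytes are U */
        dstv[x] = src[2 * x + 1]; /* odd bytes are V  */
    }
}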
Now, the main problem with this routine is that it requires a 16-byte-aligned memory cache as intermediate storage: the routine first deinterleaves the data into that cache (in chunks of up to 4 KiB) and then copies the result found in the cache back to the destination frame.
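Schematically, that two-pass scheme looks something like the following per row (a rough sketch only; the real routine is SIMD-optimized, works in chunks of up to 4 KiB, and handles pitches):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Pass 1: deinterleave into a 16-byte-aligned, cache-resident buffer.
 * Pass 2: copy the deinterleaved data out to the destination planes.
 * Each byte of data is therefore read twice and written twice in total
 * (source -> cache, cache -> destination). */
static void split_uv_via_cache(uint8_t *dstu, uint8_t *dstv,
                               const uint8_t *src, size_t width,
                               uint8_t *cache) /* aligned, >= 2 * width bytes */
{
    uint8_t *cache_u = cache;
    uint8_t *cache_v = cache + width;

    for (size_t x = 0; x < width; x++) {
        cache_u[x] = src[2 * x + 0];
        cache_v[x] = src[2 * x + 1];
    }
    memcpy(dstu, cache_u, width);
    memcpy(dstv, cache_v, width);
}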
I have rewritten the function so that it no longer needs the cache: it uses SSE2/SSSE3 instructions with unaligned memory accesses when required, and aligned accesses when possible.
The code is as follows:
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned width, unsigned height, unsigned cpu)
{
    VLC_UNUSED(cache);
    VLC_UNUSED(cache_size);
    /* SSSE3 path: pshufb control that gathers the even (U) bytes into the
     * low half and the odd (V) bytes into the high half of each register. */
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    /* SSE2 path: mask that keeps the low (U) byte of each 16-bit UV pair. */
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
    const bool aligned = ((uintptr_t)src & 0xf) == 0;
/* Load 64 source bytes (32 interleaved UV pairs), aligned / unaligned. */
#define LOAD64A \
    "movdqa 0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define LOAD64U \
    "movdqu 0(%[src]), %%xmm0\n" \
    "movdqu 16(%[src]), %%xmm1\n" \
    "movdqu 32(%[src]), %%xmm2\n" \
    "movdqu 48(%[src]), %%xmm3\n"

/* Store 32 deinterleaved bytes to each destination plane:
 * the low quadwords go to dst1, the high quadwords to dst2. */
#define STORE2X32 \
    "movq %%xmm0, 0(%[dst1])\n" \
    "movq %%xmm1, 8(%[dst1])\n" \
    "movhpd %%xmm0, 0(%[dst2])\n" \
    "movhpd %%xmm1, 8(%[dst2])\n" \
    "movq %%xmm2, 16(%[dst1])\n" \
    "movq %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"
    if (aligned)
    {
        for (unsigned y = 0; y < height; y++)
        {
            unsigned x = 0;
#ifdef CAN_COMPILE_SSSE3
            if (vlc_CPU_SSSE3()) {
                /* SSSE3: one pshufb per register splits U and V in a single pass. */
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[shuffle]), %%xmm7\n"
                        LOAD64A
                        "pshufb %%xmm7, %%xmm0\n"
                        "pshufb %%xmm7, %%xmm1\n"
                        "pshufb %%xmm7, %%xmm2\n"
                        "pshufb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
                }
            } else
#endif
            {
                /* SSE2: mask out the U bytes, shift down the V bytes, then pack.
                 * dst1/dst2 are swapped here because the packed V bytes end up
                 * in the low halves of the registers. */
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[mask]), %%xmm7\n"
                        LOAD64A
                        "movdqa %%xmm0, %%xmm4\n"
                        "movdqa %%xmm1, %%xmm5\n"
                        "movdqa %%xmm2, %%xmm6\n"
                        "psrlw $8, %%xmm0\n"
                        "psrlw $8, %%xmm1\n"
                        "pand %%xmm7, %%xmm4\n"
                        "pand %%xmm7, %%xmm5\n"
                        "pand %%xmm7, %%xmm6\n"
                        "packuswb %%xmm4, %%xmm0\n"
                        "packuswb %%xmm5, %%xmm1\n"
                        "pand %%xmm3, %%xmm7\n"
                        "psrlw $8, %%xmm2\n"
                        "psrlw $8, %%xmm3\n"
                        "packuswb %%xmm6, %%xmm2\n"
                        "packuswb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [mask]"r"(mask)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
                          "xmm4", "xmm5", "xmm6", "xmm7");
                }
            }
            /* Scalar tail for the last width % 32 pixels of the row. */
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
    else
    {
        for (unsigned y = 0; y < height; y++)
        {
            unsigned x = 0;
#ifdef CAN_COMPILE_SSSE3
            if (vlc_CPU_SSSE3()) {
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[shuffle]), %%xmm7\n"
                        LOAD64U
                        "pshufb %%xmm7, %%xmm0\n"
                        "pshufb %%xmm7, %%xmm1\n"
                        "pshufb %%xmm7, %%xmm2\n"
                        "pshufb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
                }
            } else
#endif
            {
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[mask]), %%xmm7\n"
                        LOAD64U
                        "movdqu %%xmm0, %%xmm4\n"
                        "movdqu %%xmm1, %%xmm5\n"
                        "movdqu %%xmm2, %%xmm6\n"
                        "psrlw $8, %%xmm0\n"
                        "psrlw $8, %%xmm1\n"
                        "pand %%xmm7, %%xmm4\n"
                        "pand %%xmm7, %%xmm5\n"
                        "pand %%xmm7, %%xmm6\n"
                        "packuswb %%xmm4, %%xmm0\n"
                        "packuswb %%xmm5, %%xmm1\n"
                        "pand %%xmm3, %%xmm7\n"
                        "psrlw $8, %%xmm2\n"
                        "psrlw $8, %%xmm3\n"
                        "packuswb %%xmm6, %%xmm2\n"
                        "packuswb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [mask]"r"(mask)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
                          "xmm4", "xmm5", "xmm6", "xmm7");
                }
            }
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
#undef STORE2X32
#undef LOAD64U
#undef LOAD64A
}
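For context, this routine is invoked once per frame on the interleaved chroma plane, roughly as follows (a hypothetical call site with placeholder names; for 4:2:0 content the chroma plane is half the luma width and half the luma height):

/* Placeholder names: dst_u/dst_v are the YV12 chroma planes and src_uv is
 * the interleaved NV12 UV plane. The cache arguments are no longer used. */
SSE_SplitPlanes(dst_u, dst_u_pitch,
                dst_v, dst_v_pitch,
                src_uv, src_uv_pitch,
                NULL, 0,
                luma_width / 2, luma_height / 2, cpu_flags);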
Now, benchmarking this function on its own, it runs about 26% faster on an i7-2600 (Sandy Bridge, 3.4 GHz) and over 30% faster than the original function on an i7-4650U (1.7 GHz).
Which is what you would expect when going from 2 reads + 2 writes per byte to 1 read + 1 write.
However, when used inside VLC (the function is used to display every frame decoded through the Intel VAAPI interface), CPU usage for the same video jumps from 20% to 32-34%.
So I am puzzled why this happens and how it could be resolved; I expected the opposite result. Both routines use SSE2/SSSE3, yet the one that runs faster in isolation causes higher CPU usage.
Thanks
Answer (score 2):
OK.
I found out what is going on.
While the new routine is much faster on conventional memory, it is actually slower when working on frames produced by the hardware decoder:
This Intel white paper explains it: https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers
All my benchmarking and testing was done with traditionally allocated memory, not with frames coming from Uncacheable Speculative Write Combining (USWC) memory.
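The technique the white paper describes is, roughly, to pull each chunk of the frame out of USWC memory with streaming loads (MOVNTDQA) into a small cache-resident buffer and only then process it with ordinary loads. That appears to be what the cache parameter of the original routine was there for. A minimal sketch of that copy step, assuming SSE4.1 is available (names and sizes are illustrative):

#include <stdint.h>
#include <stddef.h>
#include <smmintrin.h> /* SSE4.1: _mm_stream_load_si128 */

/* Copy 'bytes' bytes (a multiple of 64, both pointers 16-byte aligned) from
 * a USWC frame buffer into a cacheable bounce buffer using streaming loads. */
static void copy_from_uswc(uint8_t *cache, const uint8_t *uswc_src, size_t bytes)
{
    for (size_t i = 0; i < bytes; i += 64) {
        __m128i a = _mm_stream_load_si128((__m128i *)(uswc_src + i + 0));
        __m128i b = _mm_stream_load_si128((__m128i *)(uswc_src + i + 16));
        __m128i c = _mm_stream_load_si128((__m128i *)(uswc_src + i + 32));
        __m128i d = _mm_stream_load_si128((__m128i *)(uswc_src + i + 48));
        _mm_store_si128((__m128i *)(cache + i + 0), a);
        _mm_store_si128((__m128i *)(cache + i + 16), b);
        _mm_store_si128((__m128i *)(cache + i + 32), c);
        _mm_store_si128((__m128i *)(cache + i + 48), d);
    }
}

The deinterleaving then runs on the cacheable copy, so the intermediate buffer that my rewrite removed is presumably what made the original routine fast on VAAPI frames.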
Back to the drawing board.