在我正在开发的应用程序中,我一直在努力解决网络编码的性能问题(请参阅Optimzing SSE-code,Improving performance of network coding-encoding和OpenCL distribution)。现在我非常接近达到可接受的性能。这是最内层循环的当前状态(大约是执行时间的99%):
while(elementIterations-- >0)
{
unsigned int firstMessageField = *(currentMessageGaloisFieldsArray++);
unsigned int secondMessageField = *(currentMessageGaloisFieldsArray++);
__m128i valuesToMultiply = _mm_set_epi32(0, secondMessageField, 0, firstMessageField);
__m128i mulitpliedHalves = _mm_mul_epu32(valuesToMultiply, fragmentCoefficentVector);
}
您对如何进一步优化这一点有什么建议吗?我知道没有更多的背景很难做到,但任何帮助都会受到赞赏!
答案 0 :(得分:9)
现在我醒了,这是我的答案:
在原始代码中,瓶颈几乎肯定是_mm_set_epi32
。这个单一的内在函数会编译到程序集中的这个混乱中:
633415EC xor edi,edi
633415EE movd xmm3,edi
...
633415F6 xor ebx,ebx
633415F8 movd xmm4,edi
633415FC movd xmm5,ebx
63341600 movd xmm0,esi
...
6334160B punpckldq xmm5,xmm3
6334160F punpckldq xmm0,xmm4
...
63341618 punpckldq xmm0,xmm5
这是什么? 9指令?!?!?!纯净的开销...
另一个看似奇怪的地方是编译器没有合并添加和加载:
movdqa xmm3,xmmword ptr [ecx-10h]
paddq xmm0,xmm3
应该已合并到:
paddq xmm0,xmmword ptr [ecx-10h]
我不确定编译器是否已经死了,或者它是否真的有合理的理由这样做......无论如何,与_mm_set_epi32
相比,这是一件小事。
免责声明:我将从此处提供的代码违反严格别名。但是,通常需要非标准的兼容方法来实现最佳性能。
解决方案1:无矢量化
此解决方案假定allZero
实际上都是零。
循环实际上比它看起来更简单。由于没有很多算术,最好不要矢量化:
// Test Data
unsigned __int32 fragmentCoefficentVector = 1000000000;
__declspec(align(16)) int currentMessageGaloisFieldsArray_[8] = {10,11,12,13,14,15,16,17};
int *currentMessageGaloisFieldsArray = currentMessageGaloisFieldsArray_;
__m128i currentUnModdedGaloisFieldFragments_[8];
__m128i *currentUnModdedGaloisFieldFragments = currentUnModdedGaloisFieldFragments_;
memset(currentUnModdedGaloisFieldFragments,0,8 * sizeof(__m128i));
int elementIterations = 4;
// The Loop
while (elementIterations > 0){
elementIterations -= 1;
// Default 32 x 32 -> 64-bit multiply code
unsigned __int64 r0 = currentMessageGaloisFieldsArray[0] * (unsigned __int64)fragmentCoefficentVector;
unsigned __int64 r1 = currentMessageGaloisFieldsArray[1] * (unsigned __int64)fragmentCoefficentVector;
// Use this for Visual Studio. VS doesn't know how to optimize 32 x 32 -> 64-bit multiply
// unsigned __int64 r0 = __emulu(currentMessageGaloisFieldsArray[0], fragmentCoefficentVector);
// unsigned __int64 r1 = __emulu(currentMessageGaloisFieldsArray[1], fragmentCoefficentVector);
((__int64*)currentUnModdedGaloisFieldFragments)[0] += r0 & 0x00000000ffffffff;
((__int64*)currentUnModdedGaloisFieldFragments)[1] += r0 >> 32;
((__int64*)currentUnModdedGaloisFieldFragments)[2] += r1 & 0x00000000ffffffff;
((__int64*)currentUnModdedGaloisFieldFragments)[3] += r1 >> 32;
currentMessageGaloisFieldsArray += 2;
currentUnModdedGaloisFieldFragments += 2;
}
在x64上编译到这个:
$LL4@main:
mov ecx, DWORD PTR [rbx]
mov rax, r11
add r9, 32 ; 00000020H
add rbx, 8
mul rcx
mov ecx, DWORD PTR [rbx-4]
mov r8, rax
mov rax, r11
mul rcx
mov ecx, r8d
shr r8, 32 ; 00000020H
add QWORD PTR [r9-48], rcx
add QWORD PTR [r9-40], r8
mov ecx, eax
shr rax, 32 ; 00000020H
add QWORD PTR [r9-24], rax
add QWORD PTR [r9-32], rcx
dec r10
jne SHORT $LL4@main
和x86:
$LL4@main:
mov eax, DWORD PTR [esi]
mul DWORD PTR _fragmentCoefficentVector$[esp+224]
mov ebx, eax
mov eax, DWORD PTR [esi+4]
mov DWORD PTR _r0$31463[esp+228], edx
mul DWORD PTR _fragmentCoefficentVector$[esp+224]
add DWORD PTR [ecx-16], ebx
mov ebx, DWORD PTR _r0$31463[esp+228]
adc DWORD PTR [ecx-12], edi
add DWORD PTR [ecx-8], ebx
adc DWORD PTR [ecx-4], edi
add DWORD PTR [ecx], eax
adc DWORD PTR [ecx+4], edi
add DWORD PTR [ecx+8], edx
adc DWORD PTR [ecx+12], edi
add esi, 8
add ecx, 32 ; 00000020H
dec DWORD PTR tv150[esp+224]
jne SHORT $LL4@main
这些都可能比原始(SSE)代码更快...... 在x64上,展开它会让它更好。
解决方案2:SSE2整数随机播放
此解决方案将循环展开为2次迭代:
// Test Data
__m128i allZero = _mm_setzero_si128();
__m128i fragmentCoefficentVector = _mm_set1_epi32(1000000000);
__declspec(align(16)) int currentMessageGaloisFieldsArray_[8] = {10,11,12,13,14,15,16,17};
int *currentMessageGaloisFieldsArray = currentMessageGaloisFieldsArray_;
__m128i currentUnModdedGaloisFieldFragments_[8];
__m128i *currentUnModdedGaloisFieldFragments = currentUnModdedGaloisFieldFragments_;
memset(currentUnModdedGaloisFieldFragments,0,8 * sizeof(__m128i));
int elementIterations = 4;
// The Loop
while(elementIterations > 1){
elementIterations -= 2;
// Load 4 elements. If needed use unaligned load instead.
// messageField = {a, b, c, d}
__m128i messageField = _mm_load_si128((__m128i*)currentMessageGaloisFieldsArray);
// Get into this form:
// values0 = {a, x, b, x}
// values1 = {c, x, d, x}
__m128i values0 = _mm_shuffle_epi32(messageField,216);
__m128i values1 = _mm_shuffle_epi32(messageField,114);
// Multiply by "fragmentCoefficentVector"
values0 = _mm_mul_epu32(values0, fragmentCoefficentVector);
values1 = _mm_mul_epu32(values1, fragmentCoefficentVector);
__m128i halves0 = _mm_unpacklo_epi32(values0, allZero);
__m128i halves1 = _mm_unpackhi_epi32(values0, allZero);
__m128i halves2 = _mm_unpacklo_epi32(values1, allZero);
__m128i halves3 = _mm_unpackhi_epi32(values1, allZero);
halves0 = _mm_add_epi64(halves0, currentUnModdedGaloisFieldFragments[0]);
halves1 = _mm_add_epi64(halves1, currentUnModdedGaloisFieldFragments[1]);
halves2 = _mm_add_epi64(halves2, currentUnModdedGaloisFieldFragments[2]);
halves3 = _mm_add_epi64(halves3, currentUnModdedGaloisFieldFragments[3]);
currentUnModdedGaloisFieldFragments[0] = halves0;
currentUnModdedGaloisFieldFragments[1] = halves1;
currentUnModdedGaloisFieldFragments[2] = halves2;
currentUnModdedGaloisFieldFragments[3] = halves3;
currentMessageGaloisFieldsArray += 4;
currentUnModdedGaloisFieldFragments += 4;
}
汇编到此(x86):(x64没有太大差异)
$LL4@main:
movdqa xmm1, XMMWORD PTR [esi]
pshufd xmm0, xmm1, 216 ; 000000d8H
pmuludq xmm0, xmm3
movdqa xmm4, xmm0
punpckhdq xmm0, xmm2
paddq xmm0, XMMWORD PTR [eax-16]
pshufd xmm1, xmm1, 114 ; 00000072H
movdqa XMMWORD PTR [eax-16], xmm0
pmuludq xmm1, xmm3
movdqa xmm0, xmm1
punpckldq xmm4, xmm2
paddq xmm4, XMMWORD PTR [eax-32]
punpckldq xmm0, xmm2
paddq xmm0, XMMWORD PTR [eax]
punpckhdq xmm1, xmm2
paddq xmm1, XMMWORD PTR [eax+16]
movdqa XMMWORD PTR [eax-32], xmm4
movdqa XMMWORD PTR [eax], xmm0
movdqa XMMWORD PTR [eax+16], xmm1
add esi, 16 ; 00000010H
add eax, 64 ; 00000040H
dec ecx
jne SHORT $LL4@main
仅比两个迭代的非矢量化版本略长。这使用很少的寄存器,因此您甚至可以在x86上进一步展开它。
<强>说明:强>
_mm_set_epi32
(在原始代码中编译成大约约9条指令)可以替换为单个_mm_shuffle_epi32
。答案 1 :(得分:5)
我建议您将循环展开2倍,以便可以使用一个_mm_load_XXX加载4个messageField值,然后将这四个值解压缩为两个向量对并按照当前循环处理它们。这样,编译器就不会为_mm_set_epi32生成大量乱码,并且所有的加载和存储都将是128位SSE加载/存储。这也将使编译器有更多机会在循环内最佳地调度指令。