这是对之前那个问题(this question)的后续提问。
下面的代码用于 4x4 矩阵乘法 C = AB,在 ICC 的所有优化级别下都能正常编译。它在 -O1 和 -O2 下执行结果正确,但在 -O3 下给出了错误的结果。问题似乎出在 _mm256_storeu_pd 操作上,因为只要把它(并且只把它)替换为下面的 asm 语句,执行后就能得到正确的结果。有什么想法吗?
"styles": [
"../node_modules/vis/dist/vis-network.min.css",
"styles.css",
"../node_modules/codemirror/lib/codemirror.css",
"../node_modules/rm-emoji-picker/dist/emojipicker.css"
],
"scripts": [
"../node_modules/jquery/dist/jquery.js",
"../node_modules/codemirror/lib/codemirror.js",
"../node_modules/codemirror/mode/python/python.js",
"../node_modules/rm-emoji-picker/src/js/EmojiPicker.js"
],
此外,下面是 ICC 生成的汇编代码。箭头分别标出了与 _mm256_storeu_pd 或 asm 语句对应的那一行。RunIntrinsics_FMA_UnalignedCopy_Struct 是一个从 SourceMatrix 读取已存储的数值并调用矩阵乘法例程的函数。
-O2 -xcore-avx2
ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:
/*
 * Computes the 4x4 double-precision matrix product C = A * B with AVX/FMA
 * intrinsics.
 *
 * A, B and C are each accessed as 16 contiguous doubles; element (r, c) lives
 * at index 4*r + c. The four rows of B are loaded once into 256-bit registers,
 * then each row i of C is formed as
 *     A[i][0]*B[0] + A[i][1]*B[1] + A[i][2]*B[2] + A[i][3]*B[3]
 * where B[k] denotes row k of B.
 *
 * Only the unaligned load/store intrinsics are used, so the pointers do not
 * need to be 32-byte aligned.
 * NOTE(review): B is read in full before any part of C is written, while A is
 * read row by row inside the loop -- presumably C is not expected to partially
 * overlap A; confirm with the callers.
 */
inline void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(double *A, double *B, double *C)
{
size_t i;
/* the registers you use */
__m256d a0, a1, a2, a3, b0, b1, b2, b3, sum;
// Alternative view of C as four 256-bit rows; only needed by the inline-asm
// store variant that is commented out below.
// __m256d *C256 = (__m256d *)C;
/* load values from B: one register per row */
b0 = _mm256_loadu_pd(&B[0]);
b1 = _mm256_loadu_pd(&B[4]);
b2 = _mm256_loadu_pd(&B[8]);
b3 = _mm256_loadu_pd(&B[12]);
for (i = 0; i < 4; i++) {
/* load values from A: broadcast each element of row i to all four lanes */
a0 = _mm256_set1_pd(A[4*i + 0]);
a1 = _mm256_set1_pd(A[4*i + 1]);
a2 = _mm256_set1_pd(A[4*i + 2]);
a3 = _mm256_set1_pd(A[4*i + 3]);
/* accumulate row i of C: one multiply followed by three fused multiply-adds */
sum = _mm256_mul_pd(a0, b0);
sum = _mm256_fmadd_pd(a1, b1, sum);
sum = _mm256_fmadd_pd(a2, b2, sum);
sum = _mm256_fmadd_pd(a3, b3, sum);
// Inline-asm replacement for the store below; the surrounding text reports
// that using it instead of _mm256_storeu_pd yields correct results at -O3.
// asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
/* write row i of C */
_mm256_storeu_pd(&C[4*i], sum);
}
}
-O3 -xcore-avx2
ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:
0x1000053c0 <+0>: pushq %rbp
0x1000053c1 <+1>: movq %rsp, %rbp
0x1000053c4 <+4>: andq $-0x20, %rsp
0x1000053c8 <+8>: pushq %r12
0x1000053ca <+10>: pushq %r13
0x1000053cc <+12>: pushq %r14
0x1000053ce <+14>: pushq %r15
0x1000053d0 <+16>: pushq %rbx
0x1000053d1 <+17>: subq $0x4f8, %rsp ; imm = 0x4F8
0x1000053d8 <+24>: callq 0x10000b538 ; symbol stub for: clock
0x1000053dd <+29>: movq %rax, %rbx
0x1000053e0 <+32>: vmovupd 0x95f8(%rip), %ymm11 ; SourceMatrix + 190
0x1000053e8 <+40>: xorl %eax, %eax
0x1000053ea <+42>: vxorpd %xmm1, %xmm1, %xmm1
0x1000053ee <+46>: vmovsd %xmm1, 0x40(%rsp)
0x1000053f4 <+52>: vmovupd 0x9604(%rip), %ymm10 ; SourceMatrix + 222
0x1000053fc <+60>: vmovupd 0x95bc(%rip), %ymm12 ; SourceMatrix + 158
0x100005404 <+68>: vmovupd 0x9574(%rip), %ymm13 ; SourceMatrix + 94
0x10000540c <+76>: vmovupd 0x952c(%rip), %ymm14 ; SourceMatrix + 30
0x100005414 <+84>: vmovupd 0x9504(%rip), %ymm15 ; c_feature_names + 446
0x10000541c <+92>: vmovupd 0x95fc(%rip), %ymm8 ; SourceMatrix + 254
0x100005424 <+100>: vmovupd 0x9614(%rip), %ymm7 ; SourceMatrix + 286
0x10000542c <+108>: vmovupd 0x962c(%rip), %ymm6 ; SourceMatrix + 318
0x100005434 <+116>: vmovupd 0x9644(%rip), %ymm5 ; SourceMatrix + 350
0x10000543c <+124>: vmovupd 0x965c(%rip), %ymm3 ; SourceMatrix + 382
0x100005444 <+132>: vmovupd 0x9674(%rip), %ymm2 ; SourceMatrix + 414
0x10000544c <+140>: vmovupd 0x968c(%rip), %ymm1 ; SourceMatrix + 446
0x100005454 <+148>: vmovupd %ymm10, 0x420(%rsp)
0x10000545d <+157>: vmovupd %ymm11, 0x440(%rsp)
0x100005466 <+166>: vmovsd 0x626a(%rip), %xmm10 ; xmm10 = mem[0],zero
0x10000546e <+174>: vmovsd 0x40(%rsp), %xmm11 ; xmm11 = mem[0],zero
0x100005474 <+180>: vmovupd 0x94e4(%rip), %ymm9 ; SourceMatrix + 62
0x10000547c <+188>: vmovupd %ymm1, 0x20(%rsp)
0x100005482 <+194>: vmovupd 0x9516(%rip), %ymm4 ; SourceMatrix + 126
0x10000548a <+202>: vmovupd %ymm2, 0x360(%rsp)
0x100005493 <+211>: vmovupd %ymm3, 0x3c0(%rsp)
0x10000549c <+220>: vmovupd %ymm5, 0x380(%rsp)
0x1000054a5 <+229>: vmovupd %ymm6, 0x3a0(%rsp)
0x1000054ae <+238>: vmovupd %ymm7, 0x3e0(%rsp)
0x1000054b7 <+247>: vmovupd %ymm8, 0x400(%rsp)
0x1000054c0 <+256>: vmovupd %ymm12, 0x4c0(%rsp)
0x1000054c9 <+265>: vmovupd %ymm13, 0x4a0(%rsp)
0x1000054d2 <+274>: vmovupd %ymm14, 0x480(%rsp)
0x1000054db <+283>: vmovupd %ymm15, 0x460(%rsp)
0x1000054e4 <+292>: vxorpd %ymm0, %ymm0, %ymm0
0x1000054e8 <+296>: vmovupd %ymm0, 0x260(%rsp)
0x1000054f1 <+305>: vmovupd %ymm0, 0x2e0(%rsp)
0x1000054fa <+314>: vmovupd %ymm0, 0x280(%rsp)
0x100005503 <+323>: vmovupd %ymm0, 0x300(%rsp)
0x10000550c <+332>: vmovupd %ymm0, 0x2a0(%rsp)
0x100005515 <+341>: vmovupd %ymm0, 0x320(%rsp)
0x10000551e <+350>: vmovupd %ymm0, 0x2c0(%rsp)
0x100005527 <+359>: vmovupd %ymm0, 0x340(%rsp)
0x100005530 <+368>: vmovupd 0x95c8(%rip), %ymm0 ; SourceMatrix + 478
0x100005538 <+376>: vmovupd %ymm0, (%rsp)
0x10000553d <+381>: incl %eax
0x10000553f <+383>: vxorpd %xmm3, %xmm3, %xmm3
0x100005543 <+387>: vcvtsi2sdl %eax, %xmm3, %xmm3
0x100005547 <+391>: vdivsd %xmm3, %xmm10, %xmm2
0x10000554b <+395>: vbroadcastsd %xmm2, %ymm8
0x100005550 <+400>: vaddpd 0x460(%rsp), %ymm8, %ymm1
0x100005559 <+409>: vaddpd %ymm4, %ymm8, %ymm3
0x10000555d <+413>: vaddpd 0x480(%rsp), %ymm8, %ymm0
0x100005566 <+422>: vaddpd 0x420(%rsp), %ymm8, %ymm2
0x10000556f <+431>: vaddpd %ymm9, %ymm8, %ymm6
0x100005574 <+436>: vaddpd 0x4a0(%rsp), %ymm8, %ymm7
0x10000557d <+445>: vaddpd 0x400(%rsp), %ymm8, %ymm5
0x100005586 <+454>: vmovupd %ymm1, 0x60(%rsp)
0x10000558c <+460>: vmovupd %ymm0, 0x80(%rsp)
0x100005595 <+469>: vmovupd %ymm6, 0xa0(%rsp)
0x10000559e <+478>: vmovupd %ymm7, 0xc0(%rsp)
0x1000055a7 <+487>: vmovupd %ymm3, 0xe0(%rsp)
0x1000055b0 <+496>: vmovupd %ymm5, 0x160(%rsp)
0x1000055b9 <+505>: vmovupd %ymm2, 0x140(%rsp)
0x1000055c2 <+514>: vbroadcastsd 0x60(%rsp), %ymm14
0x1000055c9 <+521>: vbroadcastsd 0x68(%rsp), %ymm13
0x1000055d0 <+528>: vbroadcastsd 0x70(%rsp), %ymm15
0x1000055d7 <+535>: vbroadcastsd 0x78(%rsp), %ymm12
0x1000055de <+542>: vmulpd %ymm14, %ymm3, %ymm14
0x1000055e3 <+547>: vaddpd 0x4c0(%rsp), %ymm8, %ymm1
0x1000055ec <+556>: vaddpd 0x440(%rsp), %ymm8, %ymm0
0x1000055f5 <+565>: vaddpd 0x380(%rsp), %ymm8, %ymm5
0x1000055fe <+574>: vaddpd 0x3e0(%rsp), %ymm8, %ymm6
0x100005607 <+583>: vaddpd 0x3a0(%rsp), %ymm8, %ymm7
0x100005610 <+592>: vfmadd213pd %ymm14, %ymm1, %ymm13
0x100005615 <+597>: vmovupd %ymm5, 0x1c0(%rsp)
0x10000561e <+606>: vmovupd %ymm1, 0x100(%rsp)
0x100005627 <+615>: vmovupd %ymm6, 0x180(%rsp)
0x100005630 <+624>: vmovupd %ymm7, 0x1a0(%rsp)
0x100005639 <+633>: vfmadd213pd %ymm13, %ymm0, %ymm15
0x10000563e <+638>: vmovupd %ymm0, 0x120(%rsp)
0x100005647 <+647>: vbroadcastsd 0x88(%rsp), %ymm13
0x100005651 <+657>: vbroadcastsd 0x90(%rsp), %ymm14
0x10000565b <+667>: vfmadd213pd %ymm15, %ymm2, %ymm12
0x100005660 <+672>: vbroadcastsd 0x80(%rsp), %ymm15
0x10000566a <+682>: vaddpd 0x3c0(%rsp), %ymm8, %ymm5
0x100005673 <+691>: vaddpd 0x360(%rsp), %ymm8, %ymm7
0x10000567c <+700>: vaddpd 0x20(%rsp), %ymm8, %ymm6
0x100005682 <+706>: vaddpd (%rsp), %ymm8, %ymm8
0x100005687 <+711>: vmulpd %ymm15, %ymm3, %ymm15
-> 0x10000568c <+716>: vmovupd %ymm12, 0x260(%rsp)
0x100005695 <+725>: vmovupd %ymm5, 0x1e0(%rsp)
0x10000569e <+734>: vmovupd %ymm8, 0x240(%rsp)
0x1000056a7 <+743>: vmovupd %ymm6, 0x220(%rsp)
0x1000056b0 <+752>: vfmadd213pd %ymm15, %ymm1, %ymm13
0x1000056b5 <+757>: vmovupd %ymm7, 0x200(%rsp)
编辑:由于篇幅限制,删除了之前使用 asm 语句时可正常工作的那份汇编代码。
编辑:以下代码是在 gcc.godbolt.org 上用 ICC 18 编译得到的:
0x100004c10 <+0>: pushq %rbp
0x100004c11 <+1>: movq %rsp, %rbp
0x100004c14 <+4>: andq $-0x20, %rsp
0x100004c18 <+8>: pushq %r12
0x100004c1a <+10>: pushq %r13
0x100004c1c <+12>: pushq %r14
0x100004c1e <+14>: pushq %r15
0x100004c20 <+16>: pushq %rbx
0x100004c21 <+17>: subq $0x858, %rsp ; imm = 0x858
0x100004c28 <+24>: callq 0x10000b538 ; symbol stub for: clock
0x100004c2d <+29>: movq %rax, %rbx
0x100004c30 <+32>: vbroadcastsd 0xc0(%rsp), %ymm15
0x100004c3a <+42>: xorl %eax, %eax
0x100004c3c <+44>: vxorpd %xmm1, %xmm1, %xmm1
0x100004c40 <+48>: vmovsd %xmm1, 0x40(%rsp)
0x100004c46 <+54>: vmovupd 0x9c12(%rip), %ymm2 ; c_feature_names + 446
0x100004c4e <+62>: vmovupd %ymm15, 0x620(%rsp)
0x100004c57 <+71>: vmovupd 0x9c41(%rip), %ymm13 ; SourceMatrix + 62
0x100004c5f <+79>: vmovupd 0x9c19(%rip), %ymm14 ; SourceMatrix + 30
0x100004c67 <+87>: vmovupd 0x9c51(%rip), %ymm12 ; SourceMatrix + 94
0x100004c6f <+95>: vmovupd %ymm2, 0x640(%rsp)
0x100004c78 <+104>: vmovupd 0x9c60(%rip), %ymm11 ; SourceMatrix + 126
0x100004c80 <+112>: vmovupd 0x9c78(%rip), %ymm10 ; SourceMatrix + 158
0x100004c88 <+120>: vmovupd 0x9c90(%rip), %ymm9 ; SourceMatrix + 190
0x100004c90 <+128>: vmovupd %ymm13, 0x680(%rsp)
0x100004c99 <+137>: vmovupd 0x9c9f(%rip), %ymm8 ; SourceMatrix + 222
0x100004ca1 <+145>: vmovupd 0x9cb7(%rip), %ymm7 ; SourceMatrix + 254
0x100004ca9 <+153>: vmovupd 0x9ccf(%rip), %ymm6 ; SourceMatrix + 286
0x100004cb1 <+161>: vmovupd %ymm9, 0x700(%rsp)
0x100004cba <+170>: vmovupd 0x9cde(%rip), %ymm5 ; SourceMatrix + 318
0x100004cc2 <+178>: vmovupd 0x9cf6(%rip), %ymm4 ; SourceMatrix + 350
0x100004cca <+186>: vmovupd 0x9d0e(%rip), %ymm3 ; SourceMatrix + 382
0x100004cd2 <+194>: vmovupd %ymm6, 0x760(%rsp)
0x100004cdb <+203>: vmovupd 0x9d1d(%rip), %ymm2 ; SourceMatrix + 414
0x100004ce3 <+211>: vmovupd 0x9d35(%rip), %ymm1 ; SourceMatrix + 446
0x100004ceb <+219>: vmovsd 0x40(%rsp), %xmm13 ; xmm13 = mem[0],zero
0x100004cf1 <+225>: vbroadcastsd 0xc8(%rsp), %ymm15
0x100004cfb <+235>: vmovupd %ymm3, 0x7c0(%rsp)
0x100004d04 <+244>: vmovupd %ymm2, 0x7e0(%rsp)
0x100004d0d <+253>: vmovupd %ymm1, 0x800(%rsp)
0x100004d16 <+262>: vmovupd %ymm15, 0x600(%rsp)
0x100004d1f <+271>: vmovupd %ymm4, 0x7a0(%rsp)
0x100004d28 <+280>: vmovupd %ymm5, 0x780(%rsp)
0x100004d31 <+289>: vmovupd %ymm7, 0x740(%rsp)
0x100004d3a <+298>: vmovupd %ymm8, 0x720(%rsp)
0x100004d43 <+307>: vmovupd %ymm10, 0x6e0(%rsp)
0x100004d4c <+316>: vmovupd %ymm11, 0x6c0(%rsp)
0x100004d55 <+325>: vmovupd %ymm12, 0x6a0(%rsp)
0x100004d5e <+334>: vmovupd %ymm14, 0x660(%rsp)
0x100004d67 <+343>: vbroadcastsd 0xd0(%rsp), %ymm15
0x100004d71 <+353>: vmovupd %ymm15, 0x5e0(%rsp)
0x100004d7a <+362>: vbroadcastsd 0xd8(%rsp), %ymm15
0x100004d84 <+372>: vmovupd %ymm15, 0x5c0(%rsp)
0x100004d8d <+381>: vbroadcastsd 0xe0(%rsp), %ymm15
0x100004d97 <+391>: vmovupd %ymm15, 0x5a0(%rsp)
0x100004da0 <+400>: vbroadcastsd 0xe8(%rsp), %ymm15
0x100004daa <+410>: vmovupd %ymm15, 0x580(%rsp)
0x100004db3 <+419>: vbroadcastsd 0xf0(%rsp), %ymm15
0x100004dbd <+429>: vmovupd %ymm15, 0x560(%rsp)
0x100004dc6 <+438>: vbroadcastsd 0xf8(%rsp), %ymm15
0x100004dd0 <+448>: vmovupd %ymm15, 0x540(%rsp)
0x100004dd9 <+457>: vbroadcastsd 0x100(%rsp), %ymm15
0x100004de3 <+467>: vmovupd %ymm15, 0x520(%rsp)
0x100004dec <+476>: vbroadcastsd 0x108(%rsp), %ymm15
0x100004df6 <+486>: vmovupd %ymm15, 0x500(%rsp)
0x100004dff <+495>: vbroadcastsd 0x110(%rsp), %ymm15
0x100004e09 <+505>: vmovupd %ymm15, 0x4e0(%rsp)
0x100004e12 <+514>: vbroadcastsd 0x118(%rsp), %ymm15
0x100004e1c <+524>: vmovupd %ymm15, 0x4c0(%rsp)
0x100004e25 <+533>: vbroadcastsd 0x1c0(%rsp), %ymm15
0x100004e2f <+543>: vmovupd %ymm15, 0x4a0(%rsp)
0x100004e38 <+552>: vbroadcastsd 0x1c8(%rsp), %ymm15
0x100004e42 <+562>: vmovupd %ymm15, 0x480(%rsp)
0x100004e4b <+571>: vbroadcastsd 0x1d0(%rsp), %ymm15
0x100004e55 <+581>: vmovupd %ymm15, 0x460(%rsp)
0x100004e5e <+590>: vbroadcastsd 0x1d8(%rsp), %ymm15
0x100004e68 <+600>: vmovupd %ymm15, 0x440(%rsp)
0x100004e71 <+609>: vbroadcastsd 0x1e0(%rsp), %ymm15
0x100004e7b <+619>: vmovupd %ymm15, 0x420(%rsp)
0x100004e84 <+628>: vbroadcastsd 0x1e8(%rsp), %ymm15
0x100004e8e <+638>: vmovupd %ymm15, 0x400(%rsp)
0x100004e97 <+647>: vbroadcastsd 0x1f0(%rsp), %ymm15
0x100004ea1 <+657>: vmovupd %ymm15, 0x3e0(%rsp)
0x100004eaa <+666>: vbroadcastsd 0x1f8(%rsp), %ymm15
0x100004eb4 <+676>: vmovupd %ymm15, 0x3c0(%rsp)
0x100004ebd <+685>: vbroadcastsd 0x200(%rsp), %ymm15
0x100004ec7 <+695>: vmovupd %ymm15, 0x3a0(%rsp)
0x100004ed0 <+704>: vbroadcastsd 0x208(%rsp), %ymm15
0x100004eda <+714>: vmovupd %ymm15, 0x80(%rsp)
0x100004ee3 <+723>: vbroadcastsd 0x210(%rsp), %ymm15
0x100004eed <+733>: vxorpd %ymm0, %ymm0, %ymm0
0x100004ef1 <+737>: vmovupd %ymm0, 0x2a0(%rsp)
0x100004efa <+746>: vmovupd %ymm0, 0x320(%rsp)
0x100004f03 <+755>: vmovupd %ymm0, 0x2c0(%rsp)
0x100004f0c <+764>: vmovupd %ymm0, 0x340(%rsp)
0x100004f15 <+773>: vmovupd %ymm0, 0x2e0(%rsp)
0x100004f1e <+782>: vmovupd %ymm0, 0x360(%rsp)
0x100004f27 <+791>: vmovupd %ymm0, 0x300(%rsp)
0x100004f30 <+800>: vmovupd %ymm0, 0x380(%rsp)
0x100004f39 <+809>: vmovupd 0x9aff(%rip), %ymm0 ; SourceMatrix + 478
0x100004f41 <+817>: vmovupd %ymm15, 0x60(%rsp)
0x100004f47 <+823>: vbroadcastsd 0x218(%rsp), %ymm15
0x100004f51 <+833>: vmovupd %ymm0, 0x820(%rsp)
0x100004f5a <+842>: vmovupd %ymm15, 0x20(%rsp)
0x100004f60 <+848>: incl %eax
0x100004f62 <+850>: vxorpd %xmm12, %xmm12, %xmm12
0x100004f67 <+855>: vcvtsi2sdl %eax, %xmm12, %xmm12
0x100004f6b <+859>: vmovsd 0x6765(%rip), %xmm11 ; xmm11 = mem[0],zero
0x100004f73 <+867>: vdivsd %xmm12, %xmm11, %xmm8
0x100004f78 <+872>: vbroadcastsd %xmm8, %ymm7
0x100004f7d <+877>: vaddpd 0x640(%rsp), %ymm7, %ymm9
0x100004f86 <+886>: vaddpd 0x6c0(%rsp), %ymm7, %ymm0
0x100004f8f <+895>: vaddpd 0x6e0(%rsp), %ymm7, %ymm1
0x100004f98 <+904>: vaddpd 0x700(%rsp), %ymm7, %ymm2
0x100004fa1 <+913>: vaddpd 0x720(%rsp), %ymm7, %ymm3
0x100004faa <+922>: vaddpd 0x740(%rsp), %ymm7, %ymm8
0x100004fb3 <+931>: vaddpd 0x7c0(%rsp), %ymm7, %ymm4
0x100004fbc <+940>: vaddpd 0x7e0(%rsp), %ymm7, %ymm5
0x100004fc5 <+949>: vaddpd 0x800(%rsp), %ymm7, %ymm6
0x100004fce <+958>: vaddpd 0x660(%rsp), %ymm7, %ymm10
0x100004fd7 <+967>: vaddpd 0x680(%rsp), %ymm7, %ymm12
0x100004fe0 <+976>: vaddpd 0x6a0(%rsp), %ymm7, %ymm11
0x100004fe9 <+985>: vmovupd %ymm9, 0xa0(%rsp)
0x100004ff2 <+994>: vmovupd %ymm0, 0x120(%rsp)
0x100004ffb <+1003>: vmovupd %ymm8, 0x1a0(%rsp)
0x100005004 <+1012>: vmovupd %ymm3, 0x180(%rsp)
0x10000500d <+1021>: vmovupd %ymm1, 0x140(%rsp)
0x100005016 <+1030>: vmovupd %ymm2, 0x160(%rsp)
0x10000501f <+1039>: vmovupd %ymm10, (%rsp)
0x100005024 <+1044>: vmovupd %ymm4, 0x220(%rsp)
0x10000502d <+1053>: vmovupd %ymm5, 0x240(%rsp)
0x100005036 <+1062>: vmovupd %ymm6, 0x260(%rsp)
0x10000503f <+1071>: vbroadcastsd 0xa0(%rsp), %ymm14
0x100005049 <+1081>: vbroadcastsd 0xa8(%rsp), %ymm15
0x100005053 <+1091>: vaddpd 0x760(%rsp), %ymm7, %ymm10
0x10000505c <+1100>: vaddpd 0x780(%rsp), %ymm7, %ymm9
0x100005065 <+1109>: vaddpd 0x7a0(%rsp), %ymm7, %ymm8
0x10000506e <+1118>: vaddpd 0x820(%rsp), %ymm7, %ymm7
0x100005077 <+1127>: vmulpd %ymm14, %ymm0, %ymm14
0x10000507c <+1132>: vmovupd %ymm7, 0x280(%rsp)
0x100005085 <+1141>: vfmadd213pd %ymm14, %ymm1, %ymm15
0x10000508a <+1146>: vbroadcastsd 0xb0(%rsp), %ymm14
0x100005094 <+1156>: vfmadd213pd %ymm15, %ymm2, %ymm14
0x100005099 <+1161>: vbroadcastsd 0xb8(%rsp), %ymm15
0x1000050a3 <+1171>: vfmadd213pd %ymm14, %ymm3, %ymm15
0x1000050a8 <+1176>: vmulpd 0x620(%rsp), %ymm0, %ymm14
-> 0x1000050b1 <+1185>: vmovupd %ymm15, 0x2a0(%rsp)
0x1000050ba <+1194>: vmulpd 0x5a0(%rsp), %ymm0, %ymm15
0x1000050c3 <+1203>: vmulpd 0x520(%rsp), %ymm0, %ymm0
0x1000050cc <+1212>: vfmadd231pd 0x600(%rsp), %ymm1, %ymm14
0x1000050d6 <+1222>: vfmadd231pd 0x580(%rsp), %ymm1, %ymm15
0x1000050e0 <+1232>: vfmadd231pd 0x500(%rsp), %ymm1, %ymm0
答案 0 :(得分:1)
我注意到的一件事是,你把实际上是多维数组的数据以 FPTYPE* 的形式传给了乘法函数。
也许英特尔编译器不太喜欢这种做法?
为了更好地理解你的代码,我把其中一些 C 风格的写法改成了 C++,现在我的代码把 const 结构体引用传给乘法函数。
我没有英特尔编译器的许可证,但也许你可以检查一下这段代码现在在 -O3 下是否能正常工作:
#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <immintrin.h>
// Dimension of the square matrices used throughout this program.
constexpr int N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ratio is independent of this.
constexpr int POWER_FACTOR = 4;
// Number of times test_run repeats its fill / multiply / trace cycle.
constexpr int ITERATIONS = 10 * 1000 * 1000;
// When true, test_run refills its working matrices with fresh random values on
// every iteration; when false, it offsets the global source matrices instead.
constexpr bool GENERATE_NEW_RANDOMS = false;
// Floating-point element type of every matrix.
typedef double FP_TYPE;
// Square N x N matrix of FP_TYPE values, stored row by row in a plain array
// whose storage is aligned to a 32-byte boundary.
struct Matrix
{
    alignas(32) FP_TYPE m[N][N];
};
// Signature shared by the multiplication routines passed to test_run:
// two input matrices followed by the output matrix.
typedef void (*multiply_method)(const Matrix&, const Matrix&, Matrix&);
// Global input matrices: written by randomize_source_matrices, read by test_run.
Matrix source_matrices[POWER_FACTOR];
// Forward declarations; the definitions follow below.
FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_source_matrices ();
void test_run (multiply_method method, const std::string &method_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);
// Maps one rand() draw linearly onto the interval [min, max]:
// rand() == 0 yields min and rand() == RAND_MAX yields max.
FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
    const FP_TYPE span = max - min;
    const FP_TYPE scaled = span * static_cast<FP_TYPE> (rand ());
    return min + scaled / static_cast<FP_TYPE> (RAND_MAX);
}
void randomize_source_matrices ()
{
// Assign random numbers to imaginary and real parts
for (int j = 0; j < N; j++)
{
for (int k = 0; k < N; k++)
{
for (int i = 0; i < POWER_FACTOR; i++)
{
source_matrices[i].m[j][k] = random(-1.0, 1.0);
}
}
}
}
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
for (int j = 0; j < N; j++)
{
for (int k = 0; k < N; k++)
{
c.m[j][k] = 0.0;
for (int i = 0; i < N; i++)
{
c.m[j][k] += a.m[j][i] * b.m[i][k];
}
}
}
}
// Matrix product c = a * b implemented with AVX/FMA intrinsics.
// Each row of b is held in one 256-bit register (four doubles), and the loop
// bound is the literal 4, so this routine is written for 4x4 matrices of double.
// Row i of c is computed as a[i][0]*b[0] + a[i][1]*b[1] + a[i][2]*b[2] + a[i][3]*b[3],
// where b[k] denotes row k of b.
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
// Leftovers from the raw-pointer version; C256 is only referenced by the
// commented-out inline-asm store below.
//__m256d *B256 = (__m256d *) B;
//__m256d *C256 = (__m256d *) C;
// load values from B: one register per row (unaligned loads)
__m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
__m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
__m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
__m256d b3 = _mm256_loadu_pd (&b.m[3][0]);
for (size_t i = 0; i < 4; i++)
{
// load values from A: broadcast each element of row i to all four lanes
__m256d a0 = _mm256_set1_pd (a.m[i][0]);
__m256d a1 = _mm256_set1_pd (a.m[i][1]);
__m256d a2 = _mm256_set1_pd (a.m[i][2]);
__m256d a3 = _mm256_set1_pd (a.m[i][3]);
__m256d sum;
// one multiply followed by three fused multiply-adds accumulates row i of c
sum = _mm256_mul_pd (a0, b0);
sum = _mm256_fmadd_pd (a1, b1, sum);
sum = _mm256_fmadd_pd (a2, b2, sum);
sum = _mm256_fmadd_pd (a3, b3, sum);
// Inline-asm alternative to the intrinsic store, kept for reference.
// asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
// write row i of c (unaligned store)
_mm256_storeu_pd(&c.m[i][0], sum);
}
}
// Benchmarks one multiplication routine.
//
// For each of ITERATIONS rounds the four working matrices are refilled, the
// product (M0 * M1) * (M2 * M3) is formed through `method`, and the trace of
// the result is added to a running sum. Afterwards the average trace and the
// processor time reported by clock() are printed, labelled with `method_name`.
void test_run (multiply_method method, const std::string &method_name)
{
    const clock_t started_at = clock ();
    Matrix left_product = {0};
    Matrix right_product = {0};
    Matrix combined = {0};
    Matrix working[POWER_FACTOR];
    FP_TYPE trace_sum = 0.0;

    // The whole process is repeated ITERATIONS times to reduce the relative
    // error of the runtime measurement.
    for (int round = 0; round < ITERATIONS; ++round)
    {
        // Per-round offset added to every source element in the non-random mode.
        const double offset = 1.0 / static_cast<double> (round + 1);

        // Fill the working matrices; the matrix index is the innermost loop.
        for (int row = 0; row < N; ++row)
        {
            for (int col = 0; col < N; ++col)
            {
                for (int which = 0; which < POWER_FACTOR; ++which)
                {
                    working[which].m[row][col] = GENERATE_NEW_RANDOMS
                        ? random (-1.0, 1.0)
                        : source_matrices[which].m[row][col] + offset;
                }
            }
        }

        method (working[0], working[1], left_product);
        method (working[2], working[3], right_product);
        method (left_product, right_product, combined);

        for (int diag = 0; diag < N; ++diag)
        {
            trace_sum += combined.m[diag][diag];
        }
    }

    std::cout << std::setprecision(15);
    std::cout << "Trace " << method_name << " = \t"
              << trace_sum / static_cast<double> (ITERATIONS) << " took ";
    // clock() is sampled in its own statement, after the text above is written.
    const double elapsed_seconds =
        static_cast<double> (clock () - started_at) / CLOCKS_PER_SEC;
    std::cout << elapsed_seconds << "s\n\n";
}
int main ()
{
std::cout << "Beginning computation\n\n";
randomize_source_matrices ();
test_run (multiply_plain, "For Point");
test_run (multiply_intrinsics, "Intrinsics");
}
它有点慢,因为我将两个测试函数合并为一个并删除了该过程中的内联指令。
(如果您愿意容忍某些代码重复,那么将其添加回来应该没有问题。)
此代码仍有一些危险的事情,例如它只适用于N = 4
。在生产中使用此类代码之前,请务必添加一些静态断言或一些类似的安全措施。
另一件事是,代码里仍然散落着一些 C 风格的 (double) 强制类型转换,
但我认为这没关系,因为它只是测试代码。我也不确定这段代码是否适用于其他的 FP_TYPE(我之前从未使用过内在函数(intrinsics)...)。
为了完整起见,这是一个进一步改进的版本:
#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <vector>
#include <immintrin.h>
// Floating-point element type of every matrix.
using FP_TYPE = double;
// Dimension of the square matrices used throughout this program.
constexpr size_t N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ratio is independent of this.
constexpr size_t POWER_FACTOR = 4;
// Number of times test_run repeats its fill / multiply / trace cycle.
constexpr size_t ITERATIONS = 10 * 1000 * 1000;
// When true, test_run re-randomizes its working matrices on every iteration;
// when false, it derives them from the source matrices via init_matrix.
constexpr bool GENERATE_NEW_RANDOMS = false;
// Square N x N matrix of FP_TYPE values, stored row by row in a plain array
// whose storage is aligned to a 32-byte boundary.
struct Matrix
{
    // The empty-brace default member initializer zero-fills the array, so a
    // default-constructed Matrix starts out as all zeros.
    FP_TYPE m[N][N] __attribute__ ((aligned (32))) = {};
};
// Signature shared by the multiplication routines passed to test_run:
// two input matrices followed by the output matrix.
using multiply_func = void (*) (const Matrix&, const Matrix&, Matrix&);
// Generator invoked once per element by set_each_matrix_value.
using set_func = FP_TYPE (*) ();
// Element-wise transformation signature; not used by the code visible here.
using transform_func = FP_TYPE (*) (FP_TYPE value);
// Forward declarations; the definitions follow below.
FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_matrix (Matrix &matrix);
void test_run (const std::vector<Matrix> &source_matrices,
const multiply_func &func,
const std::string &func_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);
void set_each_matrix_value (Matrix &matrix, const set_func &func);
void init_matrix (const Matrix &source_matrix, Matrix &matrix,
size_t iteration);
// Maps one rand() draw linearly onto the interval [min, max]:
// rand() == 0 yields min and rand() == RAND_MAX yields max.
FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
    const FP_TYPE width = max - min;
    const FP_TYPE numerator = width * static_cast<FP_TYPE> (rand ());
    const FP_TYPE denominator = static_cast<FP_TYPE> (RAND_MAX);
    return min + numerator / denominator;
}
// Overwrites every element of `matrix` with the result of a separate call to
// `func`, visiting the elements row by row.
void set_each_matrix_value (Matrix &matrix, const set_func &func)
{
    for (auto &row : matrix.m)
    {
        for (FP_TYPE &element : row)
        {
            const FP_TYPE generated = func ();
            element = generated;
        }
    }
}
// Fills every element of `matrix` with a fresh random(-1.0, 1.0) value.
void randomize_matrix (Matrix &matrix)
{
    // A capture-less lambda converts to the plain function pointer that
    // set_each_matrix_value expects.
    const set_func draw_value = [] () -> FP_TYPE
    {
        return random (-1.0, 1.0);
    };
    set_each_matrix_value (matrix, draw_value);
}
// Reference matrix product c = a * b using three nested scalar loops.
// Each output element is zeroed and then accumulated in place inside c.
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
    for (size_t row = 0; row < N; ++row)
    {
        for (size_t col = 0; col < N; ++col)
        {
            c.m[row][col] = 0.0;
            for (size_t inner = 0; inner < N; ++inner)
            {
                c.m[row][col] += a.m[row][inner] * b.m[inner][col];
            }
        }
    }
}
// Matrix product c = a * b implemented with AVX/FMA intrinsics.
// Each row of b is held in one 256-bit register (four doubles); row i of c is
// computed as a[i][0]*b[0] + a[i][1]*b[1] + a[i][2]*b[2] + a[i][3]*b[3],
// where b[k] denotes row k of b.
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
// Compile-time guards: one matrix row must be exactly four 8-byte values,
// i.e. fill one 256-bit register.
static_assert (N == 4);
static_assert (sizeof (FP_TYPE) == 8);
static_assert (N * sizeof(FP_TYPE) == 256 / 8);
// Only the unaligned load/store intrinsics are used below, so the code does
// not itself depend on the 32-byte alignment of Matrix.m.
// Leftovers from the raw-pointer version; C256 is only referenced by the
// commented-out inline-asm store below.
//__m256d *B256 = (__m256d *) B;
//__m256d *C256 = (__m256d *) C;
// load values from B: one register per row (unaligned loads)
__m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
__m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
__m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
__m256d b3 = _mm256_loadu_pd (&b.m[3][0]);
for (size_t i = 0; i < 4; i++)
{
// load values from A: broadcast each element of row i to all four lanes
__m256d a0 = _mm256_set1_pd (a.m[i][0]);
__m256d a1 = _mm256_set1_pd (a.m[i][1]);
__m256d a2 = _mm256_set1_pd (a.m[i][2]);
__m256d a3 = _mm256_set1_pd (a.m[i][3]);
__m256d sum;
// one multiply followed by three fused multiply-adds accumulates row i of c
sum = _mm256_mul_pd (a0, b0);
sum = _mm256_fmadd_pd (a1, b1, sum);
sum = _mm256_fmadd_pd (a2, b2, sum);
sum = _mm256_fmadd_pd (a3, b3, sum);
// Inline-asm alternative to the intrinsic store, kept for reference.
// asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
// write row i of c (unaligned store)
_mm256_storeu_pd(&c.m[i][0], sum);
}
}
void init_matrix (const Matrix &source_matrix, Matrix &matrix, size_t iteration)
{
for (size_t j = 0; j < N; j++)
{
for (size_t k = 0; k < N; k++)
{
matrix.m[j][k] = source_matrix.m[j][k] + 1.0 / static_cast<FP_TYPE>
(iteration + 1);
}
}
}
void test_run (const std::vector<Matrix> &source_matrices,
const multiply_func &func, const std::string &func_name)
{
clock_t timer = clock ();
Matrix matrix_dummy1;
Matrix matrix_dummy2;
Matrix matrix_dummy3;
std::vector<Matrix> matrices (POWER_FACTOR);
FP_TYPE trace = 0.0;
// Read source matrices in own data format
// We do the whole process ITERATIONS times to get less error for the runtime .
for (size_t n = 0; n < ITERATIONS; n++)
{
if constexpr (GENERATE_NEW_RANDOMS)
{
for (auto &matrix : matrices)
{
randomize_matrix (matrix);
}
}
else
{
for (size_t i = 0; i < POWER_FACTOR; i++)
{
init_matrix (source_matrices[i], matrices[i], n);
}
}
func (matrices[0], matrices[1], matrix_dummy1);
func (matrices[2], matrices[3], matrix_dummy2);
func (matrix_dummy1, matrix_dummy2, matrix_dummy3);
for (size_t j = 0; j < N; j++)
{
trace += matrix_dummy3.m[j][j];
}
}
std::cout << std::setprecision(15);
std::cout << "Trace " << func_name << " = \t";
std::cout << trace / static_cast<FP_TYPE> (ITERATIONS);
std::cout << " took ";
std::cout << static_cast<double> (clock() - timer) / CLOCKS_PER_SEC << "s\n";
std::cout << std::endl;
}
int main ()
{
std::vector<Matrix> source_matrices (POWER_FACTOR);
std::cout << "Beginning computation\n";
std::cout << std::endl;
for (auto &matrix : source_matrices)
{
randomize_matrix (matrix);
}
test_run (source_matrices, multiply_plain, "For Point");
test_run (source_matrices, multiply_intrinsics, "Intrinsics");
}
顺便说一下:要使用 g++ 或 clang++ 进行编译,必须添加 -march=haswell
(或者与你的 CPU 相对应的架构选项)。