ICC中的-O3扰乱内在函数,使用-O1或-O2或相应的手动装配

时间:2018-04-16 13:51:17

标签: c++ assembly optimization intrinsics icc

这是对this question的跟进。

下面的代码用于4x4矩阵乘法C = AB在所有优化设置上在ICC上编译良好。它在-O1和-O2上正确执行,但在-O3上给出了错误的结果。问题似乎来自_mm256_storeu_pd操作,因为用下面的asm语句替换它(并且只有它)会在执行后给出正确的结果。有什么想法吗?

      "styles": [
    "../node_modules/vis/dist/vis-network.min.css",
    "styles.css",
    "../node_modules/codemirror/lib/codemirror.css",
    "../node_modules/rm-emoji-picker/dist/emojipicker.css"
  ],
  "scripts": [
    "../node_modules/jquery/dist/jquery.js",
    "../node_modules/codemirror/lib/codemirror.js",
    "../node_modules/codemirror/mode/python/python.js",
    "../node_modules/rm-emoji-picker/src/js/EmojiPicker.js"
  ],

此外,这是由ICC生成的程序集。箭头分别表示带有_mm256_storeu_pd或asm语句的行。 RunIntrinsics_FMA_UnalignedCopy_Struct是一个从SourceMatrix获取存储数字并调用矩阵乘法例程的函数。

-O2 -xcore-avx2

ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:

inline void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(double *A, double *B, double *C)
{
    size_t i;

    /* the registers you use */
    __m256d a0, a1, a2, a3, b0, b1, b2, b3, sum;
    //  __m256d *C256 = (__m256d *)C;

    /* load values from B */
    b0 = _mm256_loadu_pd(&B[0]);
    b1 = _mm256_loadu_pd(&B[4]);
    b2 = _mm256_loadu_pd(&B[8]);
    b3 = _mm256_loadu_pd(&B[12]);

    for (i = 0; i < 4; i++) {
        /* load values from A */
        a0 = _mm256_set1_pd(A[4*i + 0]);
        a1 = _mm256_set1_pd(A[4*i + 1]);
        a2 = _mm256_set1_pd(A[4*i + 2]);
        a3 = _mm256_set1_pd(A[4*i + 3]);

        sum = _mm256_mul_pd(a0, b0);
        sum = _mm256_fmadd_pd(a1, b1, sum);
        sum = _mm256_fmadd_pd(a2, b2, sum);
        sum = _mm256_fmadd_pd(a3, b3, sum);

        // asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
        _mm256_storeu_pd(&C[4*i], sum);

    }

}

-O3 -xcore-avx2

ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:

0x1000053c0 <+0>:    pushq  %rbp
0x1000053c1 <+1>:    movq   %rsp, %rbp
0x1000053c4 <+4>:    andq   $-0x20, %rsp
0x1000053c8 <+8>:    pushq  %r12
0x1000053ca <+10>:   pushq  %r13
0x1000053cc <+12>:   pushq  %r14
0x1000053ce <+14>:   pushq  %r15
0x1000053d0 <+16>:   pushq  %rbx
0x1000053d1 <+17>:   subq   $0x4f8, %rsp              ; imm = 0x4F8 
0x1000053d8 <+24>:   callq  0x10000b538               ; symbol stub for: clock
0x1000053dd <+29>:   movq   %rax, %rbx
0x1000053e0 <+32>:   vmovupd 0x95f8(%rip), %ymm11      ; SourceMatrix + 190
0x1000053e8 <+40>:   xorl   %eax, %eax
0x1000053ea <+42>:   vxorpd %xmm1, %xmm1, %xmm1
0x1000053ee <+46>:   vmovsd %xmm1, 0x40(%rsp)
0x1000053f4 <+52>:   vmovupd 0x9604(%rip), %ymm10      ; SourceMatrix + 222
0x1000053fc <+60>:   vmovupd 0x95bc(%rip), %ymm12      ; SourceMatrix + 158
0x100005404 <+68>:   vmovupd 0x9574(%rip), %ymm13      ; SourceMatrix + 94
0x10000540c <+76>:   vmovupd 0x952c(%rip), %ymm14      ; SourceMatrix + 30
0x100005414 <+84>:   vmovupd 0x9504(%rip), %ymm15      ; c_feature_names + 446
0x10000541c <+92>:   vmovupd 0x95fc(%rip), %ymm8       ; SourceMatrix + 254
0x100005424 <+100>:  vmovupd 0x9614(%rip), %ymm7       ; SourceMatrix + 286
0x10000542c <+108>:  vmovupd 0x962c(%rip), %ymm6       ; SourceMatrix + 318
0x100005434 <+116>:  vmovupd 0x9644(%rip), %ymm5       ; SourceMatrix + 350
0x10000543c <+124>:  vmovupd 0x965c(%rip), %ymm3       ; SourceMatrix + 382
0x100005444 <+132>:  vmovupd 0x9674(%rip), %ymm2       ; SourceMatrix + 414
0x10000544c <+140>:  vmovupd 0x968c(%rip), %ymm1       ; SourceMatrix + 446
0x100005454 <+148>:  vmovupd %ymm10, 0x420(%rsp)
0x10000545d <+157>:  vmovupd %ymm11, 0x440(%rsp)
0x100005466 <+166>:  vmovsd 0x626a(%rip), %xmm10      ; xmm10 = mem[0],zero 
0x10000546e <+174>:  vmovsd 0x40(%rsp), %xmm11        ; xmm11 = mem[0],zero 
0x100005474 <+180>:  vmovupd 0x94e4(%rip), %ymm9       ; SourceMatrix + 62
0x10000547c <+188>:  vmovupd %ymm1, 0x20(%rsp)
0x100005482 <+194>:  vmovupd 0x9516(%rip), %ymm4       ; SourceMatrix + 126
0x10000548a <+202>:  vmovupd %ymm2, 0x360(%rsp)
0x100005493 <+211>:  vmovupd %ymm3, 0x3c0(%rsp)
0x10000549c <+220>:  vmovupd %ymm5, 0x380(%rsp)
0x1000054a5 <+229>:  vmovupd %ymm6, 0x3a0(%rsp)
0x1000054ae <+238>:  vmovupd %ymm7, 0x3e0(%rsp)
0x1000054b7 <+247>:  vmovupd %ymm8, 0x400(%rsp)
0x1000054c0 <+256>:  vmovupd %ymm12, 0x4c0(%rsp)
0x1000054c9 <+265>:  vmovupd %ymm13, 0x4a0(%rsp)
0x1000054d2 <+274>:  vmovupd %ymm14, 0x480(%rsp)
0x1000054db <+283>:  vmovupd %ymm15, 0x460(%rsp)
0x1000054e4 <+292>:  vxorpd %ymm0, %ymm0, %ymm0
0x1000054e8 <+296>:  vmovupd %ymm0, 0x260(%rsp)
0x1000054f1 <+305>:  vmovupd %ymm0, 0x2e0(%rsp)
0x1000054fa <+314>:  vmovupd %ymm0, 0x280(%rsp)
0x100005503 <+323>:  vmovupd %ymm0, 0x300(%rsp)
0x10000550c <+332>:  vmovupd %ymm0, 0x2a0(%rsp)
0x100005515 <+341>:  vmovupd %ymm0, 0x320(%rsp)
0x10000551e <+350>:  vmovupd %ymm0, 0x2c0(%rsp)
0x100005527 <+359>:  vmovupd %ymm0, 0x340(%rsp)
0x100005530 <+368>:  vmovupd 0x95c8(%rip), %ymm0       ; SourceMatrix + 478
0x100005538 <+376>:  vmovupd %ymm0, (%rsp)
0x10000553d <+381>:  incl   %eax
0x10000553f <+383>:  vxorpd %xmm3, %xmm3, %xmm3
0x100005543 <+387>:  vcvtsi2sdl %eax, %xmm3, %xmm3
0x100005547 <+391>:  vdivsd %xmm3, %xmm10, %xmm2
0x10000554b <+395>:  vbroadcastsd %xmm2, %ymm8
0x100005550 <+400>:  vaddpd 0x460(%rsp), %ymm8, %ymm1
0x100005559 <+409>:  vaddpd %ymm4, %ymm8, %ymm3
0x10000555d <+413>:  vaddpd 0x480(%rsp), %ymm8, %ymm0
0x100005566 <+422>:  vaddpd 0x420(%rsp), %ymm8, %ymm2
0x10000556f <+431>:  vaddpd %ymm9, %ymm8, %ymm6
0x100005574 <+436>:  vaddpd 0x4a0(%rsp), %ymm8, %ymm7
0x10000557d <+445>:  vaddpd 0x400(%rsp), %ymm8, %ymm5
0x100005586 <+454>:  vmovupd %ymm1, 0x60(%rsp)
0x10000558c <+460>:  vmovupd %ymm0, 0x80(%rsp)
0x100005595 <+469>:  vmovupd %ymm6, 0xa0(%rsp)
0x10000559e <+478>:  vmovupd %ymm7, 0xc0(%rsp)
0x1000055a7 <+487>:  vmovupd %ymm3, 0xe0(%rsp)
0x1000055b0 <+496>:  vmovupd %ymm5, 0x160(%rsp)
0x1000055b9 <+505>:  vmovupd %ymm2, 0x140(%rsp)
0x1000055c2 <+514>:  vbroadcastsd 0x60(%rsp), %ymm14
0x1000055c9 <+521>:  vbroadcastsd 0x68(%rsp), %ymm13
0x1000055d0 <+528>:  vbroadcastsd 0x70(%rsp), %ymm15
0x1000055d7 <+535>:  vbroadcastsd 0x78(%rsp), %ymm12
0x1000055de <+542>:  vmulpd %ymm14, %ymm3, %ymm14
0x1000055e3 <+547>:  vaddpd 0x4c0(%rsp), %ymm8, %ymm1
0x1000055ec <+556>:  vaddpd 0x440(%rsp), %ymm8, %ymm0
0x1000055f5 <+565>:  vaddpd 0x380(%rsp), %ymm8, %ymm5
0x1000055fe <+574>:  vaddpd 0x3e0(%rsp), %ymm8, %ymm6
0x100005607 <+583>:  vaddpd 0x3a0(%rsp), %ymm8, %ymm7
0x100005610 <+592>:  vfmadd213pd %ymm14, %ymm1, %ymm13
0x100005615 <+597>:  vmovupd %ymm5, 0x1c0(%rsp)
0x10000561e <+606>:  vmovupd %ymm1, 0x100(%rsp)
0x100005627 <+615>:  vmovupd %ymm6, 0x180(%rsp)
0x100005630 <+624>:  vmovupd %ymm7, 0x1a0(%rsp)
0x100005639 <+633>:  vfmadd213pd %ymm13, %ymm0, %ymm15
0x10000563e <+638>:  vmovupd %ymm0, 0x120(%rsp)
0x100005647 <+647>:  vbroadcastsd 0x88(%rsp), %ymm13
0x100005651 <+657>:  vbroadcastsd 0x90(%rsp), %ymm14
0x10000565b <+667>:  vfmadd213pd %ymm15, %ymm2, %ymm12
0x100005660 <+672>:  vbroadcastsd 0x80(%rsp), %ymm15
0x10000566a <+682>:  vaddpd 0x3c0(%rsp), %ymm8, %ymm5
0x100005673 <+691>:  vaddpd 0x360(%rsp), %ymm8, %ymm7
0x10000567c <+700>:  vaddpd 0x20(%rsp), %ymm8, %ymm6
0x100005682 <+706>:  vaddpd (%rsp), %ymm8, %ymm8
0x100005687 <+711>:  vmulpd %ymm15, %ymm3, %ymm15
->  0x10000568c <+716>:  vmovupd %ymm12, 0x260(%rsp)
0x100005695 <+725>:  vmovupd %ymm5, 0x1e0(%rsp)
0x10000569e <+734>:  vmovupd %ymm8, 0x240(%rsp)
0x1000056a7 <+743>:  vmovupd %ymm6, 0x220(%rsp)
0x1000056b0 <+752>:  vfmadd213pd %ymm15, %ymm1, %ymm13
0x1000056b5 <+757>:  vmovupd %ymm7, 0x200(%rsp)

编辑:由于空间限制而在删除之前使用asm命令工作程序集。

修改 以下代码在gcc.godbolt.org上与ICC 18编译:

0x100004c10 <+0>:    pushq  %rbp
0x100004c11 <+1>:    movq   %rsp, %rbp
0x100004c14 <+4>:    andq   $-0x20, %rsp
0x100004c18 <+8>:    pushq  %r12
0x100004c1a <+10>:   pushq  %r13
0x100004c1c <+12>:   pushq  %r14
0x100004c1e <+14>:   pushq  %r15
0x100004c20 <+16>:   pushq  %rbx
0x100004c21 <+17>:   subq   $0x858, %rsp              ; imm = 0x858 
0x100004c28 <+24>:   callq  0x10000b538               ; symbol stub for: clock
0x100004c2d <+29>:   movq   %rax, %rbx
0x100004c30 <+32>:   vbroadcastsd 0xc0(%rsp), %ymm15
0x100004c3a <+42>:   xorl   %eax, %eax
0x100004c3c <+44>:   vxorpd %xmm1, %xmm1, %xmm1
0x100004c40 <+48>:   vmovsd %xmm1, 0x40(%rsp)
0x100004c46 <+54>:   vmovupd 0x9c12(%rip), %ymm2       ; c_feature_names + 446
0x100004c4e <+62>:   vmovupd %ymm15, 0x620(%rsp)
0x100004c57 <+71>:   vmovupd 0x9c41(%rip), %ymm13      ; SourceMatrix + 62
0x100004c5f <+79>:   vmovupd 0x9c19(%rip), %ymm14      ; SourceMatrix + 30
0x100004c67 <+87>:   vmovupd 0x9c51(%rip), %ymm12      ; SourceMatrix + 94
0x100004c6f <+95>:   vmovupd %ymm2, 0x640(%rsp)
0x100004c78 <+104>:  vmovupd 0x9c60(%rip), %ymm11      ; SourceMatrix + 126
0x100004c80 <+112>:  vmovupd 0x9c78(%rip), %ymm10      ; SourceMatrix + 158
0x100004c88 <+120>:  vmovupd 0x9c90(%rip), %ymm9       ; SourceMatrix + 190
0x100004c90 <+128>:  vmovupd %ymm13, 0x680(%rsp)
0x100004c99 <+137>:  vmovupd 0x9c9f(%rip), %ymm8       ; SourceMatrix + 222
0x100004ca1 <+145>:  vmovupd 0x9cb7(%rip), %ymm7       ; SourceMatrix + 254
0x100004ca9 <+153>:  vmovupd 0x9ccf(%rip), %ymm6       ; SourceMatrix + 286
0x100004cb1 <+161>:  vmovupd %ymm9, 0x700(%rsp)
0x100004cba <+170>:  vmovupd 0x9cde(%rip), %ymm5       ; SourceMatrix + 318
0x100004cc2 <+178>:  vmovupd 0x9cf6(%rip), %ymm4       ; SourceMatrix + 350
0x100004cca <+186>:  vmovupd 0x9d0e(%rip), %ymm3       ; SourceMatrix + 382
0x100004cd2 <+194>:  vmovupd %ymm6, 0x760(%rsp)
0x100004cdb <+203>:  vmovupd 0x9d1d(%rip), %ymm2       ; SourceMatrix + 414
0x100004ce3 <+211>:  vmovupd 0x9d35(%rip), %ymm1       ; SourceMatrix + 446
0x100004ceb <+219>:  vmovsd 0x40(%rsp), %xmm13        ; xmm13 = mem[0],zero 
0x100004cf1 <+225>:  vbroadcastsd 0xc8(%rsp), %ymm15
0x100004cfb <+235>:  vmovupd %ymm3, 0x7c0(%rsp)
0x100004d04 <+244>:  vmovupd %ymm2, 0x7e0(%rsp)
0x100004d0d <+253>:  vmovupd %ymm1, 0x800(%rsp)
0x100004d16 <+262>:  vmovupd %ymm15, 0x600(%rsp)
0x100004d1f <+271>:  vmovupd %ymm4, 0x7a0(%rsp)
0x100004d28 <+280>:  vmovupd %ymm5, 0x780(%rsp)
0x100004d31 <+289>:  vmovupd %ymm7, 0x740(%rsp)
0x100004d3a <+298>:  vmovupd %ymm8, 0x720(%rsp)
0x100004d43 <+307>:  vmovupd %ymm10, 0x6e0(%rsp)
0x100004d4c <+316>:  vmovupd %ymm11, 0x6c0(%rsp)
0x100004d55 <+325>:  vmovupd %ymm12, 0x6a0(%rsp)
0x100004d5e <+334>:  vmovupd %ymm14, 0x660(%rsp)
0x100004d67 <+343>:  vbroadcastsd 0xd0(%rsp), %ymm15
0x100004d71 <+353>:  vmovupd %ymm15, 0x5e0(%rsp)
0x100004d7a <+362>:  vbroadcastsd 0xd8(%rsp), %ymm15
0x100004d84 <+372>:  vmovupd %ymm15, 0x5c0(%rsp)
0x100004d8d <+381>:  vbroadcastsd 0xe0(%rsp), %ymm15
0x100004d97 <+391>:  vmovupd %ymm15, 0x5a0(%rsp)
0x100004da0 <+400>:  vbroadcastsd 0xe8(%rsp), %ymm15
0x100004daa <+410>:  vmovupd %ymm15, 0x580(%rsp)
0x100004db3 <+419>:  vbroadcastsd 0xf0(%rsp), %ymm15
0x100004dbd <+429>:  vmovupd %ymm15, 0x560(%rsp)
0x100004dc6 <+438>:  vbroadcastsd 0xf8(%rsp), %ymm15
0x100004dd0 <+448>:  vmovupd %ymm15, 0x540(%rsp)
0x100004dd9 <+457>:  vbroadcastsd 0x100(%rsp), %ymm15
0x100004de3 <+467>:  vmovupd %ymm15, 0x520(%rsp)
0x100004dec <+476>:  vbroadcastsd 0x108(%rsp), %ymm15
0x100004df6 <+486>:  vmovupd %ymm15, 0x500(%rsp)
0x100004dff <+495>:  vbroadcastsd 0x110(%rsp), %ymm15
0x100004e09 <+505>:  vmovupd %ymm15, 0x4e0(%rsp)
0x100004e12 <+514>:  vbroadcastsd 0x118(%rsp), %ymm15
0x100004e1c <+524>:  vmovupd %ymm15, 0x4c0(%rsp)
0x100004e25 <+533>:  vbroadcastsd 0x1c0(%rsp), %ymm15
0x100004e2f <+543>:  vmovupd %ymm15, 0x4a0(%rsp)
0x100004e38 <+552>:  vbroadcastsd 0x1c8(%rsp), %ymm15
0x100004e42 <+562>:  vmovupd %ymm15, 0x480(%rsp)
0x100004e4b <+571>:  vbroadcastsd 0x1d0(%rsp), %ymm15
0x100004e55 <+581>:  vmovupd %ymm15, 0x460(%rsp)
0x100004e5e <+590>:  vbroadcastsd 0x1d8(%rsp), %ymm15
0x100004e68 <+600>:  vmovupd %ymm15, 0x440(%rsp)
0x100004e71 <+609>:  vbroadcastsd 0x1e0(%rsp), %ymm15
0x100004e7b <+619>:  vmovupd %ymm15, 0x420(%rsp)
0x100004e84 <+628>:  vbroadcastsd 0x1e8(%rsp), %ymm15
0x100004e8e <+638>:  vmovupd %ymm15, 0x400(%rsp)
0x100004e97 <+647>:  vbroadcastsd 0x1f0(%rsp), %ymm15
0x100004ea1 <+657>:  vmovupd %ymm15, 0x3e0(%rsp)
0x100004eaa <+666>:  vbroadcastsd 0x1f8(%rsp), %ymm15
0x100004eb4 <+676>:  vmovupd %ymm15, 0x3c0(%rsp)
0x100004ebd <+685>:  vbroadcastsd 0x200(%rsp), %ymm15
0x100004ec7 <+695>:  vmovupd %ymm15, 0x3a0(%rsp)
0x100004ed0 <+704>:  vbroadcastsd 0x208(%rsp), %ymm15
0x100004eda <+714>:  vmovupd %ymm15, 0x80(%rsp)
0x100004ee3 <+723>:  vbroadcastsd 0x210(%rsp), %ymm15
0x100004eed <+733>:  vxorpd %ymm0, %ymm0, %ymm0
0x100004ef1 <+737>:  vmovupd %ymm0, 0x2a0(%rsp)
0x100004efa <+746>:  vmovupd %ymm0, 0x320(%rsp)
0x100004f03 <+755>:  vmovupd %ymm0, 0x2c0(%rsp)
0x100004f0c <+764>:  vmovupd %ymm0, 0x340(%rsp)
0x100004f15 <+773>:  vmovupd %ymm0, 0x2e0(%rsp)
0x100004f1e <+782>:  vmovupd %ymm0, 0x360(%rsp)
0x100004f27 <+791>:  vmovupd %ymm0, 0x300(%rsp)
0x100004f30 <+800>:  vmovupd %ymm0, 0x380(%rsp)
0x100004f39 <+809>:  vmovupd 0x9aff(%rip), %ymm0       ; SourceMatrix + 478
0x100004f41 <+817>:  vmovupd %ymm15, 0x60(%rsp)
0x100004f47 <+823>:  vbroadcastsd 0x218(%rsp), %ymm15
0x100004f51 <+833>:  vmovupd %ymm0, 0x820(%rsp)
0x100004f5a <+842>:  vmovupd %ymm15, 0x20(%rsp)
0x100004f60 <+848>:  incl   %eax
0x100004f62 <+850>:  vxorpd %xmm12, %xmm12, %xmm12
0x100004f67 <+855>:  vcvtsi2sdl %eax, %xmm12, %xmm12
0x100004f6b <+859>:  vmovsd 0x6765(%rip), %xmm11      ; xmm11 = mem[0],zero 
0x100004f73 <+867>:  vdivsd %xmm12, %xmm11, %xmm8
0x100004f78 <+872>:  vbroadcastsd %xmm8, %ymm7
0x100004f7d <+877>:  vaddpd 0x640(%rsp), %ymm7, %ymm9
0x100004f86 <+886>:  vaddpd 0x6c0(%rsp), %ymm7, %ymm0
0x100004f8f <+895>:  vaddpd 0x6e0(%rsp), %ymm7, %ymm1
0x100004f98 <+904>:  vaddpd 0x700(%rsp), %ymm7, %ymm2
0x100004fa1 <+913>:  vaddpd 0x720(%rsp), %ymm7, %ymm3
0x100004faa <+922>:  vaddpd 0x740(%rsp), %ymm7, %ymm8
0x100004fb3 <+931>:  vaddpd 0x7c0(%rsp), %ymm7, %ymm4
0x100004fbc <+940>:  vaddpd 0x7e0(%rsp), %ymm7, %ymm5
0x100004fc5 <+949>:  vaddpd 0x800(%rsp), %ymm7, %ymm6
0x100004fce <+958>:  vaddpd 0x660(%rsp), %ymm7, %ymm10
0x100004fd7 <+967>:  vaddpd 0x680(%rsp), %ymm7, %ymm12
0x100004fe0 <+976>:  vaddpd 0x6a0(%rsp), %ymm7, %ymm11
0x100004fe9 <+985>:  vmovupd %ymm9, 0xa0(%rsp)
0x100004ff2 <+994>:  vmovupd %ymm0, 0x120(%rsp)
0x100004ffb <+1003>: vmovupd %ymm8, 0x1a0(%rsp)
0x100005004 <+1012>: vmovupd %ymm3, 0x180(%rsp)
0x10000500d <+1021>: vmovupd %ymm1, 0x140(%rsp)
0x100005016 <+1030>: vmovupd %ymm2, 0x160(%rsp)
0x10000501f <+1039>: vmovupd %ymm10, (%rsp)
0x100005024 <+1044>: vmovupd %ymm4, 0x220(%rsp)
0x10000502d <+1053>: vmovupd %ymm5, 0x240(%rsp)
0x100005036 <+1062>: vmovupd %ymm6, 0x260(%rsp)
0x10000503f <+1071>: vbroadcastsd 0xa0(%rsp), %ymm14
0x100005049 <+1081>: vbroadcastsd 0xa8(%rsp), %ymm15
0x100005053 <+1091>: vaddpd 0x760(%rsp), %ymm7, %ymm10
0x10000505c <+1100>: vaddpd 0x780(%rsp), %ymm7, %ymm9
0x100005065 <+1109>: vaddpd 0x7a0(%rsp), %ymm7, %ymm8
0x10000506e <+1118>: vaddpd 0x820(%rsp), %ymm7, %ymm7
0x100005077 <+1127>: vmulpd %ymm14, %ymm0, %ymm14
0x10000507c <+1132>: vmovupd %ymm7, 0x280(%rsp)
0x100005085 <+1141>: vfmadd213pd %ymm14, %ymm1, %ymm15
0x10000508a <+1146>: vbroadcastsd 0xb0(%rsp), %ymm14
0x100005094 <+1156>: vfmadd213pd %ymm15, %ymm2, %ymm14
0x100005099 <+1161>: vbroadcastsd 0xb8(%rsp), %ymm15
0x1000050a3 <+1171>: vfmadd213pd %ymm14, %ymm3, %ymm15
0x1000050a8 <+1176>: vmulpd 0x620(%rsp), %ymm0, %ymm14
->  0x1000050b1 <+1185>: vmovupd %ymm15, 0x2a0(%rsp)
0x1000050ba <+1194>: vmulpd 0x5a0(%rsp), %ymm0, %ymm15
0x1000050c3 <+1203>: vmulpd 0x520(%rsp), %ymm0, %ymm0
0x1000050cc <+1212>: vfmadd231pd 0x600(%rsp), %ymm1, %ymm14
0x1000050d6 <+1222>: vfmadd231pd 0x580(%rsp), %ymm1, %ymm15
0x1000050e0 <+1232>: vfmadd231pd 0x500(%rsp), %ymm1, %ymm0

1 个答案:

答案 0 :(得分:1)

我注意到的一件事是你将FPTYPE*传递给实际上是多维数组的乘法函数。

也许英特尔编译器不太喜欢这个?

为了更好地理解你的代码,我做了一些C构造的C ++,我的代码现在将const结构引用传递给乘法函数。

我没有英特尔编译器的许可证,但也许您可以检查代码现在是否适用于-O3

#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <immintrin.h>

constexpr int N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ration is independent of this.
constexpr int POWER_FACTOR = 4;
constexpr int ITERATIONS = 10 * 1000 * 1000;
constexpr bool GENERATE_NEW_RANDOMS = false;

typedef double FP_TYPE;

struct Matrix
{
  FP_TYPE m[N][N] __attribute__ ((aligned (32)));
};

typedef void (*multiply_method)(const Matrix&, const Matrix&, Matrix&);

Matrix source_matrices[POWER_FACTOR];

FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_source_matrices ();
void test_run (multiply_method method, const std::string &method_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);

FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
  return min + (max - min) * FP_TYPE(rand()) / FP_TYPE(RAND_MAX);
}

void randomize_source_matrices ()
{
  // Assign random numbers to imaginary and real parts
  for (int j = 0; j < N; j++)
    {
      for (int k = 0; k < N; k++)
        {
          for (int i = 0; i < POWER_FACTOR; i++)
            {
              source_matrices[i].m[j][k] = random(-1.0, 1.0);
            }
        }
    }
}

void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
  for (int j = 0; j < N; j++)
    {
      for (int k = 0; k < N; k++)
        {
          c.m[j][k] = 0.0;
          for (int i = 0; i < N; i++)
            {
              c.m[j][k] += a.m[j][i] * b.m[i][k];
            }
        }
    }
}

void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
  //__m256d *B256 = (__m256d *) B;
  //__m256d *C256 = (__m256d *) C;

  // load values from B
  __m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
  __m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
  __m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
  __m256d b3 = _mm256_loadu_pd (&b.m[3][0]);

  for (size_t i = 0; i < 4; i++)
    {
      // load values from A
      __m256d a0 = _mm256_set1_pd (a.m[i][0]);
      __m256d a1 = _mm256_set1_pd (a.m[i][1]);
      __m256d a2 = _mm256_set1_pd (a.m[i][2]);
      __m256d a3 = _mm256_set1_pd (a.m[i][3]);

      __m256d sum;
      sum = _mm256_mul_pd (a0, b0);
      sum = _mm256_fmadd_pd (a1, b1, sum);
      sum = _mm256_fmadd_pd (a2, b2, sum);
      sum = _mm256_fmadd_pd (a3, b3, sum);

      //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
      _mm256_storeu_pd(&c.m[i][0], sum);
    }
}

void test_run (multiply_method method, const std::string &method_name)
{
  clock_t timer = clock ();

  Matrix matrix_dummy1 = {0};
  Matrix matrix_dummy2 = {0};
  Matrix matrix_dummy3 = {0};

  Matrix matrices[POWER_FACTOR];
  FP_TYPE trace = 0.0;

  // Read source matrices in own data format
  // We do the whole process ITERATIONS times to get less error for the runtime .
  for (int n = 0; n < ITERATIONS; n++)
    {
      for (int j = 0; j < N; j++)
        {
          for (int k = 0; k < N; k++)
            {
              for (int i = 0; i < POWER_FACTOR; i++)
                {
                  if (GENERATE_NEW_RANDOMS)
                    {
                      matrices[i].m[j][k] = random (-1.0, 1.0);
                    }
                  else
                    {
                      matrices[i].m[j][k] = source_matrices[i].m[j][k] + 1.0 / (double)(n + 1);
                    }
                }
            }
        }
      method (matrices[0], matrices[1], matrix_dummy1);
      method (matrices[2], matrices[3], matrix_dummy2);
      method (matrix_dummy1, matrix_dummy2, matrix_dummy3);

      for (int j = 0; j < N; j++)
        {
          trace += matrix_dummy3.m[j][j];
        }
    }
  std::cout << std::setprecision(15);
  std::cout << "Trace " << method_name << " = \t";
  std::cout << trace / (double) ITERATIONS;
  std::cout << "    took ";
  std::cout << (double) (clock() - timer) / CLOCKS_PER_SEC << "s\n\n";
}

int main ()
{
  std::cout << "Beginning computation\n\n";
  randomize_source_matrices ();
  test_run (multiply_plain, "For Point");
  test_run (multiply_intrinsics, "Intrinsics");
}

它有点慢,因为我将两个测试函数合并为一个并删除了该过程中的内联指令。

(如果您愿意容忍某些代码重复,那么将其添加回来应该没有问题。)

此代码仍有一些危险的事情,例如它只适用于N = 4。在生产中使用此类代码之前,请务必添加一些静态断言或一些类似的安全措施。

另一件事是仍然有一些C风格(double)演员阵容喷入其中,但我认为这只是因为它是测试代码。我也不确定代码是否适用于不同的FP_TYPE(之前从未使用内在函数...)。

为了完整起见,这是一个进一步改进的版本:

#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <vector>
#include <immintrin.h>

using FP_TYPE = double;

constexpr size_t N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ration is independent of this.
constexpr size_t POWER_FACTOR = 4;
constexpr size_t ITERATIONS = 10 * 1000 * 1000;
constexpr bool GENERATE_NEW_RANDOMS = false;

struct Matrix
{
  FP_TYPE m[N][N] __attribute__ ((aligned (32))) = Picture from output;
};

using multiply_func = void (*) (const Matrix&, const Matrix&, Matrix&);
using set_func = FP_TYPE (*) ();
using transform_func = FP_TYPE (*) (FP_TYPE value);

FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_matrix (Matrix &matrix);
void test_run (const std::vector<Matrix> &source_matrices,
               const multiply_func &func,
               const std::string &func_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);
void set_each_matrix_value (Matrix &matrix, const set_func &func);
void init_matrix (const Matrix &source_matrix, Matrix &matrix,
                  size_t iteration);

FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
  return min + (max - min) * FP_TYPE(rand()) / FP_TYPE(RAND_MAX);
}

void set_each_matrix_value (Matrix &matrix, const set_func &func)
{
  for (auto &j : matrix.m)
    {
      for (auto &k : j)
        {
          k = func ();
        }
    }
}

void randomize_matrix (Matrix &matrix)
{
  // Assign random numbers to imaginary and real parts
  set_each_matrix_value (matrix, [] ()
  {
    return random(-1.0, 1.0);
  });
}

void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
  for (size_t j = 0; j < N; j++)
    {
      for (size_t k = 0; k < N; k++)
        {
          auto &val = c.m[j][k];
          val = 0.0;
          for (size_t i = 0; i < N; i++)
            {
              val += a.m[j][i] * b.m[i][k];
            }
        }
    }
}

void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
  static_assert (N == 4);
  static_assert (sizeof (FP_TYPE) == 8);
  static_assert (N * sizeof(FP_TYPE) == 256 / 8);
  // In addition the array in Matrix.m must be properly aligned

  //__m256d *B256 = (__m256d *) B;
  //__m256d *C256 = (__m256d *) C;

  // load values from B
  __m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
  __m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
  __m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
  __m256d b3 = _mm256_loadu_pd (&b.m[3][0]);

  for (size_t i = 0; i < 4; i++)
    {
      // load values from A
      __m256d a0 = _mm256_set1_pd (a.m[i][0]);
      __m256d a1 = _mm256_set1_pd (a.m[i][1]);
      __m256d a2 = _mm256_set1_pd (a.m[i][2]);
      __m256d a3 = _mm256_set1_pd (a.m[i][3]);

      __m256d sum;
      sum = _mm256_mul_pd (a0, b0);
      sum = _mm256_fmadd_pd (a1, b1, sum);
      sum = _mm256_fmadd_pd (a2, b2, sum);
      sum = _mm256_fmadd_pd (a3, b3, sum);

      //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
      _mm256_storeu_pd(&c.m[i][0], sum);
    }
}

void init_matrix (const Matrix &source_matrix, Matrix &matrix, size_t iteration)
{
  for (size_t j = 0; j < N; j++)
    {
      for (size_t k = 0; k < N; k++)
        {
          matrix.m[j][k] = source_matrix.m[j][k] + 1.0 / static_cast<FP_TYPE>
                           (iteration + 1);
        }
    }
}

void test_run (const std::vector<Matrix> &source_matrices,
               const multiply_func &func, const std::string &func_name)
{
  clock_t timer = clock ();

  Matrix matrix_dummy1;
  Matrix matrix_dummy2;
  Matrix matrix_dummy3;

  std::vector<Matrix> matrices (POWER_FACTOR);
  FP_TYPE trace = 0.0;

  // Read source matrices in own data format
  // We do the whole process ITERATIONS times to get less error for the runtime .
  for (size_t n = 0; n < ITERATIONS; n++)
    {
      if constexpr (GENERATE_NEW_RANDOMS)
        {
          for (auto &matrix : matrices)
            {
              randomize_matrix (matrix);
            }
        }
      else
        {
          for (size_t i = 0; i < POWER_FACTOR; i++)
            {
              init_matrix (source_matrices[i], matrices[i], n);
            }
        }

      func (matrices[0], matrices[1], matrix_dummy1);
      func (matrices[2], matrices[3], matrix_dummy2);
      func (matrix_dummy1, matrix_dummy2, matrix_dummy3);

      for (size_t j = 0; j < N; j++)
        {
          trace += matrix_dummy3.m[j][j];
        }
    }
  std::cout << std::setprecision(15);
  std::cout << "Trace " << func_name << " = \t";
  std::cout << trace / static_cast<FP_TYPE> (ITERATIONS);
  std::cout << "    took ";
  std::cout << static_cast<double> (clock() - timer) / CLOCKS_PER_SEC << "s\n";
  std::cout << std::endl;
}

int main ()
{
  std::vector<Matrix> source_matrices (POWER_FACTOR);
  std::cout << "Beginning computation\n";
  std::cout << std::endl;
  for (auto &matrix : source_matrices)
    {
      randomize_matrix (matrix);
    }
  test_run (source_matrices, multiply_plain, "For Point");
  test_run (source_matrices, multiply_intrinsics, "Intrinsics");
}

BTW:要使用g ++或clang ++进行编译,你必须添加-march=haswell(或者你拥有的任何CPU)。