我正在学习如何在 OpenMP / Fortran 中使用 SIMD 指令。我写了一段简单的代码:
! Micro-benchmark: for every i in 1..10000, add 1/i to x ten million times,
! then print the accumulated sum.
program loop
implicit none
integer :: i,j
! x is the only floating-point variable; it is declared double precision.
real*8 :: x
x = 0.0
do i=1,10000
do j=1,10000000
! The added term does not depend on j, so it is invariant in the inner loop.
! NOTE(review): 1.0 is a default-real literal, so 1.0/(1.0*i) is presumably
! evaluated in single precision and only widened for the add into x -- confirm.
x = x + 1.0/(1.0*i)
enddo
enddo
print*, x
end program loop
当我编译此代码并运行它时,我得到:
ifort -O3 -vec-report3 -xhost loop_simd.f90
loop_simd.f90(10): (col. 12) remark: LOOP WAS VECTORIZED
loop_simd.f90(9): (col. 7) remark: loop was not vectorized: not inner loop
time ./a.out
97876060.8355515
real 0m8.940s
user 0m8.937s
sys 0m0.005s
我按照编译器关于"not inner loop"(不是内层循环)的提示,添加了 SIMD collapse(2) 指令:
! Same micro-benchmark as the baseline program, with an OpenMP SIMD directive
! placed on the loop nest: for every i in 1..10000, add 1/i to x ten million
! times, then print the accumulated sum.
program loop
implicit none
integer :: i,j
! x is the reduction variable named in the directive below.
real*8 :: x
x = 0.0
! The directive below requests SIMD execution of the two-deep loop nest
! (collapse(2)) with x combined as a sum reduction (reduction(+:x)).
! A reduction lets the partial sums be formed in a different order than the
! sequential loop, so the last digits of the result may differ.
!$omp simd collapse(2) reduction(+:x)
do i=1,10000
do j=1,10000000
! The added term does not depend on j.
! NOTE(review): 1.0 is a default-real literal, so 1.0/(1.0*i) is presumably
! evaluated in single precision and only widened for the add into x -- confirm.
x = x + 1.0/(1.0*i)
enddo
enddo
print*, x
end program loop
然后我再次编译并运行代码,得到了以下输出:
ifort -O3 -vec-report3 -openmp -xhost loop_simd.f90
loop_simd.f90(8): (col. 7) remark: OpenMP SIMD LOOP WAS VECTORIZED
time ./a.out
97876054.9903757
real 0m26.535s
user 0m26.540s
sys 0m0.003s
我不知道为什么SIMD会降低性能? 什么时候SIMD会比标准的Fortran代码更好?
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -openmp -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
# MAIN__ -- entry point of the compiled Fortran main program (build with
# -openmp, AT&T syntax). This part sets up the stack frame and calls the
# runtime initialisation routines before the loop nest starts.
# The trailing "#L.C" tags on instructions appear to be source line.column
# positions matching the .loc directives.
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
# Frame setup: save rbp, keep the old rsp in rbp, then align rsp down to a
# 128-byte boundary and reserve 128 bytes of scratch space.
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
# Call __intel_new_feature_proc_init with edi = 3 and rsi = 0x117fe.
# NOTE(review): presumably a CPU-feature check for the -xhost target; the
# meaning of the two arguments is not visible in this listing.
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.12: # Preds ..B1.1
..LN8:
# Read MXCSR into the stack slot, OR in 32832 (0x8040), and load it back.
# 0x8040 is bit 6 plus bit 15, i.e. DAZ and FTZ per the Intel SDM.
# The argument setup for __kmpc_begin (edi = location struct, esi = 0,
# eax = 0) is interleaved with that read-modify-write.
vstmxcsr (%rsp) #1.9
..LN9:
movl $.2.3_2_kmpc_loc_struct_pack.1, %edi #1.9
..LN10:
xorl %esi, %esi #1.9
..LN11:
orl $32832, (%rsp) #1.9
..LN12:
xorl %eax, %eax #1.9
..LN13:
vldmxcsr (%rsp) #1.9
..___tag_value_MAIN__.6: #1.9
..LN14:
call __kmpc_begin #1.9
..___tag_value_MAIN__.7: #
..LN15:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.12
..LN16:
# Call for_set_reentrancy with the address of __NLITPACK_0.0.1 in edi.
movl $__NLITPACK_0.0.1, %edi #1.9
..LN17:
call for_set_reentrancy #1.9
..LN18:
# LOE rbx r12 r13 r14 r15
# ..B1.3 -- loop-nest setup (OpenMP SIMD build).
#   ymm2 = four double-precision partial sums, zeroed here
#   xmm1 = {4,4,4,4}, the step added to the index vector after each outer pass
#   xmm0 = index vector, loaded from .L_2il0floatpacket.19
#   eax  = outer-loop counter, starts at 0 and advances by 4
..B1.3: # Preds ..B1.2
..LN19:
.loc 1 8 is_stmt 1
movl $4, %eax #8.7
..LN20:
.loc 1 6 is_stmt 1
vxorpd %ymm2, %ymm2, %ymm2 #6.7
..LN21:
.loc 1 8 is_stmt 1
vmovd %eax, %xmm0 #8.7
..LN22:
xorl %eax, %eax #8.7
..LN23:
# Shuffle control 0 copies dword 0 of xmm0 (the value 4) into all four lanes.
vpshufd $0, %xmm0, %xmm1 #8.7
..LN24:
vmovdqu .L_2il0floatpacket.19(%rip), %xmm0 #8.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
# ..B1.4 -- once per outer pass: ymm3 = reciprocals of the four integers in
# xmm0, widened to double. The reciprocal is formed in single precision:
# vrcpps yields an approximation r of 1/d, and the mul/add/mul/sub sequence
# evaluates 2*r - d*r*r (one Newton-Raphson refinement step) before the
# conversion to double. edx (inner-loop counter) is reset to 0 here.
..B1.4: # Preds ..B1.6 ..B1.3
..LN26:
.loc 1 11 is_stmt 1
vcvtdq2ps %xmm0, %xmm3 #11.34
..LN27:
vrcpps %xmm3, %xmm5 #11.28
..LN28:
vmulps %xmm3, %xmm5, %xmm4 #11.28
..LN29:
vaddps %xmm5, %xmm5, %xmm6 #11.28
..LN30:
vmulps %xmm5, %xmm4, %xmm7 #11.28
..LN31:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN32:
.loc 1 11 is_stmt 1
vsubps %xmm7, %xmm6, %xmm8 #11.28
..LN33:
vcvtps2pd %xmm8, %ymm3 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
# ..B1.5 -- inner loop: edx counts up by 1 until it reaches 10000000.
# Each trip issues a single vaddpd into the one accumulator ymm2, so every
# add has to wait for the result of the previous add.
..B1.5: # Preds ..B1.5 ..B1.4
..LN35:
.loc 1 10 is_stmt 1
incl %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm3, %ymm2, %ymm2 #11.17
..LN37:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN38:
jb ..B1.5 # Prob 99% #10.12
..LN39:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
# ..B1.6 -- outer-loop latch: advance the counter by 4 and the index vector
# by {4,4,4,4}; branch back to ..B1.4 while eax is below 10000 (unsigned).
..B1.6: # Preds ..B1.5
..LN40:
.loc 1 8 is_stmt 1
addl $4, %eax #8.7
..LN41:
.loc 1 10 is_stmt 1
vpaddd %xmm1, %xmm0, %xmm0 #10.12
..LN42:
.loc 1 8 is_stmt 1
cmpl $10000, %eax #8.7
..LN43:
jb ..B1.4 # Prob 66% #8.7
..LN44:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
# ..B1.7 -- after the loop nest: reduce the four doubles in ymm2 to one
# scalar and pass it to for_write_seq_lis. The reduction instructions (tagged
# #6.7) are interleaved with the call's argument setup (tagged #15.7).
#   reduction: xmm0 = upper half of ymm2; xmm1 = lower half + upper half;
#              xmm3 = high element of xmm1; xmm4 = low + high (scalar add)
#   call args: rdi = rsp (the qword there is set to 0), esi = -1,
#              rdx = 0x1208384ff00, ecx = __STRLITPACK_0.0.1,
#              r8 = rsp+64 (where the scalar sum is stored), eax = 0
# NOTE(review): for_write_seq_lis is presumably the runtime routine behind
# the list-directed print of x; the meaning of the constant arguments is not
# visible in this listing.
..B1.7: # Preds ..B1.6
..LN45:
.loc 1 6 is_stmt 1
..LN46:
.loc 1 15 is_stmt 1
lea (%rsp), %rdi #15.7
..LN47:
.loc 1 6 is_stmt 1
vextractf128 $1, %ymm2, %xmm0 #6.7
..LN48:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN49:
.loc 1 6 is_stmt 1
vaddpd %xmm0, %xmm2, %xmm1 #6.7
..LN50:
vunpckhpd %xmm1, %xmm1, %xmm3 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea 64(%rsp), %r8 #15.7
..LN52:
movq $0x1208384ff00, %rdx #15.7
..LN53:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN54:
xorl %eax, %eax #15.7
..LN55:
.loc 1 6 is_stmt 1
vaddsd %xmm3, %xmm1, %xmm4 #6.7
..LN56:
.loc 1 15 is_stmt 1
vmovsd %xmm4, 64(%rsp) #15.7
..LN57:
movq $0, (%rsp) #15.7
..LN58:
# Clear the upper halves of the ymm registers before calling out of
# AVX code.
vzeroupper #15.7
..LN59:
call for_write_seq_lis #15.7
..LN60:
# LOE rbx r12 r13 r14 r15
# ..B1.8 -- call __kmpc_end with the second location struct in edi, eax = 0.
..B1.8: # Preds ..B1.7
..LN61:
.loc 1 18 is_stmt 1
movl $.2.3_2_kmpc_loc_struct_pack.12, %edi #18.1
..LN62:
xorl %eax, %eax #18.1
..___tag_value_MAIN__.8: #18.1
..LN63:
call __kmpc_end #18.1
..___tag_value_MAIN__.9: #
..LN64:
# LOE rbx r12 r13 r14 r15
# ..B1.9 -- epilogue: return value eax = 1, undo the aligned frame by
# restoring rsp from rbp, restore rbp, return.
..B1.9: # Preds ..B1.8
..LN65:
movl $1, %eax #18.1
..LN66:
movq %rbp, %rsp #18.1
..LN67:
popq %rbp #18.1
..___tag_value_MAIN__.10: #
..LN68:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.12: #
..LN69:
# LOE
..LN70:
# mark_end;
.type MAIN__,@function
.size MAIN__,.-MAIN__
..LNMAIN__.71:
.LNMAIN__:
# Writable data: the two location records whose addresses are passed in edi
# to __kmpc_begin and __kmpc_end. Each record is four 32-bit fields followed
# by a pointer to a byte string.
.data
.align 4
.align 4
.2.3_2_kmpc_loc_struct_pack.1:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.0
.align 4
# The bytes below are the ASCII text ";unknown;MAIN__;1;1;;"
.2.3_2__kmpc_loc_pack.0:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 59
.byte 49
.byte 59
.byte 59
.space 3, 0x00 # pad
.align 4
.2.3_2_kmpc_loc_struct_pack.12:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.11
.align 4
# The bytes below are the ASCII text ";unknown;MAIN__;18;18;;"
.2.3_2__kmpc_loc_pack.11:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 56
.byte 59
.byte 49
.byte 56
.byte 59
.byte 59
# Read-only constants referenced by MAIN__ in this listing.
.section .rodata, "a"
.align 16
.align 8
# Two 32-bit words (2, 0); its address is the argument to for_set_reentrancy.
__NLITPACK_0.0.1:
.long 0x00000002,0x00000000
.align 4
# Five-byte record whose address is passed in ecx to for_write_seq_lis.
# NOTE(review): presumably an I/O item descriptor; the layout is not visible
# here.
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 16
# Four 32-bit integers {1,2,3,4}: the initial index vector loaded into xmm0
# before the loop nest.
.L_2il0floatpacket.19:
.long 0x00000001,0x00000002,0x00000003,0x00000004
.type .L_2il0floatpacket.19,@object
.size .L_2il0floatpacket.19,16
.align 16
# Four copies of 0x3f800000 (single-precision 1.0). No instruction in the
# MAIN__ code of this listing references this label.
.L_2il0floatpacket.20:
.long 0x3f800000,0x3f800000,0x3f800000,0x3f800000
.type .L_2il0floatpacket.20,@object
.size .L_2il0floatpacket.20,16
.data
.section .note.GNU-stack, ""
# End
非openmp代码的ASM输出
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
# MAIN__ -- entry point of the compiled Fortran main program (build without
# -openmp, AT&T syntax). This part sets up the stack frame and calls the
# runtime initialisation routines before the loop nest starts.
# The trailing "#L.C" tags on instructions appear to be source line.column
# positions matching the .loc directives.
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
# Frame setup: save rbp, keep the old rsp in rbp, then align rsp down to a
# 128-byte boundary and reserve 128 bytes of scratch space.
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
# Call __intel_new_feature_proc_init with edi = 3 and rsi = 0x117fe.
# NOTE(review): presumably a CPU-feature check for the -xhost target; the
# meaning of the two arguments is not visible in this listing.
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
..LN8:
# Read MXCSR into the stack slot, OR in 32832 (0x8040), and load it back.
# 0x8040 is bit 6 plus bit 15, i.e. DAZ and FTZ per the Intel SDM.
# The argument for for_set_reentrancy (edi = address of __NLITPACK_0.0.1) is
# loaded in between.
vstmxcsr (%rsp) #1.9
..LN9:
movl $__NLITPACK_0.0.1, %edi #1.9
..LN10:
orl $32832, (%rsp) #1.9
..LN11:
vldmxcsr (%rsp) #1.9
..LN12:
call for_set_reentrancy #1.9
..LN13:
# LOE rbx r12 r13 r14 r15
# ..B1.2 -- loop-nest setup (build without -openmp).
#   xmm6 = scalar single loaded from .L_2il0floatpacket.0 (the dividend below)
#   eax  = outer-loop counter i, starts at 0 and is incremented at the top of
#          each outer pass
#   ymm8, ymm7, ymm0..ymm5 = eight separate accumulators of four doubles each,
#          all zeroed here
..B1.2: # Preds ..B1.10
..LN14:
.loc 1 6 is_stmt 1
..LN15:
.loc 1 11 is_stmt 1
vmovss .L_2il0floatpacket.0(%rip), %xmm6 #11.28
..LN16:
.loc 1 9 is_stmt 1
xorl %eax, %eax #9.7
..LN17:
.loc 1 6 is_stmt 1
vxorpd %ymm8, %ymm8, %ymm8 #6.7
..LN18:
vmovapd %ymm8, %ymm7 #6.7
..LN19:
vmovapd %ymm8, %ymm0 #6.7
..LN20:
vmovapd %ymm8, %ymm1 #6.7
..LN21:
vmovapd %ymm8, %ymm2 #6.7
..LN22:
vmovapd %ymm8, %ymm3 #6.7
..LN23:
vmovapd %ymm8, %ymm4 #6.7
..LN24:
vmovapd %ymm8, %ymm5 #6.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
# ..B1.3 -- once per outer pass: i = i + 1, then ymm9 = four copies of
# xmm6 / float(i). The division is a scalar single-precision vdivss; the
# quotient is widened to double, duplicated into both halves of xmm11, and
# copied into the upper 128 bits to fill ymm9. edx (inner-loop counter) is
# reset to 0 here.
..B1.3: # Preds ..B1.5 ..B1.2
..LN26:
incl %eax #
..LN27:
.loc 1 11 is_stmt 1
# Zero xmm9 first so the convert below does not depend on its old contents.
vxorps %xmm9, %xmm9, %xmm9 #11.28
..LN28:
vcvtsi2ss %eax, %xmm9, %xmm9 #11.28
..LN29:
vdivss %xmm9, %xmm6, %xmm10 #11.28
..LN30:
vcvtss2sd %xmm10, %xmm10, %xmm10 #11.28
..LN31:
vmovddup %xmm10, %xmm11 #11.28
..LN32:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN33:
.loc 1 11 is_stmt 1
vinsertf128 $1, %xmm11, %ymm11, %ymm9 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
# ..B1.4 -- inner loop: edx advances by 32 per trip until it reaches
# 10000000. Each trip adds ymm9 into eight different accumulators, so the
# eight vaddpd instructions are independent of one another; each accumulator
# only depends on its own value from the previous trip.
..B1.4: # Preds ..B1.4 ..B1.3
..LN35:
.loc 1 10 is_stmt 1
addl $32, %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm9, %ymm8, %ymm8 #11.17
..LN37:
vaddpd %ymm7, %ymm9, %ymm7 #11.17
..LN38:
vaddpd %ymm0, %ymm9, %ymm0 #11.17
..LN39:
vaddpd %ymm1, %ymm9, %ymm1 #11.17
..LN40:
vaddpd %ymm2, %ymm9, %ymm2 #11.17
..LN41:
vaddpd %ymm3, %ymm9, %ymm3 #11.17
..LN42:
vaddpd %ymm4, %ymm9, %ymm4 #11.17
..LN43:
vaddpd %ymm5, %ymm9, %ymm5 #11.17
..LN44:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN45:
jb ..B1.4 # Prob 99% #10.12
..LN46:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
# ..B1.5 -- outer-loop latch: branch back to ..B1.3 while eax is below 10000
# (unsigned compare).
..B1.5: # Preds ..B1.4
..LN47:
.loc 1 9 is_stmt 1
cmpl $10000, %eax #9.7
..LN48:
jb ..B1.3 # Prob 66% #9.7
..LN49:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
# ..B1.6 -- after the loop nest: combine the eight vector accumulators
# pairwise (8 -> 4 -> 2 -> 1, result in ymm5), reduce the four doubles of
# ymm5 to one scalar, and pass it to for_write_seq_lis. The reduction
# instructions (tagged #6.7) are interleaved with the call's argument setup
# (tagged #15.7).
#   call args: rdi = rsp (the qword there is set to 0), esi = -1,
#              rdx = 0x1208384ff00, ecx = __STRLITPACK_0.0.1,
#              r8 = rsp+64 (where the scalar sum is stored), eax = 0
# NOTE(review): for_write_seq_lis is presumably the runtime routine behind
# the list-directed print of x; the meaning of the constant arguments is not
# visible in this listing.
..B1.6: # Preds ..B1.5
..LN50:
.loc 1 6 is_stmt 1
vaddpd %ymm7, %ymm8, %ymm6 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea (%rsp), %rdi #15.7
..LN52:
.loc 1 6 is_stmt 1
vaddpd %ymm1, %ymm0, %ymm0 #6.7
..LN53:
vaddpd %ymm3, %ymm2, %ymm1 #6.7
..LN54:
vaddpd %ymm5, %ymm4, %ymm2 #6.7
..LN55:
vaddpd %ymm0, %ymm6, %ymm3 #6.7
..LN56:
vaddpd %ymm2, %ymm1, %ymm4 #6.7
..LN57:
vaddpd %ymm4, %ymm3, %ymm5 #6.7
..LN58:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN59:
movq $0x1208384ff00, %rdx #15.7
..LN60:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN61:
xorl %eax, %eax #15.7
..LN62:
lea 64(%rsp), %r8 #15.7
..LN63:
movq $0, (%rsp) #15.7
..LN64:
.loc 1 6 is_stmt 1
# Horizontal sum of ymm5: upper half + lower half, then high + low element.
vextractf128 $1, %ymm5, %xmm7 #6.7
..LN65:
vaddpd %xmm7, %xmm5, %xmm8 #6.7
..LN66:
vunpckhpd %xmm8, %xmm8, %xmm9 #6.7
..LN67:
vaddsd %xmm9, %xmm8, %xmm10 #6.7
..LN68:
.loc 1 15 is_stmt 1
vmovsd %xmm10, 64(%rsp) #15.7
..LN69:
# Clear the upper halves of the ymm registers before calling out of
# AVX code.
vzeroupper #15.7
..LN70:
call for_write_seq_lis #15.7
..LN71:
# LOE rbx r12 r13 r14 r15
# ..B1.7 -- epilogue: return value eax = 1, undo the aligned frame by
# restoring rsp from rbp, restore rbp, return.
..B1.7: # Preds ..B1.6
..LN72:
.loc 1 18 is_stmt 1
movl $1, %eax #18.1
..LN73:
movq %rbp, %rsp #18.1
..LN74:
popq %rbp #18.1
..___tag_value_MAIN__.6: #
..LN75:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.8: #
..LN76:
# LOE
..LN77:
# mark_end;
.type MAIN__,@function
.size MAIN__,.-MAIN__
..LNMAIN__.78:
.LNMAIN__:
# Read-only constants referenced by MAIN__ in this listing.
.section .rodata, "a"
.align 8
.align 8
# Two 32-bit words (0, 0); its address is the argument to for_set_reentrancy.
__NLITPACK_0.0.1:
.long 0x00000000,0x00000000
.align 4
# Five-byte record whose address is passed in ecx to for_write_seq_lis.
# NOTE(review): presumably an I/O item descriptor; the layout is not visible
# here.
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 4
# 0x3f800000 is single-precision 1.0: the dividend loaded into xmm6 and used
# by the vdivss that forms the per-i term.
.L_2il0floatpacket.0:
.long 0x3f800000
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,4
.data
.section .note.GNU-stack, ""
# End
答案 0 :(得分:4)
使用 OpenMP 时,ifort 用 SIMD 对外层循环(遍历 i 的循环)进行了矢量化,因此基本上所有时间都花在了下面这个内层循环上:
## set up ymm3 with 4 copies of 1.0/(1.0*i),
# and j = %edx = 0
..B1.5: do {
incl %edx # j++
vaddpd %ymm3, %ymm2, %ymm2 # ymm3 + ymm2 => ymm2
cmpl $10000000, %edx } while(j<10000000);
jb ..B1.5 # Prob 99%
vaddpd 的 1000 万次迭代的开销完全压倒了循环之外的一切,所以关键在于这个内层循环总共要执行 10000/4 = 2500 遍。(注意 addl $4, %eax / cmpl $10000, %eax / jb 这几条指令:分支目标回到内层循环之前的准备代码。)
由于只使用了一个累加器,吞吐量受限于循环携带的依赖链:每条 vaddpd 都必须等上一条完成(延迟 3 个周期)。
它仍在完成全部工作,而不是优化任何循环。
不使用 OpenMP 指令时,编译器同样进行了自动矢量化,但使用了多个累加器来提高指令级并行度:多条加法指令可以同时执行,而不必每一条都依赖前一条。
内循环的设置非常相似,然后内循环是:
## set up ymm9 with 4 copies of 1.0/(1.0*i),
..B1.4:
addl $32, %edx #10.12
vaddpd %ymm9, %ymm8, %ymm8 # ymm8 + ymm9 => ymm8
vaddpd %ymm7, %ymm9, %ymm7 # ymm7 + ymm9 => ymm7
vaddpd %ymm0, %ymm9, %ymm0 # ymm0 + ymm9 => ymm0
vaddpd %ymm1, %ymm9, %ymm1 # ...
vaddpd %ymm2, %ymm9, %ymm2
vaddpd %ymm3, %ymm9, %ymm3
vaddpd %ymm4, %ymm9, %ymm4
vaddpd %ymm5, %ymm9, %ymm5
cmpl $10000000, %edx
jb ..B1.4 # Prob 99%
# then combine the 8 vector accumulators down to one, and horizontal sum that.
8 个累加器可以让 8 条 vaddpd 同时处于执行中(in flight),不过在 Intel SnB / IvB 上 vaddpd 的延迟只有 3 个周期(参见 Agner Fog 的指令表)。你没有说明所用的微架构,但从 -xhost 生成了 AVX1 而不是 AVX2 代码这一点,我可以推断是 Sandybridge / Ivybridge。(广播用的是 vmovddup / vinsertf128,而不是 AVX2 的 vbroadcastsd %xmm9, %ymm9。)
这完美地解释了 3 倍的速度差:26.535 / 8.940 = 2.97 ≈ 3。(在 Skylake 之前的 Intel CPU 上,vaddpd 的吞吐量为每周期 1 条,延迟为 3 个周期。)由于指令级并行度更高,使用多个累加器的版本受限于吞吐量,而不是延迟。
展开出这么多累加器在 Skylake 上会有帮助:那里的浮点加法延迟为 4 个周期,吞吐量为每周期 2 条。(SKL 去掉了端口 1 上低延迟的专用向量浮点加法单元,改为在端口 0 和 1 上的 FMA 单元中执行加法,其延迟改进为 4 个周期。)
答案 1 :(得分:3)
最好只对内层循环使用 SIMD,然后可以在外层循环上使用 !$omp parallel。由于 i 只与外层循环有关,你也可以(或许应该)交换内外层循环。如果把 1.0/(1.0*i) 赋给另一个变量,也许就能矢量化;这样归约时会把这个新变量当作堆中的向量,而不是 OMP 中的私有值。
这些事情通常需要一些时间来解决......
答案 2 :(得分:-1)
SIMD指令旨在提高对矢量或数组进行操作的代码的性能。您的示例代码仅对标量变量进行操作,因此强制向量化不会提高性能并不足为奇!