我正在学习 x64 编程,以及英特尔 C++ 编译器与 GCC 之间的差异和它们各自如何优化指令。
问题:
让英特尔编译器输出汇编代码(类似于 gcc -S)的最佳方法是什么? 目前我是在 Visual Studio 中调试并打开反汇编窗口来查看指令。
英特尔编译器生成的 psum1 的反汇编没有遵循用寄存器 rdi、rsi、rdx、rcx、r8、r9 传递参数的约定,而 GCC 的汇编输出遵循了这一约定。我在这里遗漏了什么?
出于某种原因,英特尔编译器没有优化内存访问,我需要更改哪些设置?
//intel compiler /Ox output
p[i] = p[i-1] + a[i];
000000013F79118B movss xmm1,dword ptr [rcx+rax*4+8]
000000013F791191 addss xmm0,dword ptr [rcx+rax*4+4]
000000013F791197 movss dword ptr [rdx+rax*4+4],xmm0
000000013F79119D addss xmm0,xmm1
000000013F7911A1 movss dword ptr [rdx+rax*4+8],xmm0
//GCC -O3 output
LBB1_3:
decq %rdx
LBB1_2:
addq $4, %rsi
addq $4, %rdi
addss (%rdi), %xmm0
movss %xmm0, (%rsi)
testq %rdx, %rdx
jne LBB1_3
LBB1_4:
// psum1: prefix sum of a[0..n-1] written to p[0..n-1], i.e. p[i] = a[0] + ... + a[i].
// p[0] is stored before n is examined, so both arrays must hold at least one
// element no matter what n is.
void psum1( float a[], float p[], long int n ) {
long int i;
p[0] = a[0];
for (i=1; i<n; i++) {
// Each result is built from the previous one, so iteration i depends on i-1.
p[i] = p[i-1] + a[i];
}
}
代码速度优先 (/Ot)
; Debugger listing (Intel syntax) of psum1 with the C source lines interleaved.
; Register use visible below: rcx = a, rdx = p, r8d = n (handled as a 32-bit value).
; The loop is unrolled by two; xmm0 carries the running sum through the unrolled
; loop, and only the final odd-element step reloads p[i-1] from memory.
; NOTE(review): the listing starts at psum1+6, so the instruction that sets eax
; and the compare that feeds the first jle are not shown.
void psum1(float a [],float p [],long int n){ long int i;
p[0] = a[0];
000000013F791156 movss xmm0,dword ptr [rcx]  ; xmm0 = a[0]
000000013F79115A mov dword ptr [rdx],eax  ; p[0] = eax; eax is set before the visible part (presumably the bits of a[0])
for( i=1; i<n; i++ ) {
000000013F79115C jle psum1+7Ah (13F7911CAh)  ; skip the loop entirely; flags come from a compare that is not shown
000000013F79115E mov eax,1  ; eax = 1: index used by the remainder step if the unrolled loop is skipped
000000013F791163 lea r10d,[r8-1]  ; r10d = n - 1 = number of loop iterations in the C source
000000013F791167 mov r11d,r10d
000000013F79116A xor r9d,r9d  ; r9d = 0: count of completed unrolled passes
000000013F79116D shr r11d,1Fh  ; r11d = sign bit of (n - 1)
000000013F791171 lea r8d,[r11+r8-1]  ; r8d = (n - 1) + sign bit, so the shift below rounds toward zero
000000013F791176 sar r8d,1  ; r8d = (n - 1) / 2 = number of two-element passes
000000013F791179 test r8d,r8d
000000013F79117C jbe psum1+5Eh (13F7911AEh)  ; test clears CF, so this is taken only when r8d == 0: go to the remainder check
p[i] = p[i-1] + a[i];
000000013F79117E lea eax,[r9+r9]  ; loop top: eax = 2 * pass
for( i=1; i<n; i++ ) {
000000013F791182 inc r9d  ; pass++
p[i] = p[i-1] + a[i];
000000013F791185 movsxd rax,eax  ; widen the index for 64-bit addressing; call it j
for( i=1; i<n; i++ ) {
000000013F791188 cmp r9d,r8d  ; flags for the jb at the loop bottom; the SSE moves/adds in between leave them intact
p[i] = p[i-1] + a[i];
000000013F79118B movss xmm1,dword ptr [rcx+rax*4+8]  ; xmm1 = a[j+2]
000000013F791191 addss xmm0,dword ptr [rcx+rax*4+4]  ; xmm0 += a[j+1]
000000013F791197 movss dword ptr [rdx+rax*4+4],xmm0  ; p[j+1] = xmm0
000000013F79119D addss xmm0,xmm1  ; xmm0 += a[j+2]
000000013F7911A1 movss dword ptr [rdx+rax*4+8],xmm0  ; p[j+2] = xmm0
for( i=1; i<n; i++ ) {
000000013F7911A7 jb psum1+2Eh (13F79117Eh)  ; repeat while pass < r8d (unsigned)
000000013F7911A9 lea eax,[r9+r9+1]  ; eax = 2 * passes + 1 = first index not yet written
000000013F7911AE lea r8d,[rax-1]  ; r8d = eax - 1 = iterations already done (also reached directly with eax = 1)
000000013F7911B2 cmp r10d,r8d
000000013F7911B5 jbe psum1+7Ah (13F7911CAh)  ; done unless n - 1 is (unsigned) greater, i.e. one odd element is left
p[i] = p[i-1] + a[i];
000000013F7911B7 movsxd rax,eax  ; rax = i, the last index
000000013F7911BA movss xmm0,dword ptr [rdx+rax*4-4]  ; xmm0 = p[i-1], reloaded from memory
000000013F7911C0 addss xmm0,dword ptr [rcx+rax*4]  ; xmm0 += a[i]
000000013F7911C5 movss dword ptr [rdx+rax*4],xmm0  ; p[i] = xmm0
}
}
000000013F7911CA ret
000000013F7911CB nop dword ptr [rax+rax]  ; multi-byte nop after ret, never executed (presumably alignment padding)
# psum1(float a[], float p[], long n) -- compiler output in AT&T syntax for
# Mach-O (__TEXT,__text section, underscore-prefixed symbol).
# In:  %rdi = a, %rsi = p, %rdx = n
# The running sum lives in %xmm0 for the whole function; p[i-1] is never
# reloaded from memory.
.section __TEXT,__text,regular,pure_instructions
.globl _psum1
.align 4, 0x90  # padding bytes are 0x90 (NOP)
_psum1:
Leh_func_begin1:  # NOTE(review): Leh_*/Ltmp* look like unwind-info markers -- confirm
pushq %rbp  # frame-pointer prologue
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movss (%rdi), %xmm0  # xmm0 = a[0]
movss %xmm0, (%rsi)  # p[0] = a[0], stored before n is checked
cmpq $2, %rdx
jl LBB1_4  # n < 2 (signed): nothing more to do
addq $-2, %rdx  # rdx = n - 2 = iterations remaining after the first one
jmp LBB1_2  # enter the loop below the decrement
.align 4, 0x90
LBB1_3:
decq %rdx  # one fewer iteration remaining
LBB1_2:
addq $4, %rsi  # advance p to the next float
addq $4, %rdi  # advance a to the next float
addss (%rdi), %xmm0  # sum += a[i]
movss %xmm0, (%rsi)  # p[i] = sum
testq %rdx, %rdx
jne LBB1_3  # loop until the remaining count reaches zero (n - 1 iterations in total)
LBB1_4:
popq %rbp  # epilogue
ret
Leh_func_end1: