[问题] 我在O3选项下运行以下代码。然后,我发现使用O3的代码的性能比不使用O3的代码的性能高9倍。
编辑: 我想知道优化技术的关键,而不是原因。这是我的问题。我从未经历过x86组装。因此,很难理解x86汇编代码。这就是我发布此问题的原因。或者,您能为我解释带有O3选项的代码吗? ................................................... .....................
[C代码] 该代码仅执行加法。
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
return 0;
}
[与O3组装]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",@progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, @function
minmax_scale:
.LFB0:
.cfi_startproc
pxor %xmm0, %xmm0
movl %edi, %edi
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .text.unlikely
.LCOLDB2:
.section .text.startup,"ax",@progbits
.LHOTB2:
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB1:
.cfi_startproc
xorl %eax, %eax
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .text.unlikely
.LCOLDE2:
.section .text.startup
.LHOTE2:
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",@progbits
[无O3的组装]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, @function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.globl main
.type main, @function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2096, %rsp
movl %edi, -2084(%rbp)
movq %rsi, -2096(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L6
.L7:
movl -2068(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2068(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2072(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L6:
cmpl $127, -2072(%rbp)
jle .L7
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L9
call __stack_chk_fail
.L9:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",@progbits
答案 0 :(得分:6)
您的代码没有可观察到的side-effects,因此优化程序只是丢弃了大部分代码。
使用-O3
将您的主要功能变为:
main:
xorl %eax, %eax
ret
等同于:
int main()
{
return 0;
}
这表明微基准测试代码可能很难正确执行。
编辑:
正如下面的评论所指出的,发布的代码未初始化ibuffer[INPUT_FEATURE]
。读取未初始化的变量是未定义的行为,这会使整个程序格式错误。这是一个实际的问题,不需要代码即可产生合理的结果。谢谢@chqrlie
答案 1 :(得分:0)
我修改了代码并进行了尝试,以反映您的答复,如下所示。结果与以前相同。 O3选项总比没有选择要好。
#define OFFSET (8)
#define INPUT_FEATURE (1024)
#define TSIZE (INPUT_FEATURE/OFFSET)
#include<stdio.h>
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = 0.0;
}
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
printf("%lf",H[k]);
}
return 0;
}
[带有O3选项的代码]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",@progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, @function
minmax_scale:
.LFB23:
.cfi_startproc
pxor %xmm0, %xmm0
movl %edi, %edi
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE23:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .rodata.str1.1,"aMS",@progbits,1
.LC5:
.string "%lf"
.section .text.unlikely
.LCOLDB6:
.section .text.startup,"ax",@progbits
.LHOTB6:
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB24:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $128, %ecx
pxor %xmm12, %xmm12
[无代码编码]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, @function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .rodata
.LC2:
.string "%lf"
.text
.globl main
.type main, @function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2128, %rsp
movl %edi, -2100(%rbp)
movq %rsi, -2112(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2088(%rbp)
movl $0, -2084(%rbp)
jmp .L6
.L7:
movl -2088(%rbp), %eax
cltq
pxor %xmm0, %xmm0
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2084(%rbp)
addl $1, -2088(%rbp)
.L6:
cmpl $127, -2088(%rbp)
jle .L7
movl $0, -2080(%rbp)
movl $0, -2076(%rbp)
jmp .L8
.L9:
movl -2076(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2076(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2080(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2076(%rbp)
addl $1, -2080(%rbp)
.L8:
cmpl $127, -2080(%rbp)
jle .L9
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L10
.L11:
movl -2072(%rbp), %eax
cltq
movq -2064(%rbp,%rax,8), %rax
movq %rax, -2120(%rbp)
movsd -2120(%rbp), %xmm0
movl $.LC2, %edi
movl $1, %eax
call printf
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L10:
cmpl $127, -2072(%rbp)
jle .L11
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L13
call __stack_chk_fail
.L13:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",@progbits