我在微基准测试中得到了奇怪的结果,用 Rust Book 中 Benchmark tests 一章的示例 bench_xor_1000_ints 即可重现。
该章的 Gotcha: optimizations 部分给出了一般建议:从 Bencher::iter 闭包中返回值,和/或使用 black_box 函数。我最终得到了这五个变种:
#![feature(test)]
extern crate test;
use test::{Bencher, black_box};
/// Returns the bitwise XOR of `x` and `y`.
/// Named-function counterpart of the inline closure `|a, b| a ^ b`; it is what
/// the `xor_pointer_*` and `xor_black_box` benchmarks pass to `fold`.
fn xor(x: i32, y: i32) -> i32 { x ^ y }
/// Benchmark: XOR-folds the range `0..n` using the inline closure `|a, b| a ^ b`.
/// The fold expression has no trailing semicolon, so its result is the value
/// returned from the closure handed to `b.iter`.
#[bench]
fn xor_closure_a(b: &mut Bencher) {
b.iter(|| {
// The upper bound is routed through `black_box` instead of being written as a
// literal in the range (presumably to keep it opaque to the optimizer).
let n = black_box(1000);
(0..n).fold(0, |a, b| a ^ b)
});
}
/// Benchmark: same fold as `xor_closure_a` (inline closure `|a, b| a ^ b`), but
/// the fold statement ends with a semicolon, so the closure handed to `b.iter`
/// returns `()` and the fold result is discarded.
#[bench]
fn xor_closure_b(b: &mut Bencher) {
b.iter(|| {
// The upper bound is routed through `black_box` instead of being written as a
// literal in the range (presumably to keep it opaque to the optimizer).
let n = black_box(1000);
// Trailing semicolon is deliberate: this variant does not return the result.
(0..n).fold(0, |a, b| a ^ b);
});
}
/// Benchmark: XOR-folds the range `0..n` by passing the named function `xor`
/// to `fold` instead of an inline closure.
/// The fold expression has no trailing semicolon, so its result is the value
/// returned from the closure handed to `b.iter`.
#[bench]
fn xor_pointer_a(b: &mut Bencher) {
b.iter(|| {
// The upper bound is routed through `black_box` instead of being written as a
// literal in the range (presumably to keep it opaque to the optimizer).
let n = black_box(1000);
(0..n).fold(0, xor)
});
}
/// Benchmark: same fold as `xor_pointer_a` (named function `xor`), but the fold
/// statement ends with a semicolon, so the closure handed to `b.iter` returns
/// `()` and the fold result is discarded.
#[bench]
fn xor_pointer_b(b: &mut Bencher) {
b.iter(|| {
// The upper bound is routed through `black_box` instead of being written as a
// literal in the range (presumably to keep it opaque to the optimizer).
let n = black_box(1000);
// Trailing semicolon is deliberate: this variant does not return the result.
(0..n).fold(0, xor);
});
}
/// Benchmark: XOR-folds the literal range `0..1000` with the named function
/// `xor` and passes the fold result to `black_box`. The statement ends with a
/// semicolon, so the closure handed to `b.iter` itself returns `()`.
#[bench] // closure/pointer or semicolon doesn't affect results
fn xor_black_box(b: &mut Bencher) {
b.iter(|| {
// Unlike the other variants, the bound is a plain literal here; only the
// result of the fold goes through `black_box`.
black_box((0..1000).fold(0, xor));
});
}
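其中:
- xor_closure_* 使用闭包 |a, b| a ^ b
- xor_pointer_* 使用函数指针 xor
- 后缀 _a:b.iter 闭包返回值(注意末尾没有分号)
- 后缀 _b:b.iter 闭包不返回值(以分号结尾)
- xor_black_box:把整个表达式包进 black_box,并希望得到最好的结果
基准测试结果:
test tests::xor_black_box ... bench: 69 ns/iter (+/- 1)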
test tests::xor_closure_a ... bench: 70 ns/iter (+/- 1)
test tests::xor_closure_b ... bench: 921 ns/iter (+/- 5)
test tests::xor_pointer_a ... bench: 60 ns/iter (+/- 1)
test tests::xor_pointer_b ... bench: 0 ns/iter (+/- 0)
关于测量的一些观察:
- 去掉 let n = black_box(1000) 并将其内联到范围中((0..1000).fold ...)不会影响任何结果;
- 改变 n 的大小会使测量值按比例变化(被优化为 0 的 xor_pointer_b 除外)。
总体而言,结果与 Rust Book 的一般建议一致,但 xor_closure_b 基准除外。
更具体地说,带有后缀 _a(返回值)的结果似乎与使用 black_box 的 xor_black_box 一致,这在我看来是合理的。xor_pointer_b 被优化为 0 似乎也说得通,因为它的闭包不返回值。但 xor_closure_b 的结果很奇怪。
汇编代码也许能澄清这个问题。How can I prevent the Rust benchmark library from optimizing away my code? 这个问答很好地概述了如何查看 Rust 基准测试生成的汇编。
cargo rustc --release -- --emit asm
生成this output:
.text
.file "fpinrust.cgu-0.rs"
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI0_0:
.long 0
.long 1
.long 2
.long 3
.LCPI0_1:
.long 4
.long 4
.long 4
.long 4
.LCPI0_2:
.long 8
.long 8
.long 8
.long 8
.LCPI0_3:
.long 12
.long 12
.long 12
.long 12
.LCPI0_4:
.long 16
.long 16
.long 16
.long 16
.LCPI0_5:
.long 20
.long 20
.long 20
.long 20
.LCPI0_6:
.long 24
.long 24
.long 24
.long 24
.LCPI0_7:
.long 28
.long 28
.long 28
.long 28
.LCPI0_8:
.long 32
.long 32
.long 32
.long 32
.section .text._ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E,@function
_ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E:
.cfi_startproc
pushq %r14
.Ltmp0:
.cfi_def_cfa_offset 16
pushq %rbx
.Ltmp1:
.cfi_def_cfa_offset 24
subq $40, %rsp
.Ltmp2:
.cfi_def_cfa_offset 64
.Ltmp3:
.cfi_offset %rbx, -24
.Ltmp4:
.cfi_offset %r14, -16
movq %rdi, %r14
leaq 24(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%r14), %r9
testq %r9, %r9
je .LBB0_15
xorl %ecx, %ecx
leaq 8(%rsp), %r8
movdqa .LCPI0_0(%rip), %xmm8
movdqa .LCPI0_1(%rip), %xmm15
movdqa .LCPI0_2(%rip), %xmm2
movdqa .LCPI0_3(%rip), %xmm9
movdqa .LCPI0_4(%rip), %xmm10
movdqa .LCPI0_5(%rip), %xmm11
movdqa .LCPI0_6(%rip), %xmm12
movdqa .LCPI0_7(%rip), %xmm13
movdqa .LCPI0_8(%rip), %xmm14
.p2align 4, 0x90
.LBB0_2:
incq %rcx
movl $1000, 8(%rsp)
#APP
#NO_APP
movl 8(%rsp), %esi
testl %esi, %esi
movl $0, %edx
jle .LBB0_14
xorl %edx, %edx
cmpl $8, %esi
jae .LBB0_5
xorl %edi, %edi
jmp .LBB0_13
.p2align 4, 0x90
.LBB0_5:
movl %esi, %eax
andl $-8, %eax
movl $0, %edi
je .LBB0_13
leal -8(%rax), %edx
movl %edx, %edi
shrl $3, %edi
leal 1(%rdi), %ebx
andl $3, %ebx
pxor %xmm3, %xmm3
cmpl $24, %edx
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
jb .LBB0_9
leal -1(%rbx), %edx
subl %edi, %edx
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
.p2align 4, 0x90
.LBB0_8:
movdqa %xmm5, %xmm6
paddd %xmm15, %xmm6
movdqa %xmm5, %xmm7
paddd %xmm2, %xmm7
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm4
paddd %xmm9, %xmm4
movdqa %xmm5, %xmm0
paddd %xmm10, %xmm0
pxor %xmm7, %xmm3
movdqa %xmm5, %xmm7
paddd %xmm11, %xmm7
movdqa %xmm5, %xmm1
paddd %xmm12, %xmm1
pxor %xmm4, %xmm7
pxor %xmm6, %xmm7
movdqa %xmm5, %xmm4
paddd %xmm13, %xmm4
paddd %xmm14, %xmm5
pxor %xmm0, %xmm1
pxor %xmm1, %xmm3
pxor %xmm7, %xmm4
addl $4, %edx
jne .LBB0_8
.LBB0_9:
testl %ebx, %ebx
je .LBB0_12
negl %ebx
.p2align 4, 0x90
.LBB0_11:
movdqa %xmm5, %xmm0
pxor %xmm5, %xmm3
paddd %xmm15, %xmm5
paddd %xmm2, %xmm0
pxor %xmm5, %xmm4
incl %ebx
movdqa %xmm0, %xmm5
jne .LBB0_11
.LBB0_12:
pxor %xmm4, %xmm3
pshufd $78, %xmm3, %xmm0
pxor %xmm3, %xmm0
pshufd $229, %xmm0, %xmm1
pxor %xmm0, %xmm1
movd %xmm1, %edx
cmpl %eax, %esi
movl %eax, %edi
je .LBB0_14
.p2align 4, 0x90
.LBB0_13:
xorl %edi, %edx
leal 1(%rdi), %eax
cmpl %eax, %esi
movl %eax, %edi
jne .LBB0_13
.LBB0_14:
movl %edx, 8(%rsp)
#APP
#NO_APP
cmpq %r9, %rcx
jne .LBB0_2
.LBB0_15:
leaq 8(%rsp), %rdi
leaq 24(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq 8(%rsp), %rax
movl 16(%rsp), %ecx
movq %rax, 8(%r14)
movl %ecx, 16(%r14)
addq $40, %rsp
popq %rbx
popq %r14
retq
.Lfunc_end0:
.size _ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E, .Lfunc_end0-_ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E
.cfi_endproc
.section .text._ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E,@function
_ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E:
.cfi_startproc
pushq %r14
.Ltmp5:
.cfi_def_cfa_offset 16
pushq %rbx
.Ltmp6:
.cfi_def_cfa_offset 24
subq $40, %rsp
.Ltmp7:
.cfi_def_cfa_offset 64
.Ltmp8:
.cfi_offset %rbx, -24
.Ltmp9:
.cfi_offset %r14, -16
movq %rdi, %r14
leaq 24(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%r14), %rax
testq %rax, %rax
je .LBB1_5
xorl %ecx, %ecx
leaq 8(%rsp), %rdx
.p2align 4, 0x90
.LBB1_2:
movl $1000, 8(%rsp)
#APP
#NO_APP
movl 8(%rsp), %esi
xorl %ebx, %ebx
.p2align 4, 0x90
.LBB1_3:
xorl %edi, %edi
cmpl %esi, %ebx
setl %dil
addl %ebx, %edi
cmpl %esi, %ebx
movl %edi, %ebx
jl .LBB1_3
incq %rcx
#APP
#NO_APP
cmpq %rax, %rcx
jne .LBB1_2
.LBB1_5:
leaq 8(%rsp), %rdi
leaq 24(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq 8(%rsp), %rax
movl 16(%rsp), %ecx
movq %rax, 8(%r14)
movl %ecx, 16(%r14)
addq $40, %rsp
popq %rbx
popq %r14
retq
.Lfunc_end1:
.size _ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E, .Lfunc_end1-_ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI2_0:
.long 0
.long 1
.long 2
.long 3
.LCPI2_1:
.long 4
.long 4
.long 4
.long 4
.LCPI2_2:
.long 8
.long 8
.long 8
.long 8
.LCPI2_3:
.long 12
.long 12
.long 12
.long 12
.LCPI2_4:
.long 16
.long 16
.long 16
.long 16
.LCPI2_5:
.long 20
.long 20
.long 20
.long 20
.LCPI2_6:
.long 24
.long 24
.long 24
.long 24
.LCPI2_7:
.long 28
.long 28
.long 28
.long 28
.LCPI2_8:
.long 32
.long 32
.long 32
.long 32
.section .text._ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E,@function
_ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E:
.cfi_startproc
pushq %r14
.Ltmp10:
.cfi_def_cfa_offset 16
pushq %rbx
.Ltmp11:
.cfi_def_cfa_offset 24
subq $40, %rsp
.Ltmp12:
.cfi_def_cfa_offset 64
.Ltmp13:
.cfi_offset %rbx, -24
.Ltmp14:
.cfi_offset %r14, -16
movq %rdi, %r14
leaq 24(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%r14), %r9
testq %r9, %r9
je .LBB2_15
xorl %ecx, %ecx
leaq 8(%rsp), %r8
movdqa .LCPI2_0(%rip), %xmm8
movdqa .LCPI2_1(%rip), %xmm15
movdqa .LCPI2_2(%rip), %xmm2
movdqa .LCPI2_3(%rip), %xmm9
movdqa .LCPI2_4(%rip), %xmm10
movdqa .LCPI2_5(%rip), %xmm11
movdqa .LCPI2_6(%rip), %xmm12
movdqa .LCPI2_7(%rip), %xmm13
movdqa .LCPI2_8(%rip), %xmm14
.p2align 4, 0x90
.LBB2_2:
incq %rcx
movl $1000, 8(%rsp)
#APP
#NO_APP
movl 8(%rsp), %esi
testl %esi, %esi
movl $0, %edx
jle .LBB2_14
xorl %edx, %edx
cmpl $8, %esi
jae .LBB2_5
xorl %edi, %edi
jmp .LBB2_13
.p2align 4, 0x90
.LBB2_5:
movl %esi, %eax
andl $-8, %eax
movl $0, %edi
je .LBB2_13
leal -8(%rax), %edx
movl %edx, %edi
shrl $3, %edi
leal 1(%rdi), %ebx
andl $3, %ebx
pxor %xmm3, %xmm3
cmpl $24, %edx
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
jb .LBB2_9
leal -1(%rbx), %edx
subl %edi, %edx
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
.p2align 4, 0x90
.LBB2_8:
movdqa %xmm5, %xmm6
paddd %xmm15, %xmm6
movdqa %xmm5, %xmm7
paddd %xmm2, %xmm7
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm4
paddd %xmm9, %xmm4
movdqa %xmm5, %xmm0
paddd %xmm10, %xmm0
pxor %xmm7, %xmm3
movdqa %xmm5, %xmm7
paddd %xmm11, %xmm7
movdqa %xmm5, %xmm1
paddd %xmm12, %xmm1
pxor %xmm4, %xmm7
pxor %xmm6, %xmm7
movdqa %xmm5, %xmm4
paddd %xmm13, %xmm4
paddd %xmm14, %xmm5
pxor %xmm0, %xmm1
pxor %xmm1, %xmm3
pxor %xmm7, %xmm4
addl $4, %edx
jne .LBB2_8
.LBB2_9:
testl %ebx, %ebx
je .LBB2_12
negl %ebx
.p2align 4, 0x90
.LBB2_11:
movdqa %xmm5, %xmm0
pxor %xmm5, %xmm3
paddd %xmm15, %xmm5
paddd %xmm2, %xmm0
pxor %xmm5, %xmm4
incl %ebx
movdqa %xmm0, %xmm5
jne .LBB2_11
.LBB2_12:
pxor %xmm4, %xmm3
pshufd $78, %xmm3, %xmm0
pxor %xmm3, %xmm0
pshufd $229, %xmm0, %xmm1
pxor %xmm0, %xmm1
movd %xmm1, %edx
cmpl %eax, %esi
movl %eax, %edi
je .LBB2_14
.p2align 4, 0x90
.LBB2_13:
xorl %edi, %edx
leal 1(%rdi), %eax
cmpl %eax, %esi
movl %eax, %edi
jne .LBB2_13
.LBB2_14:
movl %edx, 8(%rsp)
#APP
#NO_APP
cmpq %r9, %rcx
jne .LBB2_2
.LBB2_15:
leaq 8(%rsp), %rdi
leaq 24(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq 8(%rsp), %rax
movl 16(%rsp), %ecx
movq %rax, 8(%r14)
movl %ecx, 16(%r14)
addq $40, %rsp
popq %rbx
popq %r14
retq
.Lfunc_end2:
.size _ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E, .Lfunc_end2-_ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E
.cfi_endproc
.section .text._ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E,@function
_ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E:
.cfi_startproc
pushq %rbx
.Ltmp15:
.cfi_def_cfa_offset 16
subq $32, %rsp
.Ltmp16:
.cfi_def_cfa_offset 48
.Ltmp17:
.cfi_offset %rbx, -16
movq %rdi, %rbx
leaq 16(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%rbx), %rax
testq %rax, %rax
je .LBB3_3
leaq (%rsp), %rcx
.p2align 4, 0x90
.LBB3_2:
movl $1000, (%rsp)
#APP
#NO_APP
#APP
#NO_APP
decq %rax
jne .LBB3_2
.LBB3_3:
leaq (%rsp), %rdi
leaq 16(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq (%rsp), %rax
movl 8(%rsp), %ecx
movq %rax, 8(%rbx)
movl %ecx, 16(%rbx)
addq $32, %rsp
popq %rbx
retq
.Lfunc_end3:
.size _ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E, .Lfunc_end3-_ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI4_0:
.long 0
.long 1
.long 2
.long 3
.LCPI4_1:
.long 4
.long 4
.long 4
.long 4
.LCPI4_2:
.long 8
.long 8
.long 8
.long 8
.LCPI4_3:
.long 12
.long 12
.long 12
.long 12
.LCPI4_4:
.long 16
.long 16
.long 16
.long 16
.LCPI4_5:
.long 20
.long 20
.long 20
.long 20
.LCPI4_6:
.long 24
.long 24
.long 24
.long 24
.LCPI4_7:
.long 28
.long 28
.long 28
.long 28
.LCPI4_8:
.long 32
.long 32
.long 32
.long 32
.LCPI4_9:
.long 36
.long 36
.long 36
.long 36
.LCPI4_10:
.long 40
.long 40
.long 40
.long 40
.section .text._ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E,@function
_ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E:
.cfi_startproc
pushq %rbx
.Ltmp18:
.cfi_def_cfa_offset 16
subq $32, %rsp
.Ltmp19:
.cfi_def_cfa_offset 48
.Ltmp20:
.cfi_offset %rbx, -16
movq %rdi, %rbx
leaq 16(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%rbx), %rax
testq %rax, %rax
je .LBB4_5
xorl %ecx, %ecx
movdqa .LCPI4_1(%rip), %xmm9
movdqa .LCPI4_2(%rip), %xmm10
movdqa .LCPI4_3(%rip), %xmm11
movdqa .LCPI4_4(%rip), %xmm12
movdqa .LCPI4_5(%rip), %xmm13
movdqa .LCPI4_6(%rip), %xmm14
movdqa .LCPI4_7(%rip), %xmm15
movdqa .LCPI4_8(%rip), %xmm0
movdqa .LCPI4_9(%rip), %xmm1
movdqa .LCPI4_10(%rip), %xmm2
leaq (%rsp), %rdx
.p2align 4, 0x90
.LBB4_2:
pxor %xmm3, %xmm3
movl $1000, %esi
pxor %xmm4, %xmm4
movdqa .LCPI4_0(%rip), %xmm5
.p2align 4, 0x90
.LBB4_3:
movdqa %xmm5, %xmm6
paddd %xmm9, %xmm6
movdqa %xmm5, %xmm7
paddd %xmm10, %xmm7
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm4
paddd %xmm11, %xmm4
pxor %xmm7, %xmm3
movdqa %xmm5, %xmm7
paddd %xmm13, %xmm7
pxor %xmm4, %xmm7
movdqa %xmm5, %xmm4
paddd %xmm12, %xmm4
pxor %xmm6, %xmm7
movdqa %xmm5, %xmm6
paddd %xmm14, %xmm6
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm8
paddd %xmm15, %xmm8
pxor %xmm6, %xmm3
movdqa %xmm5, %xmm4
paddd %xmm0, %xmm4
pxor %xmm4, %xmm3
movdqa %xmm5, %xmm4
paddd %xmm1, %xmm4
pxor %xmm8, %xmm4
pxor %xmm7, %xmm4
paddd %xmm2, %xmm5
addl $-40, %esi
jne .LBB4_3
pxor %xmm3, %xmm4
pshufd $78, %xmm4, %xmm3
pxor %xmm4, %xmm3
pshufd $229, %xmm3, %xmm4
pxor %xmm3, %xmm4
incq %rcx
movd %xmm4, (%rsp)
#APP
#NO_APP
#APP
#NO_APP
cmpq %rax, %rcx
jne .LBB4_2
.LBB4_5:
leaq (%rsp), %rdi
leaq 16(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq (%rsp), %rax
movl 8(%rsp), %ecx
movq %rax, 8(%rbx)
movl %ecx, 16(%rbx)
addq $32, %rsp
popq %rbx
retq
.Lfunc_end4:
.size _ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E, .Lfunc_end4-_ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E
.cfi_endproc
.section ".note.GNU-stack","",@progbits
查看汇编可以发现,性能相近的 xor_closure_a、xor_pointer_a 和 xor_black_box 被编译成了类似的汇编代码。顺便说一句,它们的指令数量比可怜的 xor_closure_b 多得多。
我的研究到此为止。如果有人能解释为什么 xor_black_box 生成的这一大片汇编反而比 xor_closure_b 运行得更快,我会很高兴。或者换句话说,为什么本应被编译器优化掉的版本运行速度明显更慢,以及我应该信任哪个基准变体?
rustc --version
rustc 1.13.0-nightly (378195665 2016-09-08)
cargo --version
cargo 0.13.0-nightly (afaffa1 2016-09-06)
@FrancisGagné 在他的回答中对这个问题做了非常出色的调查。下面是我重现其结果的步骤。
编译基准并发出汇编:
cargo rustc --release -- --test --emit asm
运行生成的可执行文件以验证以前的行为:
./target/release/deps/xor --bench
running 2 tests
test tests::xor_closure_b ... bench: 925 ns/iter (+/- 9)
test tests::xor_pointer_b ... bench: 0 ns/iter (+/- 0)
test result: ok. 0 passed; 0 failed; 0 ignored; 2 measured
编译修补后的汇编输出 ./target/release/deps/xor.s:
export RUSTLIB=/path/to/lib/rustlib
gcc target/release/deps/xor.s $RUSTLIB/x86_64-unknown-linux-gnu/lib/*.rlib $RUSTLIB/x86_64-unknown-linux-gnu/lib/*.so -pthread -lpthread -lm -ldl
运行更新的基准测试:
./a.out --bench
running 2 tests
test tests::xor_closure_b ... bench: 1 ns/iter (+/- 0)
test tests::xor_pointer_b ... bench: 0 ns/iter (+/- 0)
test result: ok. 0 passed; 0 failed; 0 ignored; 2 measured
答案 0 :(得分:4)
我会专注于比较xor_closure_b
和xor_pointer_b
,因为它们应该有相似的表现(即它们都应该什么都不做)。
更新:正如@EOF所指出的,我在初步分析中犯了一个错误,因此我修改了以下内容。
首先,让我们看一下为这两个函数生成的LLVM IR。 (我发现LLVM IR比ASM更容易阅读,因为它更结构化。)
; Function Attrs: uwtable
define internal void @_ZN3xor13xor_closure_b17hb13913a8d2a27b06E(%"11.test::Bencher"* nocapture dereferenceable(32)) unnamed_addr #0 personality i32 (i32, i32, i64, %"8.unwind::libunwind::_Unwind_Exception"*, %"8.unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
entry-block:
%dummy.i.i = alloca {}, align 8
%dummy.i.i.i = alloca i32, align 4
%start1.i = alloca %"1.std::time::Instant", align 8
%tmp_ret2.i = alloca %"1.std::time::Duration", align 8
%1 = bitcast %"1.std::time::Duration"* %tmp_ret2.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %1)
%2 = bitcast %"1.std::time::Instant"* %start1.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %2)
call void @_ZN3std4time7Instant3now17h37bccd496c61083dE(%"1.std::time::Instant"* noalias nocapture nonnull sret dereferenceable(16) %start1.i)
%3 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 0
%4 = load i64, i64* %3, align 8
%5 = icmp eq i64 %4, 0
br i1 %5, label %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit, label %bb7.lr.ph.i
bb7.lr.ph.i: ; preds = %entry-block
%6 = bitcast i32* %dummy.i.i.i to i8*
%7 = bitcast {}* %dummy.i.i to i8*
br label %bb7.i
bb7.i: ; preds = %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i", %bb7.lr.ph.i
%iter.sroa.0.019.i = phi i64 [ 0, %bb7.lr.ph.i ], [ %11, %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i" ]
call void @llvm.lifetime.start(i64 4, i8* %6) #2
store i32 1000, i32* %dummy.i.i.i, align 4
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"(i32* nonnull %dummy.i.i.i) #2, !srcloc !1
%8 = load i32, i32* %dummy.i.i.i, align 4
call void @llvm.lifetime.end(i64 4, i8* %6) #2
br label %bb7.i.i.i
bb7.i.i.i: ; preds = %bb7.i.i.i, %bb7.i
%iter.sroa.0.0.i.i.i = phi i32 [ 0, %bb7.i ], [ %iter.sroa.0.1.i.i.i, %bb7.i.i.i ]
%9 = icmp slt i32 %iter.sroa.0.0.i.i.i, %8
%10 = zext i1 %9 to i32
%iter.sroa.0.1.i.i.i = add i32 %10, %iter.sroa.0.0.i.i.i
br i1 %9, label %bb7.i.i.i, label %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i"
"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i": ; preds = %bb7.i.i.i
%11 = add nuw i64 %iter.sroa.0.019.i, 1
call void @llvm.lifetime.start(i64 0, i8* %7)
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"({}* nonnull %dummy.i.i) #2, !srcloc !1
call void @llvm.lifetime.end(i64 0, i8* %7)
%exitcond.i = icmp eq i64 %11, %4
br i1 %exitcond.i, label %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit.loopexit, label %bb7.i
_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit.loopexit: ; preds = %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i"
br label %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit
_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit: ; preds = %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit.loopexit, %entry-block
call void @_ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE(%"1.std::time::Duration"* noalias nocapture nonnull sret dereferenceable(16) %tmp_ret2.i, %"1.std::time::Instant"* noalias nonnull readonly dereferenceable(16) %start1.i)
%12 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 0
%13 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 1
%14 = load i64, i64* %12, align 8
%15 = load i32, i32* %13, align 8
%16 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 0
store i64 %14, i64* %16, align 8
%17 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 1
store i32 %15, i32* %17, align 4
call void @llvm.lifetime.end(i64 16, i8* %2)
call void @llvm.lifetime.end(i64 16, i8* %1)
ret void
}
; Function Attrs: uwtable
define internal void @_ZN3xor13xor_pointer_b17h7ba0f9760d9fd9f8E(%"11.test::Bencher"* nocapture dereferenceable(32)) unnamed_addr #0 personality i32 (i32, i32, i64, %"8.unwind::libunwind::_Unwind_Exception"*, %"8.unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
entry-block:
%dummy.i.i = alloca {}, align 8
%dummy.i.i.i = alloca i32, align 4
%start1.i = alloca %"1.std::time::Instant", align 8
%tmp_ret2.i = alloca %"1.std::time::Duration", align 8
%1 = bitcast %"1.std::time::Duration"* %tmp_ret2.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %1)
%2 = bitcast %"1.std::time::Instant"* %start1.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %2)
call void @_ZN3std4time7Instant3now17h37bccd496c61083dE(%"1.std::time::Instant"* noalias nocapture nonnull sret dereferenceable(16) %start1.i)
%3 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 0
%4 = load i64, i64* %3, align 8
%5 = icmp eq i64 %4, 0
br i1 %5, label %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit, label %bb7.lr.ph.i
bb7.lr.ph.i: ; preds = %entry-block
%6 = bitcast i32* %dummy.i.i.i to i8*
%7 = bitcast {}* %dummy.i.i to i8*
br label %bb7.i
bb7.i: ; preds = %bb7.i, %bb7.lr.ph.i
%iter.sroa.0.019.i = phi i64 [ 0, %bb7.lr.ph.i ], [ %8, %bb7.i ]
%8 = add nuw i64 %iter.sroa.0.019.i, 1
call void @llvm.lifetime.start(i64 4, i8* %6) #2
store i32 1000, i32* %dummy.i.i.i, align 4
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"(i32* nonnull %dummy.i.i.i) #2, !srcloc !1
call void @llvm.lifetime.end(i64 4, i8* %6) #2
call void @llvm.lifetime.start(i64 0, i8* %7)
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"({}* nonnull %dummy.i.i) #2, !srcloc !1
call void @llvm.lifetime.end(i64 0, i8* %7)
%exitcond.i = icmp eq i64 %8, %4
br i1 %exitcond.i, label %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit.loopexit, label %bb7.i
_ZN4test7Bencher4iter17hae343b1316e5897bE.exit.loopexit: ; preds = %bb7.i
br label %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit
_ZN4test7Bencher4iter17hae343b1316e5897bE.exit: ; preds = %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit.loopexit, %entry-block
call void @_ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE(%"1.std::time::Duration"* noalias nocapture nonnull sret dereferenceable(16) %tmp_ret2.i, %"1.std::time::Instant"* noalias nonnull readonly dereferenceable(16) %start1.i)
%9 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 0
%10 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 1
%11 = load i64, i64* %9, align 8
%12 = load i32, i32* %10, align 8
%13 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 0
store i64 %11, i64* %13, align 8
%14 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 1
store i32 %12, i32* %14, align 4
call void @llvm.lifetime.end(i64 16, i8* %2)
call void @llvm.lifetime.end(i64 16, i8* %1)
ret void
}
如果我们比较 xor_closure_b 和 xor_pointer_b 的 LLVM IR,它们看起来非常相似。但是,有一个区别:bb7.i.i.i 块在 xor_pointer_b 中被优化掉了,而由于某些原因在 xor_closure_b 中没有被优化掉。这就是那个块:
bb7.i.i.i: ; preds = %bb7.i.i.i, %bb7.i
%iter.sroa.0.0.i.i.i = phi i32 [ 0, %bb7.i ], [ %iter.sroa.0.1.i.i.i, %bb7.i.i.i ]
%9 = icmp slt i32 %iter.sroa.0.0.i.i.i, %8
%10 = zext i1 %9 to i32
%iter.sroa.0.1.i.i.i = add i32 %10, %iter.sroa.0.0.i.i.i
br i1 %9, label %bb7.i.i.i, label %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i"
在这里它被翻译成ASM:
.LBB1_3:
xorl %edi, %edi
cmpl %esi, %ebx
setl %dil
addl %ebx, %edi
cmpl %esi, %ebx
movl %edi, %ebx
jl .LBB1_3
这是一种从0到1000循环的非常愚蠢的方法。将上面的代码更改为:
.LBB1_3:
cmpl %esi, %ebx
jge .LBB1_3a
incl %ebx
cmpl %esi, %ebx
jl .LBB1_3
.LBB1_3a:
使xor_closure_b
的基准在我的计算机上从781 ns/iter (+/- 19)
下降到270 ns/iter (+/- 7)
。
我不能肯定地说为什么编译器生成的代码如此之慢,也说不清它为什么一开始没有被优化掉(就像在 xor_pointer_b 中那样)……不过,xor_pointer_a 和 xor_closure_a 之所以更快,似乎只是因为生成的代码被矢量化了,这使循环执行的迭代次数减少到原来的 1/32(即循环被展开了;例如 xor_closure_a 中的主循环 .LBB0_8 执行 31 次迭代,其余部分在循环之后处理)。
作为参考,我用这个命令行编译了编辑过的ASM:
$ gcc target/release/xor-71758a2519026d86.s ~/.multirust/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib/lib{test,term,getopts,rustc_unicode,std,libc,rand,collections,alloc_system,alloc,core,panic_unwind}-411f48d3.rlib -pthread -lpthread -lm -ldl
然后运行生成的可执行文件。另外,我的CPU是Intel Core i7-4770K。