我在微基准测试中得到了奇怪的结果,可以用 Rust Book 中 Benchmark tests 一章的示例 bench_xor_1000_ints 来复现。
其中 Gotcha: optimizations 部分给出了关于从 Bencher::iter 的闭包返回值以及使用 black_box 的一般建议。
extern crate test;
use test::{Bencher, black_box};
/// Returns the bitwise exclusive-or of `x` and `y`.
///
/// Kept as a named function so it can be passed to `fold` by name, in
/// contrast to the inline closure `|a, b| a ^ b` used by the closure variants.
fn xor(x: i32, y: i32) -> i32 {
    use std::ops::BitXor;

    x.bitxor(y)
}
fn xor_closure_a(b: &mut Bencher) {
b.iter(|| {
let n = black_box(1000);
(0..n).fold(0, |a, b| a ^ b)
fn xor_closure_b(b: &mut Bencher) {
b.iter(|| {
let n = black_box(1000);
(0..n).fold(0, |a, b| a ^ b);
fn xor_pointer_a(b: &mut Bencher) {
b.iter(|| {
let n = black_box(1000);
(0..n).fold(0, xor)
fn xor_pointer_b(b: &mut Bencher) {
b.iter(|| {
let n = black_box(1000);
(0..n).fold(0, xor);
#[bench] // closure/pointer or semicolon doesn't affect results
fn xor_black_box(b: &mut Bencher) {
b.iter(|| {
black_box((0..1000).fold(0, xor));
|a, b| a ^ b
并希望获得最佳性能。基准测试结果如下:
test tests::xor_black_box ... bench: 69 ns/iter (+/- 1)
test tests::xor_closure_a ... bench: 70 ns/iter (+/- 1)
test tests::xor_closure_b ... bench: 921 ns/iter (+/- 5)
test tests::xor_pointer_a ... bench: 60 ns/iter (+/- 1)
test tests::xor_pointer_b ... bench: 0 ns/iter (+/- 0)
(去掉 let n = black_box(1000) 并将其内联到范围 (0..1000).fold ... 的情况除外)
总体而言,结果与 Rust Book 的一般建议一致,但 xor_closure_b 是个例外。
更具体地说,带有后缀 _a(返回值)的结果似乎与使用 black_box 的 xor_black_box 一致,这在我看来是合理的。而 xor_pointer_b 则被优化成了 0 ns/iter。
汇编代码或许能澄清这个问题。《How can I prevent the Rust benchmark library from optimizing away my code?》这个问题很好地概述了如何查看 Rust 基准测试生成的汇编代码。
cargo rustc --release -- --emit asm
生成如下输出:
.file "fpinrust.cgu-0.rs"
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.long 0
.long 1
.long 2
.long 3
.long 4
.long 4
.long 4
.long 4
.long 8
.long 8
.long 8
.long 8
.long 12
.long 12
.long 12
.long 12
.long 16
.long 16
.long 16
.long 16
.long 20
.long 20
.long 20
.long 20
.long 24
.long 24
.long 24
.long 24
.long 28
.long 28
.long 28
.long 28
.long 32
.long 32
.long 32
.long 32
.section .text._ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E,@function
pushq %r14
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
subq $40, %rsp
.cfi_def_cfa_offset 64
.cfi_offset %rbx, -24
.cfi_offset %r14, -16
movq %rdi, %r14
leaq 24(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%r14), %r9
testq %r9, %r9
je .LBB0_15
xorl %ecx, %ecx
leaq 8(%rsp), %r8
movdqa .LCPI0_0(%rip), %xmm8
movdqa .LCPI0_1(%rip), %xmm15
movdqa .LCPI0_2(%rip), %xmm2
movdqa .LCPI0_3(%rip), %xmm9
movdqa .LCPI0_4(%rip), %xmm10
movdqa .LCPI0_5(%rip), %xmm11
movdqa .LCPI0_6(%rip), %xmm12
movdqa .LCPI0_7(%rip), %xmm13
movdqa .LCPI0_8(%rip), %xmm14
.p2align 4, 0x90
incq %rcx
movl $1000, 8(%rsp)
movl 8(%rsp), %esi
testl %esi, %esi
movl $0, %edx
jle .LBB0_14
xorl %edx, %edx
cmpl $8, %esi
jae .LBB0_5
xorl %edi, %edi
jmp .LBB0_13
.p2align 4, 0x90
movl %esi, %eax
andl $-8, %eax
movl $0, %edi
je .LBB0_13
leal -8(%rax), %edx
movl %edx, %edi
shrl $3, %edi
leal 1(%rdi), %ebx
andl $3, %ebx
pxor %xmm3, %xmm3
cmpl $24, %edx
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
jb .LBB0_9
leal -1(%rbx), %edx
subl %edi, %edx
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
.p2align 4, 0x90
movdqa %xmm5, %xmm6
paddd %xmm15, %xmm6
movdqa %xmm5, %xmm7
paddd %xmm2, %xmm7
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm4
paddd %xmm9, %xmm4
movdqa %xmm5, %xmm0
paddd %xmm10, %xmm0
pxor %xmm7, %xmm3
movdqa %xmm5, %xmm7
paddd %xmm11, %xmm7
movdqa %xmm5, %xmm1
paddd %xmm12, %xmm1
pxor %xmm4, %xmm7
pxor %xmm6, %xmm7
movdqa %xmm5, %xmm4
paddd %xmm13, %xmm4
paddd %xmm14, %xmm5
pxor %xmm0, %xmm1
pxor %xmm1, %xmm3
pxor %xmm7, %xmm4
addl $4, %edx
jne .LBB0_8
testl %ebx, %ebx
je .LBB0_12
negl %ebx
.p2align 4, 0x90
movdqa %xmm5, %xmm0
pxor %xmm5, %xmm3
paddd %xmm15, %xmm5
paddd %xmm2, %xmm0
pxor %xmm5, %xmm4
incl %ebx
movdqa %xmm0, %xmm5
jne .LBB0_11
pxor %xmm4, %xmm3
pshufd $78, %xmm3, %xmm0
pxor %xmm3, %xmm0
pshufd $229, %xmm0, %xmm1
pxor %xmm0, %xmm1
movd %xmm1, %edx
cmpl %eax, %esi
movl %eax, %edi
je .LBB0_14
.p2align 4, 0x90
xorl %edi, %edx
leal 1(%rdi), %eax
cmpl %eax, %esi
movl %eax, %edi
jne .LBB0_13
movl %edx, 8(%rsp)
cmpq %r9, %rcx
jne .LBB0_2
leaq 8(%rsp), %rdi
leaq 24(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq 8(%rsp), %rax
movl 16(%rsp), %ecx
movq %rax, 8(%r14)
movl %ecx, 16(%r14)
addq $40, %rsp
popq %rbx
popq %r14
.size _ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E, .Lfunc_end0-_ZN8fpinrust5tests13xor_closure_a17h4df097d1e565a700E
.section .text._ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E,@function
pushq %r14
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
subq $40, %rsp
.cfi_def_cfa_offset 64
.cfi_offset %rbx, -24
.cfi_offset %r14, -16
movq %rdi, %r14
leaq 24(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%r14), %rax
testq %rax, %rax
je .LBB1_5
xorl %ecx, %ecx
leaq 8(%rsp), %rdx
.p2align 4, 0x90
movl $1000, 8(%rsp)
movl 8(%rsp), %esi
xorl %ebx, %ebx
.p2align 4, 0x90
xorl %edi, %edi
cmpl %esi, %ebx
setl %dil
addl %ebx, %edi
cmpl %esi, %ebx
movl %edi, %ebx
jl .LBB1_3
incq %rcx
cmpq %rax, %rcx
jne .LBB1_2
leaq 8(%rsp), %rdi
leaq 24(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq 8(%rsp), %rax
movl 16(%rsp), %ecx
movq %rax, 8(%r14)
movl %ecx, 16(%r14)
addq $40, %rsp
popq %rbx
popq %r14
.size _ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E, .Lfunc_end1-_ZN8fpinrust5tests13xor_closure_b17h4bdd5e59e5c19a55E
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.long 0
.long 1
.long 2
.long 3
.long 4
.long 4
.long 4
.long 4
.long 8
.long 8
.long 8
.long 8
.long 12
.long 12
.long 12
.long 12
.long 16
.long 16
.long 16
.long 16
.long 20
.long 20
.long 20
.long 20
.long 24
.long 24
.long 24
.long 24
.long 28
.long 28
.long 28
.long 28
.long 32
.long 32
.long 32
.long 32
.section .text._ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E,@function
pushq %r14
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
subq $40, %rsp
.cfi_def_cfa_offset 64
.cfi_offset %rbx, -24
.cfi_offset %r14, -16
movq %rdi, %r14
leaq 24(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%r14), %r9
testq %r9, %r9
je .LBB2_15
xorl %ecx, %ecx
leaq 8(%rsp), %r8
movdqa .LCPI2_0(%rip), %xmm8
movdqa .LCPI2_1(%rip), %xmm15
movdqa .LCPI2_2(%rip), %xmm2
movdqa .LCPI2_3(%rip), %xmm9
movdqa .LCPI2_4(%rip), %xmm10
movdqa .LCPI2_5(%rip), %xmm11
movdqa .LCPI2_6(%rip), %xmm12
movdqa .LCPI2_7(%rip), %xmm13
movdqa .LCPI2_8(%rip), %xmm14
.p2align 4, 0x90
incq %rcx
movl $1000, 8(%rsp)
movl 8(%rsp), %esi
testl %esi, %esi
movl $0, %edx
jle .LBB2_14
xorl %edx, %edx
cmpl $8, %esi
jae .LBB2_5
xorl %edi, %edi
jmp .LBB2_13
.p2align 4, 0x90
movl %esi, %eax
andl $-8, %eax
movl $0, %edi
je .LBB2_13
leal -8(%rax), %edx
movl %edx, %edi
shrl $3, %edi
leal 1(%rdi), %ebx
andl $3, %ebx
pxor %xmm3, %xmm3
cmpl $24, %edx
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
jb .LBB2_9
leal -1(%rbx), %edx
subl %edi, %edx
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
movdqa %xmm8, %xmm5
.p2align 4, 0x90
movdqa %xmm5, %xmm6
paddd %xmm15, %xmm6
movdqa %xmm5, %xmm7
paddd %xmm2, %xmm7
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm4
paddd %xmm9, %xmm4
movdqa %xmm5, %xmm0
paddd %xmm10, %xmm0
pxor %xmm7, %xmm3
movdqa %xmm5, %xmm7
paddd %xmm11, %xmm7
movdqa %xmm5, %xmm1
paddd %xmm12, %xmm1
pxor %xmm4, %xmm7
pxor %xmm6, %xmm7
movdqa %xmm5, %xmm4
paddd %xmm13, %xmm4
paddd %xmm14, %xmm5
pxor %xmm0, %xmm1
pxor %xmm1, %xmm3
pxor %xmm7, %xmm4
addl $4, %edx
jne .LBB2_8
testl %ebx, %ebx
je .LBB2_12
negl %ebx
.p2align 4, 0x90
movdqa %xmm5, %xmm0
pxor %xmm5, %xmm3
paddd %xmm15, %xmm5
paddd %xmm2, %xmm0
pxor %xmm5, %xmm4
incl %ebx
movdqa %xmm0, %xmm5
jne .LBB2_11
pxor %xmm4, %xmm3
pshufd $78, %xmm3, %xmm0
pxor %xmm3, %xmm0
pshufd $229, %xmm0, %xmm1
pxor %xmm0, %xmm1
movd %xmm1, %edx
cmpl %eax, %esi
movl %eax, %edi
je .LBB2_14
.p2align 4, 0x90
xorl %edi, %edx
leal 1(%rdi), %eax
cmpl %eax, %esi
movl %eax, %edi
jne .LBB2_13
movl %edx, 8(%rsp)
cmpq %r9, %rcx
jne .LBB2_2
leaq 8(%rsp), %rdi
leaq 24(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq 8(%rsp), %rax
movl 16(%rsp), %ecx
movq %rax, 8(%r14)
movl %ecx, 16(%r14)
addq $40, %rsp
popq %rbx
popq %r14
.size _ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E, .Lfunc_end2-_ZN8fpinrust5tests13xor_pointer_a17hca6c00df2b597ce7E
.section .text._ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E,@function
pushq %rbx
.cfi_def_cfa_offset 16
subq $32, %rsp
.cfi_def_cfa_offset 48
.cfi_offset %rbx, -16
movq %rdi, %rbx
leaq 16(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%rbx), %rax
testq %rax, %rax
je .LBB3_3
leaq (%rsp), %rcx
.p2align 4, 0x90
movl $1000, (%rsp)
decq %rax
jne .LBB3_2
leaq (%rsp), %rdi
leaq 16(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq (%rsp), %rax
movl 8(%rsp), %ecx
movq %rax, 8(%rbx)
movl %ecx, 16(%rbx)
addq $32, %rsp
popq %rbx
.size _ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E, .Lfunc_end3-_ZN8fpinrust5tests13xor_pointer_b17h4ef66678d6655ef4E
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.long 0
.long 1
.long 2
.long 3
.long 4
.long 4
.long 4
.long 4
.long 8
.long 8
.long 8
.long 8
.long 12
.long 12
.long 12
.long 12
.long 16
.long 16
.long 16
.long 16
.long 20
.long 20
.long 20
.long 20
.long 24
.long 24
.long 24
.long 24
.long 28
.long 28
.long 28
.long 28
.long 32
.long 32
.long 32
.long 32
.long 36
.long 36
.long 36
.long 36
.long 40
.long 40
.long 40
.long 40
.section .text._ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E,"ax",@progbits
.globl _ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E
.p2align 4, 0x90
.type _ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E,@function
pushq %rbx
.cfi_def_cfa_offset 16
subq $32, %rsp
.cfi_def_cfa_offset 48
.cfi_offset %rbx, -16
movq %rdi, %rbx
leaq 16(%rsp), %rdi
callq _ZN3std4time7Instant3now17h37bccd496c61083dE@PLT
movq (%rbx), %rax
testq %rax, %rax
je .LBB4_5
xorl %ecx, %ecx
movdqa .LCPI4_1(%rip), %xmm9
movdqa .LCPI4_2(%rip), %xmm10
movdqa .LCPI4_3(%rip), %xmm11
movdqa .LCPI4_4(%rip), %xmm12
movdqa .LCPI4_5(%rip), %xmm13
movdqa .LCPI4_6(%rip), %xmm14
movdqa .LCPI4_7(%rip), %xmm15
movdqa .LCPI4_8(%rip), %xmm0
movdqa .LCPI4_9(%rip), %xmm1
movdqa .LCPI4_10(%rip), %xmm2
leaq (%rsp), %rdx
.p2align 4, 0x90
pxor %xmm3, %xmm3
movl $1000, %esi
pxor %xmm4, %xmm4
movdqa .LCPI4_0(%rip), %xmm5
.p2align 4, 0x90
movdqa %xmm5, %xmm6
paddd %xmm9, %xmm6
movdqa %xmm5, %xmm7
paddd %xmm10, %xmm7
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm4
paddd %xmm11, %xmm4
pxor %xmm7, %xmm3
movdqa %xmm5, %xmm7
paddd %xmm13, %xmm7
pxor %xmm4, %xmm7
movdqa %xmm5, %xmm4
paddd %xmm12, %xmm4
pxor %xmm6, %xmm7
movdqa %xmm5, %xmm6
paddd %xmm14, %xmm6
pxor %xmm4, %xmm6
movdqa %xmm5, %xmm8
paddd %xmm15, %xmm8
pxor %xmm6, %xmm3
movdqa %xmm5, %xmm4
paddd %xmm0, %xmm4
pxor %xmm4, %xmm3
movdqa %xmm5, %xmm4
paddd %xmm1, %xmm4
pxor %xmm8, %xmm4
pxor %xmm7, %xmm4
paddd %xmm2, %xmm5
addl $-40, %esi
jne .LBB4_3
pxor %xmm3, %xmm4
pshufd $78, %xmm4, %xmm3
pxor %xmm4, %xmm3
pshufd $229, %xmm3, %xmm4
pxor %xmm3, %xmm4
incq %rcx
movd %xmm4, (%rsp)
cmpq %rax, %rcx
jne .LBB4_2
leaq (%rsp), %rdi
leaq 16(%rsp), %rsi
callq _ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE@PLT
movq (%rsp), %rax
movl 8(%rsp), %ecx
movq %rax, 8(%rbx)
movl %ecx, 16(%rbx)
addq $32, %rsp
popq %rbx
.size _ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E, .Lfunc_end4-_ZN8fpinrust5tests13xor_black_box17h8af7e6f80fe83dc8E
.section ".note.GNU-stack","",@progbits
rustc --version
rustc 1.13.0-nightly (378195665 2016-09-08)
cargo --version
cargo 0.13.0-nightly (afaffa1 2016-09-06)
cargo rustc --release -- --test --emit asm
./target/release/deps/xor --bench
running 2 tests
test tests::xor_closure_b ... bench: 925 ns/iter (+/- 9)
test tests::xor_pointer_b ... bench: 0 ns/iter (+/- 0)
test result: ok. 0 passed; 0 failed; 0 ignored; 2 measured
export RUSTLIB=/path/to/lib/rustlib
gcc target/release/deps/xor.s $RUSTLIB/x86_64-unknown-linux-gnu/lib/*.rlib $RUSTLIB/x86_64-unknown-linux-gnu/lib/*.so -pthread -lpthread -lm -ldl
./a.out --bench
running 2 tests
test tests::xor_closure_b ... bench: 1 ns/iter (+/- 0)
test tests::xor_pointer_b ... bench: 0 ns/iter (+/- 0)
test result: ok. 0 passed; 0 failed; 0 ignored; 2 measured
答案 0 :(得分:4)
首先,让我们看一下为这两个函数生成的LLVM IR。 (我发现LLVM IR比ASM更容易阅读,因为它更结构化。)
; Function Attrs: uwtable
define internal void @_ZN3xor13xor_closure_b17hb13913a8d2a27b06E(%"11.test::Bencher"* nocapture dereferenceable(32)) unnamed_addr #0 personality i32 (i32, i32, i64, %"8.unwind::libunwind::_Unwind_Exception"*, %"8.unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
%dummy.i.i = alloca {}, align 8
%dummy.i.i.i = alloca i32, align 4
%start1.i = alloca %"1.std::time::Instant", align 8
%tmp_ret2.i = alloca %"1.std::time::Duration", align 8
%1 = bitcast %"1.std::time::Duration"* %tmp_ret2.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %1)
%2 = bitcast %"1.std::time::Instant"* %start1.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %2)
call void @_ZN3std4time7Instant3now17h37bccd496c61083dE(%"1.std::time::Instant"* noalias nocapture nonnull sret dereferenceable(16) %start1.i)
%3 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 0
%4 = load i64, i64* %3, align 8
%5 = icmp eq i64 %4, 0
br i1 %5, label %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit, label %bb7.lr.ph.i
bb7.lr.ph.i: ; preds = %entry-block
%6 = bitcast i32* %dummy.i.i.i to i8*
%7 = bitcast {}* %dummy.i.i to i8*
br label %bb7.i
bb7.i: ; preds = %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i", %bb7.lr.ph.i
%iter.sroa.0.019.i = phi i64 [ 0, %bb7.lr.ph.i ], [ %11, %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i" ]
call void @llvm.lifetime.start(i64 4, i8* %6) #2
store i32 1000, i32* %dummy.i.i.i, align 4
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"(i32* nonnull %dummy.i.i.i) #2, !srcloc !1
%8 = load i32, i32* %dummy.i.i.i, align 4
call void @llvm.lifetime.end(i64 4, i8* %6) #2
br label %bb7.i.i.i
bb7.i.i.i: ; preds = %bb7.i.i.i, %bb7.i
%iter.sroa.0.0.i.i.i = phi i32 [ 0, %bb7.i ], [ %iter.sroa.0.1.i.i.i, %bb7.i.i.i ]
%9 = icmp slt i32 %iter.sroa.0.0.i.i.i, %8
%10 = zext i1 %9 to i32
%iter.sroa.0.1.i.i.i = add i32 %10, %iter.sroa.0.0.i.i.i
br i1 %9, label %bb7.i.i.i, label %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i"
"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i": ; preds = %bb7.i.i.i
%11 = add nuw i64 %iter.sroa.0.019.i, 1
call void @llvm.lifetime.start(i64 0, i8* %7)
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"({}* nonnull %dummy.i.i) #2, !srcloc !1
call void @llvm.lifetime.end(i64 0, i8* %7)
%exitcond.i = icmp eq i64 %11, %4
br i1 %exitcond.i, label %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit.loopexit, label %bb7.i
_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit.loopexit: ; preds = %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i"
br label %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit
_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit: ; preds = %_ZN4test7Bencher4iter17h0cab611e22e5c5faE.exit.loopexit, %entry-block
call void @_ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE(%"1.std::time::Duration"* noalias nocapture nonnull sret dereferenceable(16) %tmp_ret2.i, %"1.std::time::Instant"* noalias nonnull readonly dereferenceable(16) %start1.i)
%12 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 0
%13 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 1
%14 = load i64, i64* %12, align 8
%15 = load i32, i32* %13, align 8
%16 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 0
store i64 %14, i64* %16, align 8
%17 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 1
store i32 %15, i32* %17, align 4
call void @llvm.lifetime.end(i64 16, i8* %2)
call void @llvm.lifetime.end(i64 16, i8* %1)
ret void
; Function Attrs: uwtable
define internal void @_ZN3xor13xor_pointer_b17h7ba0f9760d9fd9f8E(%"11.test::Bencher"* nocapture dereferenceable(32)) unnamed_addr #0 personality i32 (i32, i32, i64, %"8.unwind::libunwind::_Unwind_Exception"*, %"8.unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
%dummy.i.i = alloca {}, align 8
%dummy.i.i.i = alloca i32, align 4
%start1.i = alloca %"1.std::time::Instant", align 8
%tmp_ret2.i = alloca %"1.std::time::Duration", align 8
%1 = bitcast %"1.std::time::Duration"* %tmp_ret2.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %1)
%2 = bitcast %"1.std::time::Instant"* %start1.i to i8*
call void @llvm.lifetime.start(i64 16, i8* %2)
call void @_ZN3std4time7Instant3now17h37bccd496c61083dE(%"1.std::time::Instant"* noalias nocapture nonnull sret dereferenceable(16) %start1.i)
%3 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 0
%4 = load i64, i64* %3, align 8
%5 = icmp eq i64 %4, 0
br i1 %5, label %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit, label %bb7.lr.ph.i
bb7.lr.ph.i: ; preds = %entry-block
%6 = bitcast i32* %dummy.i.i.i to i8*
%7 = bitcast {}* %dummy.i.i to i8*
br label %bb7.i
bb7.i: ; preds = %bb7.i, %bb7.lr.ph.i
%iter.sroa.0.019.i = phi i64 [ 0, %bb7.lr.ph.i ], [ %8, %bb7.i ]
%8 = add nuw i64 %iter.sroa.0.019.i, 1
call void @llvm.lifetime.start(i64 4, i8* %6) #2
store i32 1000, i32* %dummy.i.i.i, align 4
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"(i32* nonnull %dummy.i.i.i) #2, !srcloc !1
call void @llvm.lifetime.end(i64 4, i8* %6) #2
call void @llvm.lifetime.start(i64 0, i8* %7)
call void asm "", "r,~{dirflag},~{fpsr},~{flags}"({}* nonnull %dummy.i.i) #2, !srcloc !1
call void @llvm.lifetime.end(i64 0, i8* %7)
%exitcond.i = icmp eq i64 %8, %4
br i1 %exitcond.i, label %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit.loopexit, label %bb7.i
_ZN4test7Bencher4iter17hae343b1316e5897bE.exit.loopexit: ; preds = %bb7.i
br label %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit
_ZN4test7Bencher4iter17hae343b1316e5897bE.exit: ; preds = %_ZN4test7Bencher4iter17hae343b1316e5897bE.exit.loopexit, %entry-block
call void @_ZN3std4time7Instant7elapsed17h0b6076720ddfcc2bE(%"1.std::time::Duration"* noalias nocapture nonnull sret dereferenceable(16) %tmp_ret2.i, %"1.std::time::Instant"* noalias nonnull readonly dereferenceable(16) %start1.i)
%9 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 0
%10 = getelementptr inbounds %"1.std::time::Duration", %"1.std::time::Duration"* %tmp_ret2.i, i64 0, i32 1
%11 = load i64, i64* %9, align 8
%12 = load i32, i32* %10, align 8
%13 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 0
store i64 %11, i64* %13, align 8
%14 = getelementptr inbounds %"11.test::Bencher", %"11.test::Bencher"* %0, i64 0, i32 1, i32 1
store i32 %12, i32* %14, align 4
call void @llvm.lifetime.end(i64 16, i8* %2)
call void @llvm.lifetime.end(i64 16, i8* %1)
ret void
对比这两个函数的 LLVM IR,它们看起来非常相似。但是,有一个区别:xor_closure_b 中多了 bb7.i.i.i 这个基本块:
bb7.i.i.i: ; preds = %bb7.i.i.i, %bb7.i
%iter.sroa.0.0.i.i.i = phi i32 [ 0, %bb7.i ], [ %iter.sroa.0.1.i.i.i, %bb7.i.i.i ]
%9 = icmp slt i32 %iter.sroa.0.0.i.i.i, %8
%10 = zext i1 %9 to i32
%iter.sroa.0.1.i.i.i = add i32 %10, %iter.sroa.0.0.i.i.i
br i1 %9, label %bb7.i.i.i, label %"_ZN3xor13xor_closure_b28_$u7b$$u7b$closure$u7d$$u7d$17hbaf8d82981c57ba0E.exit.i"
xorl %edi, %edi
cmpl %esi, %ebx
setl %dil
addl %ebx, %edi
cmpl %esi, %ebx
movl %edi, %ebx
jl .LBB1_3
cmpl %esi, %ebx
jge .LBB1_3a
incl %ebx
cmpl %esi, %ebx
jl .LBB1_3
改用上面修改后的循环之后,xor_closure_b 的基准在我的计算机上从 781 ns/iter (+/- 19)
下降到 270 ns/iter (+/- 7)。
带 _a 后缀的版本之所以更快,是因为生成的代码被矢量化了,这使循环只需执行更少的迭代(即循环展开),减少的因子为 32(例如 .LBB0_8 处的循环)。
我用以下命令链接生成的汇编文件,然后运行它:
$ gcc target/release/xor-71758a2519026d86.s ~/.multirust/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib/lib{test,term,getopts,rustc_unicode,std,libc,rand,collections,alloc_system,alloc,core,panic_unwind}-411f48d3.rlib -pthread -lpthread -lm -ldl
另外,我的 CPU 是 Intel Core i7-4770K。