受《从数学到通用编程》(From Mathematics to Generic Programming)一书的启发,我正在尝试用不同的整数宽度来实现同一个函数。
我有两种不同的素数筛实现,可以与 `u16` 或 `u32` 一起使用。我使用 `cargo bench` 对它们进行基准测试,`u16` 版本总是比 `u32` 版本快一点。
为什么会这样?我的假设是,我的处理器(i5-7300U)能够同时执行两次 `u16` 加法,但对 `u32` 或 `u64` 做不到。然而,我不知道如何验证这一点。我已在下文附上生成的汇编代码。
基准测试结果
test tests :: bench_sift2 ... bench:74,093 ns / iter(+/- 3,765)
test tests :: bench_sift2_u16 ... bench:61,136 ns / iter(+/- 3,389)
修改
在尝试了不同的数组大小、以及用布尔数组代替向量之后,两个函数的速度大致相同。实际上,只有当两个向量的大小都是 `1 << 15` 时,性能差异才显著。
编辑2
一些有趣的观察:我是在一台运行 Windows 10 Pro 10.0.1 的 Windows 平板电脑上运行这段代码的。有点偶然地,我用不同的省电配置运行了基准测试。当我把配置设为最高性能时,看到的结果与上文报告的大致相同。如果把配置设为任何其他级别,两个函数的表现看起来相同,但测量误差会急剧增大。
Rust Code
#![feature(iterator_step_by)]
#![feature(test)]
extern crate test;
/// Runs the `usize` sieve once for `1 << 15` entries and prints the first entry.
///
/// Swap in the commented-out call to exercise the `u16` variant instead.
fn main() {
    let sieve = sift2(1 << 15);
    // let sieve = sift2_u16(1 << 15);
    println!("{}", sieve[0]);
}
/// Sieves the odd numbers `3, 5, 7, ...` and returns one flag per number.
///
/// Entry `k` of the result stands for the odd number `2 * k + 3`; it is left
/// `true` when that number was never marked as a multiple of a smaller odd
/// number, and set to `false` otherwise. The returned vector has `n` entries.
fn sift2(n: usize) -> Vec<bool> {
    let mut sieve = vec![true; n];
    // Index of the square of the current odd number (9 lives at index 3).
    let mut square_at: usize = 3;
    // The current odd number itself; also the index distance between its
    // successive odd multiples.
    let mut stride: usize = 3;

    for slot in 0usize.. {
        // Once the square is outside the table there is nothing left to mark.
        if square_at >= n {
            break;
        }
        if sieve[slot] {
            mark_sieve(&mut sieve[square_at..], stride);
        }
        // Move from the square of `stride` to the square of `stride + 2`:
        // the index grows by the old stride plus the new one.
        square_at += stride;
        stride += 2;
        square_at += stride;
    }

    sieve
}
/// `u16` variant of the sieve over the odd numbers `3, 5, 7, ...`.
///
/// Entry `k` of the result stands for the odd number `2 * k + 3`; it is left
/// `true` when that number was never marked as a multiple of a smaller odd
/// number, and set to `false` otherwise. The returned vector has `n` entries.
///
/// All loop counters are kept in `u16`. The index of the next square can
/// exceed `u16::MAX` when `n` is close to the top of the range, so that
/// addition is checked and the loop stops instead of overflowing.
fn sift2_u16(n: u16) -> Vec<bool> {
    let mut vec = vec![true; n as usize];
    let mut i: u16 = 0;
    // Index of the square of the current odd number (9 lives at index 3).
    let mut index_square: u16 = 3;
    // The current odd number; also the index distance between its odd multiples.
    let mut factor: u16 = 3;
    while index_square < n {
        if vec[i as usize] {
            mark_sieve(&mut vec[index_square as usize..], factor as usize);
        }
        i += 1;
        factor += 2;
        // The square index advances by the old factor plus the new one, i.e.
        // `2 * factor - 2`. If that no longer fits in `u16` the next square is
        // necessarily past the end of the table, so sieving is complete.
        index_square = match index_square.checked_add(2 * factor - 2) {
            Some(next) => next,
            None => break,
        };
    }
    vec
}
/// Clears every `factor`-th entry of `data`, starting with the first one.
///
/// # Panics
///
/// Panics if `factor` is zero, because `step_by` rejects a zero step.
fn mark_sieve(data: &mut [bool], factor: usize) {
    for cell in data.iter_mut().step_by(factor) {
        *cell = false;
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use test::{black_box, Bencher};

    /// Benchmarks the `usize` sieve on `1 << 15` entries.
    ///
    /// The size goes through `black_box` so the optimizer cannot specialize
    /// the sieve for a compile-time constant length.
    #[bench]
    fn bench_sift2(b: &mut Bencher) {
        b.iter(|| sift2(black_box(1 << 15)));
    }

    /// Benchmarks the `u16` sieve on `1 << 15` entries, with the size hidden
    /// from the optimizer in the same way as in `bench_sift2`.
    #[bench]
    fn bench_sift2_u16(b: &mut Bencher) {
        b.iter(|| sift2_u16(black_box(1 << 15)));
    }
}
为 sift2 生成的汇编代码
.text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc
.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc
.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc
.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq
.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq
.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc
.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %rdi
testq %rdi, %rdi
je .LBB6_21
movl $32768, %r14d
movl $1, %edx
movl $32768, %r8d
movq %rdi, %rcx
callq memset
movq %rdi, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
xorl %edx, %edx
movl $3, %eax
movl $3, %ecx
cmpb $0, (%rdi,%rdx)
jne .LBB6_3
jmp .LBB6_10
.p2align 4, 0x90
.LBB6_12:
addq $2, %rax
movq -56(%rbp), %rdi
cmpb $0, (%rdi,%rdx)
je .LBB6_10
.LBB6_3:
cmpq %rcx, %r14
jb .LBB6_4
cmpq %rcx, %r14
je .LBB6_10
addq %rdi, %r14
leaq (%rdi,%rcx), %rdi
leaq -1(%rax), %rsi
addq $1, %rdi
.p2align 4, 0x90
.LBB6_9:
movb $0, -1(%rdi)
movq %r14, %rbx
subq %rdi, %rbx
addq %rax, %rdi
cmpq %rsi, %rbx
ja .LBB6_9
.LBB6_10:
addq %rax, %rcx
addq %rax, %rcx
addq $2, %rcx
cmpq $32767, %rcx
ja .LBB6_14
addq $1, %rdx
movq -40(%rbp), %r14
cmpq %rdx, %r14
ja .LBB6_12
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %r14, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.LBB6_14:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_15
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp2:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp3:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_19
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_19:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp0:
movq %r14, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp1:
.LBB6_6:
ud2
.LBB6_21:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_15:
.Ltmp4:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp5:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_13:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_20:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp6@IMGREL+1
.long 0
.long .Ltmp2@IMGREL+1
.long 1
.long .Ltmp0@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp5@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc
.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"
.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10
.section .rdata,"dr",one_only,str.k
str.k:
.section .rdata,"dr",one_only,str.l
str.l:
.byte 10
.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1
.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7
为 sift2_u16 生成的汇编代码
U16
.text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc
.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc
.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc
.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq
.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq
.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc
.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %r14
testq %r14, %r14
je .LBB6_23
movl $32768, %edi
movl $1, %edx
movl $32768, %r8d
movq %r14, %rcx
callq memset
movq %r14, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
movw $3, %r8w
xorl %edx, %edx
movw $3, %r9w
cmpb $0, (%r14,%rdx)
jne .LBB6_3
jmp .LBB6_12
.p2align 4, 0x90
.LBB6_14:
movq -56(%rbp), %r14
cmpb $0, (%r14,%rdx)
je .LBB6_12
.LBB6_3:
movzwl %r9w, %ecx
cmpq %rcx, %rdi
jb .LBB6_4
testw %r8w, %r8w
je .LBB6_8
cmpq %rcx, %rdi
je .LBB6_12
addq %r14, %rcx
movzwl %r8w, %ebx
addq %r14, %rdi
leaq -1(%rbx), %rax
addq $1, %rcx
.p2align 4, 0x90
.LBB6_11:
movb $0, -1(%rcx)
movq %rdi, %rsi
subq %rcx, %rsi
addq %rbx, %rcx
cmpq %rax, %rsi
ja .LBB6_11
.LBB6_12:
addl %r8d, %r9d
addl $2, %r8d
addw %r8w, %r9w
js .LBB6_16
addq $1, %rdx
movq -40(%rbp), %rdi
cmpq %rdx, %rdi
ja .LBB6_14
.Ltmp8:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %rdi, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp9:
jmp .LBB6_6
.LBB6_16:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_17
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp4:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp5:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_21
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_21:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp2:
movq %rdi, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp3:
jmp .LBB6_6
.LBB6_8:
.Ltmp0:
leaq ref.b(%rip), %rcx
callq _ZN4core9panicking5panic17h42feaa2e0dc2c607E
.Ltmp1:
.LBB6_6:
ud2
.LBB6_23:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_17:
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_15:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_22:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp8@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp2@IMGREL+1
.long 0
.long .Ltmp6@IMGREL+1
.long 1
.long .Ltmp7@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc
.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.section .rdata,"dr",one_only,str.9
.p2align 4
str.9:
.ascii "assertion failed: step != 0"
.section .rdata,"dr",one_only,str.a
.p2align 4
str.a:
.ascii "libcore\\iter\\iterator.rs"
.section .rdata,"dr",one_only,ref.b
.p2align 3
ref.b:
.quad str.9
.quad 27
.quad str.a
.quad 24
.long 299
.long 9
.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"
.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10
.section .rdata,"dr",one_only,str.k
str.k:
.section .rdata,"dr",one_only,str.l
str.l:
.byte 10
.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1
.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7
答案 0 :(得分:3)
我从未尝试过Rust,但我知道一些很好的工具可以进行这样的性能分析。因此,虽然它可能无法完全回答您的问题,但您将获得更多信息。
在尝试理解底层性能时,必须查看生成的汇编代码;从您提供的输出来看,您似乎已经这样做了。但是,这些输出非常难以读懂。这正是我偏爱的工具派上用场的地方:Compiler Explorer。您可以看到your code here
从生成的汇编代码中,我们看到了一些差异。让我们只关注循环中的两个部分(你可以检查其他部分,但思路和结论是一样的)
32 位版本的 `while` 条件判断:
mov rax, qword ptr [rbp - 112]
cmp qword ptr [rbp - 64], rax
jb .LBB124_5
16位相同:
mov ax, word ptr [rbp - 98]
cmp word ptr [rbp - 52], ax
jb .LBB125_5
32中的mark_sieve :
.LBB124_8:
mov rax, qword ptr [rbp - 64]
mov qword ptr [rbp - 48], rax
mov rsi, qword ptr [rbp - 48]
lea rdi, [rbp - 96]
call <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
mov qword ptr [rbp - 136], rdx
mov qword ptr [rbp - 144], rax
jmp .LBB124_9
.LBB124_9:
mov rdx, qword ptr [rbp - 56]
mov rdi, qword ptr [rbp - 144]
mov rsi, qword ptr [rbp - 136]
call example::mark_sieve
jmp .LBB124_10
并且在16:
.LBB125_8:
movzx eax, word ptr [rbp - 52]
mov ecx, eax
mov qword ptr [rbp - 48], rcx
mov rsi, qword ptr [rbp - 48]
lea rdi, [rbp - 80]
call <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
mov qword ptr [rbp - 120], rdx
mov qword ptr [rbp - 128], rax
jmp .LBB125_9
.LBB125_9:
movzx eax, word ptr [rbp - 50]
mov edx, eax
mov rdi, qword ptr [rbp - 128]
mov rsi, qword ptr [rbp - 120]
call example::mark_sieve
jmp .LBB125_10
我们可以在此代码中看到一些差异:
对于这些不同的说明,您可以看到它们的相对执行时间差异,看着优秀的Agner Instruction Tables。 (我很乐意看到它们集成在编译器资源管理器btw中)。 你的CPU似乎是一个KabyLake(所以我们将使用Skylake架构)所以我们将从第231页开始获取表格(阅读此页面以获取表格中使用的缩写)。
从Agner表中, mov r64,m 和 movzx r,m 之间的uops数相同(1 uop),但mov加2延迟周期。
不同的累加器也会改变处理器的一些优化。
编译器还执行其他体系结构相关的优化,例如循环展开,具体取决于CPU上可用的ALU数量。因此,根据编译器的决定,您的代码在不同的CPU之间可能会有不同的行为。
差异也可能是由于代码对齐或缓存优化造成的。
关于电源管理带来的差异,这可能受两个因素影响:频率上限和 C 状态(C-state)管理。C 状态是指 CPU 在短时间内进入的各种睡眠状态。睡眠/唤醒对 CPU 内部的影响取决于固件。因此,我们无法真正详细地检查(性能分析本身也会改变结果)。
我做了a post about understanding Meltdown/Spectre attack,它解释了 CPU 在底层可能进行的各种优化(这些优化甚至在汇编代码中都无法体现)。您可能也想看看它,以便更好地理解为什么 CPU 优化很难分析,因为有很多参数是我们无法控制的。
祝编码愉快(Happy hacking)!