为什么具有不同整数参数大小的相同函数更快?

时间:2018-03-03 13:26:20

标签: performance rust

受《From Mathematics to Generic Programming》(从数学到泛型编程)一书的启发,我正在尝试对同一个函数使用不同的整数大小。

我有两种不同的素数筛选实现,分别可以与 u16 和 u32 一起使用。我使用 cargo bench 对它们进行基准测试,u16 版本总是比 u32 版本快一点。

为什么会这样?我的假设是我的处理器(i5-7300u)能够同时为 u16 执行两次加法,但对 u32 或 u64 不行。然而,我不知道如何验证这一点。我已经附上了汇编代码。

基准测试结果

  

test tests::bench_sift2     ... bench:      74,093 ns/iter (+/- 3,765)

test tests::bench_sift2_u16 ... bench:      61,136 ns/iter (+/- 3,389)

修改

尝试使用不同的数组大小、并用布尔数组代替向量之后,两个函数的速度大致相同。实际上,只有当两个向量的大小都是 1 << 15 时,性能差异才显著。

编辑2

一些有趣的观察:我在一台运行 Windows 10 Pro 10.0.1 的 Windows 平板电脑上运行此代码。多少有些偶然地,我用不同的省电配置运行了基准测试。当我将配置设置为最高性能时,我看到的结果与上面报告的大致相同。如果我将配置设置为任何其他级别,两个函数的表现看起来相同,但测量误差会急剧增加。

Rust Code

#![feature(iterator_step_by)]
#![feature(test)]

extern crate test;

/// Entry point: runs the sieve for `1 << 15` entries and prints its first flag.
fn main() {
    // To inspect the `u16` variant instead, call `sift2_u16(1 << 15)` here.
    let sieve = sift2(1 << 15);
    println!("{}", sieve[0]);
}


/// Sieve over odd numbers using `usize` arithmetic.
///
/// Returns `n` flags where index `k` stands for the odd number `2k + 3`;
/// a flag stays `true` when no smaller odd factor struck it out.
///
/// The loop tracks, for the candidate at index `candidate`:
/// * `step`  — the candidate's value (`2 * candidate + 3`), and
/// * `first_multiple` — the index of the candidate's square, which is where
///   striking out has to begin.
fn sift2(n: usize) -> Vec<bool> {
    let mut sieve = vec![true; n];

    let mut candidate = 0;
    let mut first_multiple = 3;
    let mut step = 3;

    loop {
        // Once the square falls outside the table nothing is left to strike.
        if first_multiple >= n {
            break;
        }

        if sieve[candidate] {
            mark_sieve(&mut sieve[first_multiple..], step);
        }
        candidate += 1;

        // Advance to the next odd value and to the index of its square:
        // the index grows by the old step plus the new step.
        first_multiple += step;
        step += 2;
        first_multiple += step;
    }

    sieve
}

/// `u16` flavour of [`sift2`]: sieve over odd numbers with 16-bit bookkeeping.
///
/// Returns `n` flags where index `k` stands for the odd number `2k + 3`;
/// a flag stays `true` when no smaller odd factor struck it out.
///
/// Unlike a plain `+=` on `u16`, advancing to the next square index is done
/// with checked arithmetic. For `n > 65159` the next square index no longer
/// fits in a `u16`; an unchecked add would panic in debug builds and wrap
/// around in release builds (striking out wrong entries). An overflow means
/// the true index is beyond `u16::MAX >= n`, so the sieve is simply finished.
fn sift2_u16(n: u16) -> Vec<bool> {
    let mut vec = vec![true; usize::from(n)];

    let mut i: u16 = 0;
    let mut index_square: u16 = 3;
    let mut factor: u16 = 3;

    while index_square < n {
        if vec[usize::from(i)] {
            mark_sieve(&mut vec[usize::from(index_square)..], usize::from(factor));
        }
        i += 1;

        // `factor * factor == 2 * index_square + 3`, so `factor` stays far
        // below `u16::MAX` and this addition cannot overflow.
        let next_factor = factor + 2;

        // New square index = old index + old factor + new factor.
        let next_index = index_square
            .checked_add(factor)
            .and_then(|partial| partial.checked_add(next_factor));

        match next_index {
            Some(next) => {
                index_square = next;
                factor = next_factor;
            }
            // The next square lies past `u16::MAX`, hence past `n`: done.
            None => break,
        }
    }

    vec
}

/// Clears every `factor`-th flag of `data`, starting with `data[0]`.
///
/// # Panics
///
/// Panics if `factor` is zero, because `step_by` rejects a zero step.
fn mark_sieve(data: &mut [bool], factor: usize) {
    data.iter_mut().step_by(factor).for_each(|k| *k = false);
}

#[cfg(test)]
mod tests {

    use super::*;
    use test::{black_box, Bencher};

    // The sieve size is passed through `black_box` so the optimiser cannot
    // treat it as a compile-time constant and specialise each function for
    // this one value, which may skew the comparison between the two widths.

    /// Benchmarks the `usize` sieve with `1 << 15` entries.
    #[bench]
    fn bench_sift2(b: &mut Bencher) {
        b.iter(|| sift2(black_box(1 << 15)));
    }

    /// Benchmarks the `u16` sieve with `1 << 15` entries.
    #[bench]
    fn bench_sift2_u16(b: &mut Bencher) {
        b.iter(|| sift2_u16(black_box(1 << 15)));
    }
}

为 sift2 生成的汇编代码

    .text
    .def     _ZN3std2rt10lang_start17h0092a1d276f89f87E;
    .scl    2;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
    .globl  _ZN3std2rt10lang_start17h0092a1d276f89f87E
    .p2align    4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    movq    %r8, %r9
    movq    %rdx, %rax
    movq    %rcx, 32(%rsp)
    leaq    vtable.4(%rip), %rdx
    leaq    32(%rsp), %rcx
    movq    %rax, %r8
    callq   _ZN3std2rt19lang_start_internal17h273003faf754a099E
    nop
    addq    $40, %rsp
    retq
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
    .seh_endproc

    .def     _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .p2align    4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    callq   *(%rcx)
    nop
    addq    $40, %rsp
    jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .seh_endproc

    .def     _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
    .p2align    4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    callq   *%rcx
    nop
    addq    $40, %rsp
    jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
    .seh_endproc

    .def     _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
    .p2align    4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
    movq    8(%rcx), %rdx
    testq   %rdx, %rdx
    je  .LBB3_1
    movq    (%rcx), %rcx
    movl    $1, %r8d
    jmp __rust_dealloc
.LBB3_1:
    retq

    .def     _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
    .p2align    4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
    retq

    .def     _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    .p2align    4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    callq   __rust_oom
    ud2
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    .seh_endproc

    .def     _ZN8chapter34main17hfb06448c1bac2398E;
    .scl    3;
    .type   32;
    .endef
    .globl  __xmm@00000000000080000000000000008000
    .section    .rdata,"dr",discard,__xmm@00000000000080000000000000008000
    .p2align    4
__xmm@00000000000080000000000000008000:
    .quad   32768
    .quad   32768
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .p2align    4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
    .seh_handler __CxxFrameHandler3, @unwind, @except
    pushq   %rbp
    .seh_pushreg 5
    pushq   %r14
    .seh_pushreg 14
    pushq   %rsi
    .seh_pushreg 6
    pushq   %rdi
    .seh_pushreg 7
    pushq   %rbx
    .seh_pushreg 3
    subq    $128, %rsp
    .seh_stackalloc 128
    leaq    128(%rsp), %rbp
    .seh_setframe 5, 128
    .seh_endprologue
    movq    $-2, -8(%rbp)
    leaq    -56(%rbp), %r8
    movl    $32768, %ecx
    movl    $1, %edx
    callq   __rust_alloc
    movq    %rax, %rdi
    testq   %rdi, %rdi
    je  .LBB6_21
    movl    $32768, %r14d
    movl    $1, %edx
    movl    $32768, %r8d
    movq    %rdi, %rcx
    callq   memset
    movq    %rdi, -56(%rbp)
    movaps  __xmm@00000000000080000000000000008000(%rip), %xmm0
    movups  %xmm0, -48(%rbp)
    xorl    %edx, %edx
    movl    $3, %eax
    movl    $3, %ecx
    cmpb    $0, (%rdi,%rdx)
    jne .LBB6_3
    jmp .LBB6_10
    .p2align    4, 0x90
.LBB6_12:
    addq    $2, %rax
    movq    -56(%rbp), %rdi
    cmpb    $0, (%rdi,%rdx)
    je  .LBB6_10
.LBB6_3:
    cmpq    %rcx, %r14
    jb  .LBB6_4
    cmpq    %rcx, %r14
    je  .LBB6_10
    addq    %rdi, %r14
    leaq    (%rdi,%rcx), %rdi
    leaq    -1(%rax), %rsi
    addq    $1, %rdi
    .p2align    4, 0x90
.LBB6_9:
    movb    $0, -1(%rdi)
    movq    %r14, %rbx
    subq    %rdi, %rbx
    addq    %rax, %rdi
    cmpq    %rsi, %rbx
    ja  .LBB6_9
.LBB6_10:
    addq    %rax, %rcx
    addq    %rax, %rcx
    addq    $2, %rcx
    cmpq    $32767, %rcx
    ja  .LBB6_14
    addq    $1, %rdx
    movq    -40(%rbp), %r14
    cmpq    %rdx, %r14
    ja  .LBB6_12
.Ltmp6:
    leaq    panic_bounds_check_loc.j(%rip), %rcx
    movq    %r14, %r8
    callq   _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
    jmp .LBB6_6
.LBB6_14:
    movq    -40(%rbp), %rax
    movq    %rax, -64(%rbp)
    movups  -56(%rbp), %xmm0
    movaps  %xmm0, -80(%rbp)
    cmpq    $0, -64(%rbp)
    je  .LBB6_15
    movq    -80(%rbp), %rsi
    movq    %rsi, -96(%rbp)
    leaq    _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
    movq    %rax, -88(%rbp)
    leaq    ref.m(%rip), %rax
    movq    %rax, -56(%rbp)
    movq    $2, -48(%rbp)
    leaq    ref.n(%rip), %rax
    movq    %rax, -40(%rbp)
    movq    $1, -32(%rbp)
    leaq    -96(%rbp), %rax
    movq    %rax, -24(%rbp)
    movq    $1, -16(%rbp)
.Ltmp2:
    leaq    -56(%rbp), %rcx
    callq   _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp3:
    movq    -72(%rbp), %rdx
    testq   %rdx, %rdx
    je  .LBB6_19
    movl    $1, %r8d
    movq    %rsi, %rcx
    callq   __rust_dealloc
.LBB6_19:
    nop
    addq    $128, %rsp
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    popq    %rbp
    retq
.LBB6_4:
.Ltmp0:
    movq    %r14, %rdx
    callq   _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp1:
.LBB6_6:
    ud2
.LBB6_21:
    movups  -48(%rbp), %xmm0
    movaps  %xmm0, -80(%rbp)
    movaps  -80(%rbp), %xmm0
    movups  %xmm0, -48(%rbp)
    leaq    -56(%rbp), %rcx
    callq   _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    ud2
.LBB6_15:
.Ltmp4:
    leaq    panic_bounds_check_loc.j(%rip), %rcx
    xorl    %edx, %edx
    xorl    %r8d, %r8d
    callq   _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp5:
    jmp .LBB6_6
    .seh_handlerdata
    .long   ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .seh_endproc
    .def     "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
    .scl    3;
    .type   32;
    .endef
    .p2align    4, 0x90
"?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_13:
    movq    %rdx, 16(%rsp)
    pushq   %rbp
    .seh_pushreg 5
    pushq   %r14
    .seh_pushreg 14
    pushq   %rsi
    .seh_pushreg 6
    pushq   %rdi
    .seh_pushreg 7
    pushq   %rbx
    .seh_pushreg 3
    subq    $32, %rsp
    .seh_stackalloc 32
    leaq    128(%rdx), %rbp
    .seh_endprologue
    leaq    -56(%rbp), %rcx
    callq   _ZN4core3ptr13drop_in_place17h98ac405189abf599E
    nop
    addq    $32, %rsp
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    popq    %rbp
    retq
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .seh_endproc
    .def     "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
    .scl    3;
    .type   32;
    .endef
    .p2align    4, 0x90
"?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_20:
    movq    %rdx, 16(%rsp)
    pushq   %rbp
    .seh_pushreg 5
    pushq   %r14
    .seh_pushreg 14
    pushq   %rsi
    .seh_pushreg 6
    pushq   %rdi
    .seh_pushreg 7
    pushq   %rbx
    .seh_pushreg 3
    subq    $32, %rsp
    .seh_stackalloc 32
    leaq    128(%rdx), %rbp
    .seh_endprologue
    leaq    -80(%rbp), %rcx
    callq   _ZN4core3ptr13drop_in_place17h98ac405189abf599E
    nop
    addq    $32, %rsp
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    popq    %rbp
    retq
.Lfunc_end0:
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .seh_endproc
    .section    .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
    .p2align    2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
    .long   429065506
    .long   2
    .long   ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
    .long   0
    .long   0
    .long   6
    .long   ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
    .long   120
    .long   0
    .long   1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
    .long   -1
    .long   "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
    .long   -1
    .long   "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
    .long   .Lfunc_begin0@IMGREL
    .long   -1
    .long   .Ltmp6@IMGREL+1
    .long   0
    .long   .Ltmp2@IMGREL+1
    .long   1
    .long   .Ltmp0@IMGREL+1
    .long   0
    .long   .Ltmp4@IMGREL+1
    .long   1
    .long   .Ltmp5@IMGREL+1
    .long   -1
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E

    .def     main;
    .scl    2;
    .type   32;
    .endef
    .section    .text,"xr",one_only,main
    .globl  main
    .p2align    4, 0x90
main:
.seh_proc main
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    movq    %rdx, %rax
    movslq  %ecx, %r8
    leaq    _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
    movq    %rcx, 32(%rsp)
    leaq    vtable.4(%rip), %rdx
    leaq    32(%rsp), %rcx
    movq    %rax, %r9
    callq   _ZN3std2rt19lang_start_internal17h273003faf754a099E
    nop
    addq    $40, %rsp
    retq
    .seh_handlerdata
    .section    .text,"xr",one_only,main
    .seh_endproc

    .section    .rdata,"dr",one_only,vtable.4
    .p2align    3
vtable.4:
    .quad   _ZN4core3ptr13drop_in_place17hd909dec568d984beE
    .quad   8
    .quad   8
    .quad   _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .quad   _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .quad   _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE

    .section    .rdata,"dr",one_only,str.i
    .p2align    4
str.i:
    .ascii  "C:\\projects\\rust\\src\\liballoc\\vec.rs"

    .section    .rdata,"dr",one_only,panic_bounds_check_loc.j
    .p2align    3
panic_bounds_check_loc.j:
    .quad   str.i
    .quad   36
    .long   1551
    .long   10

    .section    .rdata,"dr",one_only,str.k
str.k:

    .section    .rdata,"dr",one_only,str.l
str.l:
    .byte   10

    .section    .rdata,"dr",one_only,ref.m
    .p2align    3
ref.m:
    .quad   str.k
    .quad   0
    .quad   str.l
    .quad   1

    .section    .rdata,"dr",one_only,ref.n
    .p2align    3
ref.n:
    .quad   1
    .quad   0
    .quad   3
    .zero   8
    .quad   3
    .zero   8
    .long   32
    .long   0
    .byte   3
    .zero   7

为 sift2_u16 生成的汇编代码

U16
    .text
    .def     _ZN3std2rt10lang_start17h0092a1d276f89f87E;
    .scl    2;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
    .globl  _ZN3std2rt10lang_start17h0092a1d276f89f87E
    .p2align    4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    movq    %r8, %r9
    movq    %rdx, %rax
    movq    %rcx, 32(%rsp)
    leaq    vtable.4(%rip), %rdx
    leaq    32(%rsp), %rcx
    movq    %rax, %r8
    callq   _ZN3std2rt19lang_start_internal17h273003faf754a099E
    nop
    addq    $40, %rsp
    retq
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
    .seh_endproc

    .def     _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .p2align    4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    callq   *(%rcx)
    nop
    addq    $40, %rsp
    jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .seh_endproc

    .def     _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
    .p2align    4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    callq   *%rcx
    nop
    addq    $40, %rsp
    jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
    .seh_endproc

    .def     _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
    .p2align    4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
    movq    8(%rcx), %rdx
    testq   %rdx, %rdx
    je  .LBB3_1
    movq    (%rcx), %rcx
    movl    $1, %r8d
    jmp __rust_dealloc
.LBB3_1:
    retq

    .def     _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
    .p2align    4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
    retq

    .def     _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
    .scl    3;
    .type   32;
    .endef
    .section    .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    .p2align    4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    callq   __rust_oom
    ud2
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    .seh_endproc

    .def     _ZN8chapter34main17hfb06448c1bac2398E;
    .scl    3;
    .type   32;
    .endef
    .globl  __xmm@00000000000080000000000000008000
    .section    .rdata,"dr",discard,__xmm@00000000000080000000000000008000
    .p2align    4
__xmm@00000000000080000000000000008000:
    .quad   32768
    .quad   32768
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .p2align    4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
    .seh_handler __CxxFrameHandler3, @unwind, @except
    pushq   %rbp
    .seh_pushreg 5
    pushq   %r14
    .seh_pushreg 14
    pushq   %rsi
    .seh_pushreg 6
    pushq   %rdi
    .seh_pushreg 7
    pushq   %rbx
    .seh_pushreg 3
    subq    $128, %rsp
    .seh_stackalloc 128
    leaq    128(%rsp), %rbp
    .seh_setframe 5, 128
    .seh_endprologue
    movq    $-2, -8(%rbp)
    leaq    -56(%rbp), %r8
    movl    $32768, %ecx
    movl    $1, %edx
    callq   __rust_alloc
    movq    %rax, %r14
    testq   %r14, %r14
    je  .LBB6_23
    movl    $32768, %edi
    movl    $1, %edx
    movl    $32768, %r8d
    movq    %r14, %rcx
    callq   memset
    movq    %r14, -56(%rbp)
    movaps  __xmm@00000000000080000000000000008000(%rip), %xmm0
    movups  %xmm0, -48(%rbp)
    movw    $3, %r8w
    xorl    %edx, %edx
    movw    $3, %r9w
    cmpb    $0, (%r14,%rdx)
    jne .LBB6_3
    jmp .LBB6_12
    .p2align    4, 0x90
.LBB6_14:
    movq    -56(%rbp), %r14
    cmpb    $0, (%r14,%rdx)
    je  .LBB6_12
.LBB6_3:
    movzwl  %r9w, %ecx
    cmpq    %rcx, %rdi
    jb  .LBB6_4
    testw   %r8w, %r8w
    je  .LBB6_8
    cmpq    %rcx, %rdi
    je  .LBB6_12
    addq    %r14, %rcx
    movzwl  %r8w, %ebx
    addq    %r14, %rdi
    leaq    -1(%rbx), %rax
    addq    $1, %rcx
    .p2align    4, 0x90
.LBB6_11:
    movb    $0, -1(%rcx)
    movq    %rdi, %rsi
    subq    %rcx, %rsi
    addq    %rbx, %rcx
    cmpq    %rax, %rsi
    ja  .LBB6_11
.LBB6_12:
    addl    %r8d, %r9d
    addl    $2, %r8d
    addw    %r8w, %r9w
    js  .LBB6_16
    addq    $1, %rdx
    movq    -40(%rbp), %rdi
    cmpq    %rdx, %rdi
    ja  .LBB6_14
.Ltmp8:
    leaq    panic_bounds_check_loc.j(%rip), %rcx
    movq    %rdi, %r8
    callq   _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp9:
    jmp .LBB6_6
.LBB6_16:
    movq    -40(%rbp), %rax
    movq    %rax, -64(%rbp)
    movups  -56(%rbp), %xmm0
    movaps  %xmm0, -80(%rbp)
    cmpq    $0, -64(%rbp)
    je  .LBB6_17
    movq    -80(%rbp), %rsi
    movq    %rsi, -96(%rbp)
    leaq    _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
    movq    %rax, -88(%rbp)
    leaq    ref.m(%rip), %rax
    movq    %rax, -56(%rbp)
    movq    $2, -48(%rbp)
    leaq    ref.n(%rip), %rax
    movq    %rax, -40(%rbp)
    movq    $1, -32(%rbp)
    leaq    -96(%rbp), %rax
    movq    %rax, -24(%rbp)
    movq    $1, -16(%rbp)
.Ltmp4:
    leaq    -56(%rbp), %rcx
    callq   _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp5:
    movq    -72(%rbp), %rdx
    testq   %rdx, %rdx
    je  .LBB6_21
    movl    $1, %r8d
    movq    %rsi, %rcx
    callq   __rust_dealloc
.LBB6_21:
    nop
    addq    $128, %rsp
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    popq    %rbp
    retq
.LBB6_4:
.Ltmp2:
    movq    %rdi, %rdx
    callq   _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp3:
    jmp .LBB6_6
.LBB6_8:
.Ltmp0:
    leaq    ref.b(%rip), %rcx
    callq   _ZN4core9panicking5panic17h42feaa2e0dc2c607E
.Ltmp1:
.LBB6_6:
    ud2
.LBB6_23:
    movups  -48(%rbp), %xmm0
    movaps  %xmm0, -80(%rbp)
    movaps  -80(%rbp), %xmm0
    movups  %xmm0, -48(%rbp)
    leaq    -56(%rbp), %rcx
    callq   _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
    ud2
.LBB6_17:
.Ltmp6:
    leaq    panic_bounds_check_loc.j(%rip), %rcx
    xorl    %edx, %edx
    xorl    %r8d, %r8d
    callq   _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
    jmp .LBB6_6
    .seh_handlerdata
    .long   ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .seh_endproc
    .def     "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
    .scl    3;
    .type   32;
    .endef
    .p2align    4, 0x90
"?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_15:
    movq    %rdx, 16(%rsp)
    pushq   %rbp
    .seh_pushreg 5
    pushq   %r14
    .seh_pushreg 14
    pushq   %rsi
    .seh_pushreg 6
    pushq   %rdi
    .seh_pushreg 7
    pushq   %rbx
    .seh_pushreg 3
    subq    $32, %rsp
    .seh_stackalloc 32
    leaq    128(%rdx), %rbp
    .seh_endprologue
    leaq    -56(%rbp), %rcx
    callq   _ZN4core3ptr13drop_in_place17h98ac405189abf599E
    nop
    addq    $32, %rsp
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    popq    %rbp
    retq
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .seh_endproc
    .def     "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
    .scl    3;
    .type   32;
    .endef
    .p2align    4, 0x90
"?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_22:
    movq    %rdx, 16(%rsp)
    pushq   %rbp
    .seh_pushreg 5
    pushq   %r14
    .seh_pushreg 14
    pushq   %rsi
    .seh_pushreg 6
    pushq   %rdi
    .seh_pushreg 7
    pushq   %rbx
    .seh_pushreg 3
    subq    $32, %rsp
    .seh_stackalloc 32
    leaq    128(%rdx), %rbp
    .seh_endprologue
    leaq    -80(%rbp), %rcx
    callq   _ZN4core3ptr13drop_in_place17h98ac405189abf599E
    nop
    addq    $32, %rsp
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    popq    %rbp
    retq
.Lfunc_end0:
    .seh_handlerdata
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
    .seh_endproc
    .section    .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
    .p2align    2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
    .long   429065506
    .long   2
    .long   ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
    .long   0
    .long   0
    .long   6
    .long   ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
    .long   120
    .long   0
    .long   1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
    .long   -1
    .long   "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
    .long   -1
    .long   "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
    .long   .Lfunc_begin0@IMGREL
    .long   -1
    .long   .Ltmp8@IMGREL+1
    .long   0
    .long   .Ltmp4@IMGREL+1
    .long   1
    .long   .Ltmp2@IMGREL+1
    .long   0
    .long   .Ltmp6@IMGREL+1
    .long   1
    .long   .Ltmp7@IMGREL+1
    .long   -1
    .section    .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E

    .def     main;
    .scl    2;
    .type   32;
    .endef
    .section    .text,"xr",one_only,main
    .globl  main
    .p2align    4, 0x90
main:
.seh_proc main
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    movq    %rdx, %rax
    movslq  %ecx, %r8
    leaq    _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
    movq    %rcx, 32(%rsp)
    leaq    vtable.4(%rip), %rdx
    leaq    32(%rsp), %rcx
    movq    %rax, %r9
    callq   _ZN3std2rt19lang_start_internal17h273003faf754a099E
    nop
    addq    $40, %rsp
    retq
    .seh_handlerdata
    .section    .text,"xr",one_only,main
    .seh_endproc

    .section    .rdata,"dr",one_only,vtable.4
    .p2align    3
vtable.4:
    .quad   _ZN4core3ptr13drop_in_place17hd909dec568d984beE
    .quad   8
    .quad   8
    .quad   _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .quad   _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
    .quad   _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE

    .section    .rdata,"dr",one_only,str.9
    .p2align    4
str.9:
    .ascii  "assertion failed: step != 0"

    .section    .rdata,"dr",one_only,str.a
    .p2align    4
str.a:
    .ascii  "libcore\\iter\\iterator.rs"

    .section    .rdata,"dr",one_only,ref.b
    .p2align    3
ref.b:
    .quad   str.9
    .quad   27
    .quad   str.a
    .quad   24
    .long   299
    .long   9

    .section    .rdata,"dr",one_only,str.i
    .p2align    4
str.i:
    .ascii  "C:\\projects\\rust\\src\\liballoc\\vec.rs"

    .section    .rdata,"dr",one_only,panic_bounds_check_loc.j
    .p2align    3
panic_bounds_check_loc.j:
    .quad   str.i
    .quad   36
    .long   1551
    .long   10

    .section    .rdata,"dr",one_only,str.k
str.k:

    .section    .rdata,"dr",one_only,str.l
str.l:
    .byte   10

    .section    .rdata,"dr",one_only,ref.m
    .p2align    3
ref.m:
    .quad   str.k
    .quad   0
    .quad   str.l
    .quad   1

    .section    .rdata,"dr",one_only,ref.n
    .p2align    3
ref.n:
    .quad   1
    .quad   0
    .quad   3
    .zero   8
    .quad   3
    .zero   8
    .long   32
    .long   0
    .byte   3
    .zero   7

1 个答案:

答案 0 :(得分:3)

我从未尝试过Rust,但我知道一些很好的工具可以进行这样的性能分析。因此,虽然它可能无法完全回答您的问题,但您将获得更多信息。

在尝试理解底层性能时,必须查看生成的汇编代码,从您提供的输出来看,您似乎已经这样做了。但是,这些输出非常难以阅读。这正是我偏爱的工具 Compiler Explorer 派上用场的地方。您可以在该工具中查看您的代码。

从生成的程序集中,我们看到了一些差异。让我们只关注循环的两个元素(你可以检查其他元素,但想法和结果是一样的)

32 位版本中的循环条件测试:

mov     rax, qword ptr [rbp - 112]
cmp     qword ptr [rbp - 64], rax
jb      .LBB124_5

16 位版本中的同一测试:

mov     ax, word ptr [rbp - 98]
cmp     word ptr [rbp - 52], ax
jb      .LBB125_5
32 位版本中对 mark_sieve 的调用:

.LBB124_8:
    mov     rax, qword ptr [rbp - 64]
    mov     qword ptr [rbp - 48], rax
    mov     rsi, qword ptr [rbp - 48]
    lea     rdi, [rbp - 96]
    call    <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
    mov     qword ptr [rbp - 136], rdx
    mov     qword ptr [rbp - 144], rax
    jmp     .LBB124_9
.LBB124_9:
    mov     rdx, qword ptr [rbp - 56]
    mov     rdi, qword ptr [rbp - 144]
    mov     rsi, qword ptr [rbp - 136]
    call    example::mark_sieve
    jmp     .LBB124_10

而在 16 位版本中:

.LBB125_8:
    movzx   eax, word ptr [rbp - 52]
    mov     ecx, eax
    mov     qword ptr [rbp - 48], rcx
    mov     rsi, qword ptr [rbp - 48]
    lea     rdi, [rbp - 80]
    call    <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
    mov     qword ptr [rbp - 120], rdx
    mov     qword ptr [rbp - 128], rax
    jmp     .LBB125_9
.LBB125_9:
    movzx   eax, word ptr [rbp - 50]
    mov     edx, eax
    mov     rdi, qword ptr [rbp - 128]
    mov     rsi, qword ptr [rbp - 120]
    call    example::mark_sieve
    jmp     .LBB125_10

我们可以在此代码中看到一些差异:

  • u16 代码可以使用 16 位(ax、cx、dx)或 32 位(eax、ecx、edx)寄存器,而 u32 代码仅使用 64 位(rax、rcx、rdx)寄存器。
  • u16 代码使用 movzx 代替 mov 进行指针解引用。
  • u16 读取 16 位的内存(word),而 u32 读取 64 位(qword)。

对于这些不同的指令,您可以查阅出色的 Agner 指令表(Agner Instruction Tables)来了解它们相对执行时间的差异。(顺便说一句,我很希望看到这些表被集成到 Compiler Explorer 中。)您的 CPU 似乎是 Kaby Lake(因此我们将参考 Skylake 架构),所以我们从第 231 页开始查表(请先阅读该页,了解表中使用的缩写)。

从 Agner 表中可以看到,mov r64,m 和 movzx r,m 的 uop 数相同(1 个 uop),但 mov 多出 2 个周期的延迟。

不同的累加器也会改变处理器的一些优化。

编译器还执行其他体系结构相关的优化,例如循环展开,具体取决于CPU上可用的ALU数量。因此,根据编译器的决定,您的代码在不同的CPU之间可能会有不同的行为。

差异也可能是由于代码对齐或缓存优化造成的。

关于电源管理带来的差异,这可能受两个因素影响:频率上限和 C 状态(C-state)管理。C 状态是指 CPU 在短时间内进入的各种睡眠状态。睡眠/唤醒对 CPU 内部的影响取决于固件。因此,我们无法真正详细地检查(进行性能分析也会改变结果)。

我写过一篇关于理解 Meltdown/Spectre 攻击的文章,它解释了 CPU 在底层可以进行的各种优化(这些优化甚至在汇编代码中都无法体现)。您可能也想看看它,以便更好地理解为什么 CPU 优化很难:因为有很多参数是我们无法控制的。

祝您玩得开心(Happy hacking)!