为什么通过替换默认值来填充Vec比填充预设容量的东西快得多?

时间:2018-03-27 17:56:29

标签: vector rust

前言:我一般不是优化者。

大多数情况下,在解决Rust中的编码难题时,我使用Vec::with_capacity来初始化我的向量,然后通过push将项目插入到向量中。在大多数情况下,这很好,但我最近遇到了一个需要更快程序的谜题,这激发了我重新思考我的方法。

因为我知道向量的容量正好是某个确定的数字,所以我决定把我常用的with_capacity + push方法,与先创建一个全为0的向量、再逐个替换元素的做法进行比较。这是我用来对两个操作进行基准测试的代码:

#![feature(test)]

extern crate test;

#[cfg(test)]
mod tests {
    use test::Bencher;

    // Create a vector with a capacity of 10,000 u16s
    // and populate it by pushing.
    #[bench]
    fn push_fill(b: &mut Bencher) {
        b.iter(|| {
            let mut v: Vec<u16> = Vec::with_capacity(10000);
            for i in 0..10000 as u16 {
                v.push(i);
            }
        })
    }

    // Create a vector of 10,000 u16s, initialize them
    // to 0, and then replace them to populate the vector.
    #[bench]
    fn replace_fill(b: &mut Bencher) {
        b.iter(|| {
            let mut v: Vec<u16> = vec![0u16; 10000];
            for i in 0..10000 {
                v[i] = i as u16;
            }
        })
    }
}

令我惊讶的是,当我运行cargo +nightly bench时,替换解决方案比with_capacity解决方案好一个数量级。

   Compiling benchmarks v0.1.0 (file:///C:/Users/CEUser/Documents/Programs/rustprojects/benchmarks)
    Finished release [optimized] target(s) in 10.75 secs
     Running target\release\deps\benchmarks-0b553bf1dfb7e9a4.exe

running 2 tests
test tests::push_fill    ... bench:      26,756 ns/iter (+/- 4,046)
test tests::replace_fill ... bench:       1,902 ns/iter (+/- 802)

test result: ok. 0 passed; 0 failed; 0 ignored; 2 measured; 0 filtered out

我对时间上的差异感到惊讶,特别是考虑到我预计replace_fill版本需要更长时间(因为它必须先创建一个充满填充数据的向量,然后再用实际数据替换这些填充数据)。

是否有一个直观的原因可以解释为什么replace_fill比push_fill快得多?这两个函数所做的事情之间有什么区别?

1 个答案:

答案 0 :(得分:2)

如有疑问,请检查汇编代码!

您可以使用godbolt或Rust playground;不过我更喜欢godbolt,因为它使用高亮显示将汇编代码的各部分与源代码对应起来,使其更容易探索。

在上面的链接中,replace_fill功能已优化为:

# example::replace_fill -- optimized compiler output for the body of the
# `replace_fill` benchmark (x86-64, Intel syntax).
# Flow: allocate a zeroed 20000-byte buffer, overwrite it with 128-bit vector
# stores in the loop at .LBB3_2, free the buffer, return.
# .LBB3_4 is only reached when the allocation returns null.
example::replace_fill:
  push rbp
  mov rbp, rsp
  sub rsp, 48
  # Arguments for the allocator call: edi = 20000 (10000 elements * 2 bytes per
  # u16), esi = 2, rdx = address of a stack slot.
  # NOTE(review): esi is presumably the alignment and rdx an error
  # out-parameter -- confirm against the allocator shim of this Rust version.
  lea rdx, [rbp - 24]
  mov edi, 20000
  mov esi, 2
  call __rust_alloc_zeroed@PLT
  # rax = returned buffer pointer; a null result branches to the OOM path.
  test rax, rax
  je .LBB3_4
  # Load six 128-bit constants. Their values (.LCPI3_0 .. .LCPI3_5) are not
  # part of this listing.
  # NOTE(review): presumably xmm0 = the first 8 values and xmm1..xmm5 = per-lane
  # offsets -- confirm against the constant pool.
  movdqa xmm0, xmmword ptr [rip + .LCPI3_0]
  # rcx = element index, biased by +32 so the first store below
  # (rax + 2*rcx - 64) lands at byte offset 0.
  mov ecx, 32
  movdqa xmm1, xmmword ptr [rip + .LCPI3_1]
  movdqa xmm2, xmmword ptr [rip + .LCPI3_2]
  movdqa xmm3, xmmword ptr [rip + .LCPI3_3]
  movdqa xmm4, xmmword ptr [rip + .LCPI3_4]
  movdqa xmm5, xmmword ptr [rip + .LCPI3_5]
  # Main loop: five 16-byte stores per iteration = 40 u16 elements.
  # rcx runs from 32 to 10032 in steps of 40, i.e. 250 iterations = 10000
  # elements. No Vec method calls and no bounds checks appear inside the loop.
.LBB3_2:
  # Store the running vector xmm0 as-is.
  movdqu xmmword ptr [rax + 2*rcx - 64], xmm0
  # Each following group copies xmm0, adds one of the constant vectors
  # (16-bit lane-wise add) and stores the result 16 bytes further on.
  movdqa xmm6, xmm0
  paddw xmm6, xmm1
  movdqu xmmword ptr [rax + 2*rcx - 48], xmm6
  movdqa xmm6, xmm0
  paddw xmm6, xmm2
  movdqu xmmword ptr [rax + 2*rcx - 32], xmm6
  movdqa xmm6, xmm0
  paddw xmm6, xmm3
  movdqu xmmword ptr [rax + 2*rcx - 16], xmm6
  movdqa xmm6, xmm0
  paddw xmm6, xmm4
  movdqu xmmword ptr [rax + 2*rcx], xmm6
  # Advance the running vector by xmm5 and the index by 40 elements.
  paddw xmm0, xmm5
  add rcx, 40
  cmp rcx, 10032
  jne .LBB3_2
  # Loop finished: free the buffer, passing rdi = pointer, esi = 20000, edx = 2.
  mov esi, 20000
  mov edx, 2
  mov rdi, rax
  call __rust_dealloc@PLT
  add rsp, 48
  pop rbp
  ret
  # Allocation-failure path: shuffles the stack slots that the allocator call
  # was given, then calls __rust_oom. The ud2 after the call raises an
  # invalid-opcode exception if execution ever reaches it.
.LBB3_4:
  mov rax, qword ptr [rbp - 24]
  movups xmm0, xmmword ptr [rbp - 16]
  movaps xmmword ptr [rbp - 48], xmm0
  mov qword ptr [rbp - 24], rax
  movaps xmm0, xmmword ptr [rbp - 48]
  movups xmmword ptr [rbp - 16], xmm0
  lea rdi, [rbp - 24]
  call __rust_oom@PLT
  ud2

后一部分(.LBB3_4)是OOM(内存不足)处理,在正常情况下不会被执行。因此,执行流程如下:

  • example::replace_fill,执行分配+初始设置,
  • .LBB3_2这就是循环。

有两个要素:

  • 根本没有Vec代码,
  • 使用的都是向量(SIMD)指令。

另一方面,push_fill有点复杂:

# example::push_fill -- optimized compiler output for the body of the
# `push_fill` benchmark (x86-64, Intel syntax).
# Flow: allocate 20000 bytes (not zeroed), keep three Vec-related words on the
# stack, then store one u16 per loop iteration, comparing the length against
# the capacity before every store. Finally free the buffer and return.
example::push_fill:
  push rbp
  mov rbp, rsp
  # rbx, r14 and r15 are saved here and restored in the epilogue at .LBB2_5.
  push r15
  push r14
  push rbx
  sub rsp, 40
  # Allocator call with edi = 20000 (10000 elements * 2 bytes per u16),
  # esi = 2, rdx = address of a stack slot.
  # NOTE(review): esi is presumably the alignment -- confirm.
  lea rdx, [rbp - 48]
  mov edi, 20000
  mov esi, 2
  call __rust_alloc@PLT
  mov rcx, rax
  # A null result branches to the OOM path.
  test rcx, rcx
  je .LBB2_11
  # Stack-resident state: [rbp-48] = buffer pointer, [rbp-40] = 10000,
  # [rbp-32] = 0.
  # NOTE(review): from how they are used below, [rbp-40] acts as the capacity
  # and [rbp-32] as the length -- confirm the Vec field layout.
  mov qword ptr [rbp - 48], rcx
  mov qword ptr [rbp - 40], 10000
  mov qword ptr [rbp - 32], 0
  # r15 = current value to store (starts at 0), r14 = address of the stack
  # state, rsi = register copy of the length.
  xor r15d, r15d
  lea r14, [rbp - 48]
  xor esi, esi
  # Loop: one element per iteration.
.LBB2_2:
  # bx = r15w + 1 as a 16-bit add. If the add carries (wrap past 0xFFFF), bx is
  # restored to r15w and the loop is left; cmovb does not modify the flags, so
  # jb still tests the carry from the add.
  mov ebx, r15d
  add bx, 1
  cmovb bx, r15w
  jb .LBB2_3
  # Capacity check on every iteration: when length != capacity there is room
  # and the grow call is skipped.
  cmp rsi, qword ptr [rbp - 40]
  jne .LBB2_9
  # Length == capacity: call RawVec::double on the stack state, then reload the
  # buffer pointer and the length from memory.
  mov rdi, r14
  call <alloc::raw_vec::RawVec<T, A>>::double
  mov rcx, qword ptr [rbp - 48]
  mov rsi, qword ptr [rbp - 32]
.LBB2_9:
  # Store the current value at buffer[length].
  mov word ptr [rcx + 2*rsi], r15w
  # The length is reloaded from the stack, incremented and written back on
  # every iteration.
  mov rsi, qword ptr [rbp - 32]
  inc rsi
  mov qword ptr [rbp - 32], rsi
  # Continue while the next value is below 10000 (unsigned compare). The mov in
  # between does not affect the flags set by cmp.
  movzx eax, bx
  cmp eax, 10000
  mov r15w, bx
  jb .LBB2_2
  # Loop exit: free the buffer unless [rbp-40] is zero. The size passed in rsi
  # is [rbp-40] doubled (2 bytes per element), edx = 2.
.LBB2_3:
  mov rsi, qword ptr [rbp - 40]
  test rsi, rsi
  je .LBB2_5
  add rsi, rsi
  mov rdi, qword ptr [rbp - 48]
  mov edx, 2
  call __rust_dealloc@PLT
.LBB2_5:
  add rsp, 40
  pop rbx
  pop r14
  pop r15
  pop rbp
  ret
  # Allocation-failure path: calls __rust_oom; the ud2 after the call raises an
  # invalid-opcode exception if execution ever reaches it.
.LBB2_11:
  movups xmm0, xmmword ptr [rbp - 40]
  movaps xmmword ptr [rbp - 64], xmm0
  movaps xmm0, xmmword ptr [rbp - 64]
  movups xmmword ptr [rbp - 40], xmm0
  lea rdi, [rbp - 48]
  call __rust_oom@PLT
  ud2
  # NOTE(review): the remaining instructions have no label in this listing and
  # cannot be reached by fall-through past ud2; this looks like an unwind
  # cleanup pad: rax is kept in rbx, drop_in_place runs on the stack state, and
  # rbx is then passed to _Unwind_Resume.
  mov rbx, rax
  lea rdi, [rbp - 48]
  call core::ptr::drop_in_place
  mov rdi, rbx
  call _Unwind_Resume@PLT
  ud2

更多的块,意味着更多的分支,在循环的每次迭代中检查容量不足,...

但上述两个例子都不是惯用的写法。

以下是我写这些内容的方法:

/// Builds an empty `Vec<u16>` and fills it with the values `0..10000`
/// through a single `extend` call on a range iterator.
///
/// The vector is dropped when the function returns; nothing is returned.
/// `#[inline(never)]` prevents the compiler from inlining this function
/// into its callers.
#[inline(never)]
pub fn extend_fill() {
    let mut values: Vec<u16> = Vec::new();
    values.extend(0..10_000u16);
}

此方法来自Extend trait的实现。当与可信长度(TrustedLen)迭代器(如此处的范围)一起使用时,它会在必要时只执行一次“增长”步骤,然后直接推入元素,无需再次检查容量。

生成的汇编不像replace_fill那么精简,但看起来还不错:

# example::extend_fill -- optimized compiler output for `extend_fill`
# (x86-64, Intel syntax).
# Flow: initialise three Vec-related words on the stack, allocate 20000 bytes
# once, fill the buffer with 128-bit vector stores (16 u16 elements per loop
# half, two halves per full pass), write the final length once, free the
# buffer and return.
example::extend_fill:
  push rbp
  mov rbp, rsp
  sub rsp, 64
  # Initial stack state: [rbp-24] = 2, [rbp-16] and [rbp-8] = 0.
  # NOTE(review): presumably the empty Vec (placeholder pointer equal to the
  # alignment, capacity 0, length 0) -- confirm the field layout.
  mov qword ptr [rbp - 24], 2
  xorps xmm0, xmm0
  movups xmmword ptr [rbp - 16], xmm0
  # Single allocator call with edi = 20000 (10000 elements * 2 bytes per u16),
  # esi = 2, rdx = address of a stack slot.
  lea rdx, [rbp - 48]
  mov edi, 20000
  mov esi, 2
  call __rust_alloc@PLT
  # A null result branches to the OOM path.
  test rax, rax
  je .LBB4_7
  # Record the buffer pointer and the value 10000 in the stack state.
  mov qword ptr [rbp - 24], rax
  mov qword ptr [rbp - 16], 10000
  # rcx = element index, starting at 0.
  xor ecx, ecx
  # Two 128-bit constants; their values (.LCPI4_0, .LCPI4_1) are not part of
  # this listing.
  # NOTE(review): presumably lane offsets 0..7 and 8..15 -- confirm.
  movdqa xmm0, xmmword ptr [rip + .LCPI4_0]
  movdqa xmm1, xmmword ptr [rip + .LCPI4_1]
  jmp .LBB4_2
  # Second half of the unrolled loop: same pattern as .LBB4_2, but the base
  # value comes from edx (= rcx + 16) and the stores go to byte offsets +32 and
  # +48 relative to rcx.
.LBB4_6:
  movd xmm2, edx
  pshuflw xmm2, xmm2, 0
  pshufd xmm2, xmm2, 80
  movdqa xmm3, xmm2
  paddw xmm3, xmm0
  paddw xmm2, xmm1
  movdqu xmmword ptr [rax + 2*rcx + 32], xmm3
  movdqu xmmword ptr [rax + 2*rcx + 48], xmm2
  # Advance to the next group of 32 elements: rcx = old rcx + 32.
  add rdx, 16
  mov rcx, rdx
  # First half of the unrolled loop (also the loop entry).
.LBB4_2:
  # Broadcast the low 16 bits of the index into all eight 16-bit lanes of xmm2:
  # pshuflw with 0 copies word 0 into the four low words, pshufd with 80
  # (0b01010000) then replicates the two low dwords across the register.
  movd xmm2, ecx
  pshuflw xmm2, xmm2, 0
  pshufd xmm2, xmm2, 80
  # xmm3 = broadcast + xmm0, xmm2 = broadcast + xmm1 (16-bit lane-wise adds).
  movdqa xmm3, xmm2
  paddw xmm3, xmm0
  paddw xmm2, xmm1
  # Two 16-byte stores = 16 u16 elements starting at element index rcx.
  movdqu xmmword ptr [rax + 2*rcx], xmm3
  movdqu xmmword ptr [rax + 2*rcx + 16], xmm2
  # Exit once rcx + 16 reaches 10000; otherwise run the second half.
  # No capacity check appears anywhere inside the loop.
  lea rdx, [rcx + 16]
  cmp rdx, 10000
  jne .LBB4_6
  # After the loop the value 10000 is written once to [rbp-8]
  # (NOTE(review): presumably the Vec length -- confirm).
  mov qword ptr [rbp - 8], 10000
  # Free the buffer unless [rbp-16] is zero. The size passed in rsi is
  # [rbp-16] doubled (2 bytes per element), edx = 2.
  mov rsi, qword ptr [rbp - 16]
  test rsi, rsi
  je .LBB4_5
  add rsi, rsi
  mov rdi, qword ptr [rbp - 24]
  mov edx, 2
  call __rust_dealloc@PLT
.LBB4_5:
  add rsp, 64
  pop rbp
  ret
  # Allocation-failure path: calls __rust_oom; the ud2 after the call raises an
  # invalid-opcode exception if execution ever reaches it.
.LBB4_7:
  movups xmm0, xmmword ptr [rbp - 40]
  movaps xmmword ptr [rbp - 64], xmm0
  movaps xmm0, xmmword ptr [rbp - 64]
  movups xmmword ptr [rbp - 40], xmm0
  lea rdi, [rbp - 48]
  call __rust_oom@PLT
  ud2

我鼓励你尝试一下,一般熟悉Rust迭代器:甜蜜的代码,良好的性能,它们是你需要的工具。