从一个数组中减去另一个数组的最佳方法

时间:2011-02-15 19:58:17

标签: performance delphi x86 sse

我有以下代码,这是我的应用程序的一部分的瓶颈。我所做的只是从另一个中减去Array。这两个阵列都有大约100000个元素。我正试图找到一种方法来提高性能。

var
  Array1, Array2 : array of integer;

..... 
// Code that fills the arrays
.....

for ix := 0 to length(array1)-1
  Array1[ix] := Array1[ix] - Array2[ix];

end;

有人有建议吗?

5 个答案:

答案 0 :(得分:9)

在多个线程上运行它,使用那个大的数组将净线性加速。正如他们所说,这是令人尴尬的平行。

答案 1 :(得分:9)

在更多线程上运行减法听起来不错,但100K整数sunstraction不占用大量CPU时间,所以也许是线程池...但是设置线程也有很多开销,所以短阵列的并行性能会降低线程比只有一个(主)线程!

您是否关闭了编译器设置,溢出和范围检查?

你可以尝试使用asm rutine,它非常简单......

类似的东西:

procedure SubArray(var ar1, ar2; length: integer);
asm
//length must be > than 0!
   push ebx
   lea  ar1, ar1 -4
   lea  ar2, ar2 -4
@Loop:
   mov  ebx, [ar2 + length *4]
   sub  [ar1 + length *4], ebx
   dec  length
//Here you can put more folloving parts of rutine to more unrole it to speed up.
   jz   @exit
   mov  ebx, [ar2 + length *4]
   sub  [ar1 + length *4], ebx
   dec  length
//
   jnz  @Loop
@exit:
   pop  ebx
end;


begin
   SubArray(Array1[0], Array2[0], length(Array1));

它可以更快......

编辑:使用SIMD说明添加了程序。 此过程请求SSE CPU支持。它可以在XMM寄存器中取4个整数并一次减去。也可以使用movdqa代替movdqu它更快,但您必须首先确保16字节对齐。您也可以像我的第一个asm案例一样解除XMM标准。 (我对速度测量感兴趣。:))

var
  array1, array2: array of integer;

procedure SubQIntArray(var ar1, ar2; length: integer);
asm
//prepare length if not rounded to 4
  push     ecx
  shr      length, 2
  jz       @LengthToSmall
@Loop:
  movdqu   xmm1, [ar1]          //or movdqa but ensure 16b aligment first
  movdqu   xmm2, [ar2]          //or movdqa but ensure 16b aligment first
  psubd    xmm1, xmm2
  movdqu   [ar1], xmm1          //or movdqa but ensure 16b aligment first
  add      ar1, 16
  add      ar2, 16
  dec      length
  jnz      @Loop
@LengthToSmall:
  pop      ecx
  push     ebx
  and      ecx, 3
  jz       @Exit
  mov      ebx, [ar2]
  sub      [ar1], ebx
  dec      ecx
  jz       @Exit
  mov      ebx, [ar2 + 4]
  sub      [ar1 + 4], ebx
  dec      ecx
  jz       @Exit
  mov      ebx, [ar2 + 8]
  sub      [ar1 + 8], ebx
@Exit:
  pop      ebx
end;

begin
//Fill arrays first!
  SubQIntArray(Array1[0], Array2[0], length(Array1));

答案 2 :(得分:5)

在这个简单的案例中,我对速度优化非常好奇。 所以我制作了6个简单的程序,测量数据块大小为100000的CPU时钟和时间;

  1. 使用编译器选项Range和Overflow Checking On
  2. 的Pascal过程
  3. 使用编译器选项Range和Overflow Check off的Pascal过程
  4. Classic x86汇编程序。
  5. 使用SSE指令和未对齐的16字节移动
  6. 的汇编程序。
  7. 使用SSE指令和对齐16字节移动
  8. 的汇编程序。
  9. 汇编程序8次展开循环,带有SSE指令,对齐16字节移动
  10. 检查图片和代码的结果以获取更多信息。 enter image description here

    要获得16字节的内存对齐,请首先在文件'FastMM4Options.inc'指令中添加点{$ .define Align16Bytes} !

    program SubTest;
    
    {$APPTYPE CONSOLE}
    
    uses
    //In file 'FastMM4Options.inc' delite the dot in directive {$.define Align16Bytes}
    //to get 16 byte memory alignment!
      FastMM4,
      windows,
      SysUtils;
    
    var
      Ar1                   :array of integer;
      Ar2                   :array of integer;
      ArLength              :integer;
      StartTicks            :int64;
      EndTicks              :int64;
      TicksPerMicroSecond   :int64;
    
    function GetCpuTicks: int64;
    asm
      rdtsc
    end;
    {$R+}
    {$Q+}
    procedure SubArPasRangeOvfChkOn(length: integer);
    var
      n: integer;
    begin
      for n := 0 to length -1 do
        Ar1[n] := Ar1[n] - Ar2[n];
    end;
    {$R-}
    {$Q-}
    procedure SubArPas(length: integer);
    var
      n: integer;
    begin
      for n := 0 to length -1 do
        Ar1[n] := Ar1[n] - Ar2[n];
    end;
    
    procedure SubArAsm(var ar1, ar2; length: integer);
    asm
    //Length must be > than 0!
       push ebx
       lea  ar1, ar1 - 4
       lea  ar2, ar2 - 4
    @Loop:
       mov  ebx, [ar2 + length * 4]
       sub  [ar1 + length * 4], ebx
       dec  length
       jnz  @Loop
    @exit:
       pop  ebx
    end;
    
    procedure SubArAsmSimdU(var ar1, ar2; length: integer);
    asm
    //Prepare length
      push     length
      shr      length, 2
      jz       @Finish
    @Loop:
      movdqu   xmm1, [ar1]
      movdqu   xmm2, [ar2]
      psubd    xmm1, xmm2
      movdqu   [ar1], xmm1
      add      ar1, 16
      add      ar2, 16
      dec      length
      jnz      @Loop
    @Finish:
      pop      length
      push     ebx
      and      length, 3
      jz       @Exit
    //Do rest, up to 3 subtractions...
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 8]
      sub      [ar1 + 8], ebx
    @Exit:
      pop      ebx
    end;
    
    procedure SubArAsmSimdA(var ar1, ar2; length: integer);
    asm
      push     ebx
    //Unfortunately delphi use first 8 bytes for dinamic array length and reference
    //counter, from that reason the dinamic array address should start with $xxxxxxx8
    //instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @exit
      add      ar1, 8
      add      ar2, 8
    //Prepare length for 16 byte data transfer
      push     length
      shr      length, 2
      jz       @Finish
    @Loop:
      movdqa   xmm1, [ar1]
      movdqa   xmm2, [ar2]
      psubd    xmm1, xmm2
      movdqa   [ar1], xmm1
      add      ar1, 16
      add      ar2, 16
      dec      length
      jnz      @Loop
    @Finish:
      pop      length
      and      length, 3
      jz       @Exit
    //Do rest, up to 3 subtractions...
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 8]
      sub      [ar1 + 8], ebx
    @Exit:
      pop      ebx
    end;
    
    procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
    asm
      push     ebx
    //Unfortunately delphi use first 8 bytes for dinamic array length and reference
    //counter, from that reason the dinamic array address should start with $xxxxxxx8
    //instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @exit
      add      ar1, 8                       //Align pointer to 16 byte
      add      ar2, 8                       //Align pointer to 16 byte
    //Prepare length for 16 byte data transfer
      push     length
      shr      length, 5                    //8 * 4 subtructions per loop
      jz       @Finish                      //To small for LoopUnrolled
    @LoopUnrolled:
    //Unrolle 1, 2, 3, 4
      movdqa   xmm4, [ar2]
      movdqa   xmm5, [16 + ar2]
      movdqa   xmm6, [32 + ar2]
      movdqa   xmm7, [48 + ar2]
    //
      movdqa   xmm0, [ar1]
      movdqa   xmm1, [16 + ar1]
      movdqa   xmm2, [32 + ar1]
      movdqa   xmm3, [48 + ar1]
    //
      psubd    xmm0, xmm4
      psubd    xmm1, xmm5
      psubd    xmm2, xmm6
      psubd    xmm3, xmm7
    //
      movdqa   [48 + ar1], xmm3
      movdqa   [32 + ar1], xmm2
      movdqa   [16 + ar1], xmm1
      movdqa   [ar1], xmm0
    //Unrolle 5, 6, 7, 8
      movdqa   xmm4, [64 + ar2]
      movdqa   xmm5, [80 + ar2]
      movdqa   xmm6, [96 + ar2]
      movdqa   xmm7, [112 + ar2]
    //
      movdqa   xmm0, [64 + ar1]
      movdqa   xmm1, [80 + ar1]
      movdqa   xmm2, [96 + ar1]
      movdqa   xmm3, [112 + ar1]
    //
      psubd    xmm0, xmm4
      psubd    xmm1, xmm5
      psubd    xmm2, xmm6
      psubd    xmm3, xmm7
    //
      movdqa   [112 + ar1], xmm3
      movdqa   [96 + ar1], xmm2
      movdqa   [80 + ar1], xmm1
      movdqa   [64 + ar1], xmm0
    //
      add      ar1, 128
      add      ar2, 128
      dec      length
      jnz      @LoopUnrolled
    @FinishUnrolled:
      pop      length
      and      length, $1F
    //Do rest, up to 31 subtractions...
    @Finish:
      mov      ebx, [ar2]
      sub      [ar1], ebx
      add      ar1, 4
      add      ar2, 4
      dec      length
      jnz      @Finish
    @Exit:
      pop      ebx
    end;
    
    procedure WriteOut(EndTicks: Int64; Str: string);
    begin
      WriteLn(Str + IntToStr(EndTicks - StartTicks)
        + ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
      Sleep(5);
      SwitchToThread;
      StartTicks := GetCpuTicks;
    end;
    
    begin
      ArLength := 100000;
    //Set TicksPerMicroSecond
      QueryPerformanceFrequency(TicksPerMicroSecond);
      TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
    //
      SetLength(Ar1, ArLength);
      SetLength(Ar2, ArLength);
    //Fill arrays
    //...
    //Tick time info
      WriteLn('CPU ticks per mikro second: ' + IntToStr(TicksPerMicroSecond));
      Sleep(5);
      SwitchToThread;
      StartTicks := GetCpuTicks;
    //Test 1
      SubArPasRangeOvfChkOn(ArLength);
      WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
    //Test 2
      SubArPas(ArLength);
      WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
    //Test 3
      SubArAsm(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
    //Test 4
      SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
    //Test 5
      SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
    //Test 6
      SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
    //
      ReadLn;
      Ar1 := nil;
      Ar2 := nil;
    end.
    

    ...

    最快的asm程序,8次展开的SIMD指令只需68us,比Pascal程序快4倍。

    正如我们所看到的那样,Pascal循环过程可能并不重要,只需要大约277us(溢出和范围检查)在2,4GHz CPU上进行100000次减法。

    所以这段代码不能成为瓶颈?

答案 3 :(得分:4)

我不是汇编专家,但我认为如果不考虑SIMD指令或并行处理,以下几乎是最优的,后者可以通过将部分数组传递给函数来轻松完成。


Thread1:SubArray(ar1 [0],ar2 [0],50);
Thread2:SubArray(ar1 [50],ar2 [50],50);

procedure SubArray(var Array1, Array2; const Length: Integer);
var
  ap1, ap2 : PInteger;
  i : Integer;
begin
  ap1 := @Array1;
  ap2 := @Array2;
  i := Length;
  while i > 0 do
  begin
    ap1^ := ap1^ - ap2^;
    Inc(ap1);
    Inc(ap2);
    Dec(i);
  end;
end;

// similar assembly version
procedure SubArrayEx(var Array1, Array2; const Length: Integer);
asm
  // eax = @Array1
  // edx = @Array2
  // ecx = Length
  // esi = temp register for array2^
  push esi
  cmp ecx, 0
  jle @Exit
  @Loop:
  mov esi, [edx]
  sub [eax], esi
  add eax, 4
  add edx, 4
  dec ecx
  jnz @Loop
  @Exit:
  pop esi
end;


procedure Test();
var
  a1, a2 : array of Integer;
  i : Integer;
begin
  SetLength(a1, 3);
  a1[0] := 3;
  a1[1] := 1;
  a1[2] := 2;
  SetLength(a2, 3);
  a2[0] := 4;
  a2[1] := 21;
  a2[2] := 2;
  SubArray(a1[0], a2[0], Length(a1));

  for i := 0 to Length(a1) - 1 do
    Writeln(a1[i]);

  Readln;
end;

答案 4 :(得分:2)

这不是你问题的真正答案,但是我会研究是否可以在某个时间进行减法,而用数值填充数组。我甚至可以选择在内存中考虑第三个数组来存储减法的结果。在现代计算中,内存的“成本”远低于对内存执行额外操作所花费的时间的“成本”。

理论上,当值仍然在寄存器或处理器缓存中时,您可以获得至少一点性能,但实际上您可能会偶然发现一些可以提高整个算法性能的技巧