我有以下代码,这是我的应用程序的一部分的瓶颈。我所做的只是从另一个中减去Array。这两个阵列都有大约100000个元素。我正试图找到一种方法来提高性能。
var
Array1, Array2 : array of integer;
.....
// Code that fills the arrays
.....
for ix := 0 to length(array1)-1
Array1[ix] := Array1[ix] - Array2[ix];
end;
有人有建议吗?
答案 0 :(得分:9)
在多个线程上运行它,使用那个大的数组将净线性加速。正如他们所说,这是令人尴尬的平行。
答案 1 :(得分:9)
在更多线程上运行减法听起来不错,但100K整数sunstraction不占用大量CPU时间,所以也许是线程池...但是设置线程也有很多开销,所以短阵列的并行性能会降低线程比只有一个(主)线程!
您是否关闭了编译器设置,溢出和范围检查?
你可以尝试使用asm rutine,它非常简单......
类似的东西:
procedure SubArray(var ar1, ar2; length: integer);
asm
//length must be > than 0!
push ebx
lea ar1, ar1 -4
lea ar2, ar2 -4
@Loop:
mov ebx, [ar2 + length *4]
sub [ar1 + length *4], ebx
dec length
//Here you can put more folloving parts of rutine to more unrole it to speed up.
jz @exit
mov ebx, [ar2 + length *4]
sub [ar1 + length *4], ebx
dec length
//
jnz @Loop
@exit:
pop ebx
end;
begin
SubArray(Array1[0], Array2[0], length(Array1));
它可以更快......
编辑:使用SIMD说明添加了程序。
此过程请求SSE CPU支持。它可以在XMM寄存器中取4个整数并一次减去。也可以使用movdqa
代替movdqu
它更快,但您必须首先确保16字节对齐。您也可以像我的第一个asm案例一样解除XMM标准。 (我对速度测量感兴趣。:))
var
array1, array2: array of integer;
procedure SubQIntArray(var ar1, ar2; length: integer);
asm
//prepare length if not rounded to 4
push ecx
shr length, 2
jz @LengthToSmall
@Loop:
movdqu xmm1, [ar1] //or movdqa but ensure 16b aligment first
movdqu xmm2, [ar2] //or movdqa but ensure 16b aligment first
psubd xmm1, xmm2
movdqu [ar1], xmm1 //or movdqa but ensure 16b aligment first
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@LengthToSmall:
pop ecx
push ebx
and ecx, 3
jz @Exit
mov ebx, [ar2]
sub [ar1], ebx
dec ecx
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec ecx
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
begin
//Fill arrays first!
SubQIntArray(Array1[0], Array2[0], length(Array1));
答案 2 :(得分:5)
在这个简单的案例中,我对速度优化非常好奇。 所以我制作了6个简单的程序,测量数据块大小为100000的CPU时钟和时间;
检查图片和代码的结果以获取更多信息。
要获得16字节的内存对齐,请首先在文件'FastMM4Options.inc'指令中添加点{$ .define Align16Bytes} !强>
program SubTest;
{$APPTYPE CONSOLE}
uses
//In file 'FastMM4Options.inc' delite the dot in directive {$.define Align16Bytes}
//to get 16 byte memory alignment!
FastMM4,
windows,
SysUtils;
var
Ar1 :array of integer;
Ar2 :array of integer;
ArLength :integer;
StartTicks :int64;
EndTicks :int64;
TicksPerMicroSecond :int64;
function GetCpuTicks: int64;
asm
rdtsc
end;
{$R+}
{$Q+}
procedure SubArPasRangeOvfChkOn(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
{$R-}
{$Q-}
procedure SubArPas(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
procedure SubArAsm(var ar1, ar2; length: integer);
asm
//Length must be > than 0!
push ebx
lea ar1, ar1 - 4
lea ar2, ar2 - 4
@Loop:
mov ebx, [ar2 + length * 4]
sub [ar1 + length * 4], ebx
dec length
jnz @Loop
@exit:
pop ebx
end;
procedure SubArAsmSimdU(var ar1, ar2; length: integer);
asm
//Prepare length
push length
shr length, 2
jz @Finish
@Loop:
movdqu xmm1, [ar1]
movdqu xmm2, [ar2]
psubd xmm1, xmm2
movdqu [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
push ebx
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdA(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8
add ar2, 8
//Prepare length for 16 byte data transfer
push length
shr length, 2
jz @Finish
@Loop:
movdqa xmm1, [ar1]
movdqa xmm2, [ar2]
psubd xmm1, xmm2
movdqa [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8 //Align pointer to 16 byte
add ar2, 8 //Align pointer to 16 byte
//Prepare length for 16 byte data transfer
push length
shr length, 5 //8 * 4 subtructions per loop
jz @Finish //To small for LoopUnrolled
@LoopUnrolled:
//Unrolle 1, 2, 3, 4
movdqa xmm4, [ar2]
movdqa xmm5, [16 + ar2]
movdqa xmm6, [32 + ar2]
movdqa xmm7, [48 + ar2]
//
movdqa xmm0, [ar1]
movdqa xmm1, [16 + ar1]
movdqa xmm2, [32 + ar1]
movdqa xmm3, [48 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [48 + ar1], xmm3
movdqa [32 + ar1], xmm2
movdqa [16 + ar1], xmm1
movdqa [ar1], xmm0
//Unrolle 5, 6, 7, 8
movdqa xmm4, [64 + ar2]
movdqa xmm5, [80 + ar2]
movdqa xmm6, [96 + ar2]
movdqa xmm7, [112 + ar2]
//
movdqa xmm0, [64 + ar1]
movdqa xmm1, [80 + ar1]
movdqa xmm2, [96 + ar1]
movdqa xmm3, [112 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [112 + ar1], xmm3
movdqa [96 + ar1], xmm2
movdqa [80 + ar1], xmm1
movdqa [64 + ar1], xmm0
//
add ar1, 128
add ar2, 128
dec length
jnz @LoopUnrolled
@FinishUnrolled:
pop length
and length, $1F
//Do rest, up to 31 subtractions...
@Finish:
mov ebx, [ar2]
sub [ar1], ebx
add ar1, 4
add ar2, 4
dec length
jnz @Finish
@Exit:
pop ebx
end;
procedure WriteOut(EndTicks: Int64; Str: string);
begin
WriteLn(Str + IntToStr(EndTicks - StartTicks)
+ ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
end;
begin
ArLength := 100000;
//Set TicksPerMicroSecond
QueryPerformanceFrequency(TicksPerMicroSecond);
TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
//
SetLength(Ar1, ArLength);
SetLength(Ar2, ArLength);
//Fill arrays
//...
//Tick time info
WriteLn('CPU ticks per mikro second: ' + IntToStr(TicksPerMicroSecond));
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
//Test 1
SubArPasRangeOvfChkOn(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
//Test 2
SubArPas(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
//Test 3
SubArAsm(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
//Test 4
SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
//Test 5
SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
//Test 6
SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
//
ReadLn;
Ar1 := nil;
Ar2 := nil;
end.
...
最快的asm程序,8次展开的SIMD指令只需68us,比Pascal程序快4倍。
正如我们所看到的那样,Pascal循环过程可能并不重要,只需要大约277us(溢出和范围检查)在2,4GHz CPU上进行100000次减法。
所以这段代码不能成为瓶颈?
答案 3 :(得分:4)
我不是汇编专家,但我认为如果不考虑SIMD指令或并行处理,以下几乎是最优的,后者可以通过将部分数组传递给函数来轻松完成。
像
Thread1:SubArray(ar1 [0],ar2 [0],50);
Thread2:SubArray(ar1 [50],ar2 [50],50);
procedure SubArray(var Array1, Array2; const Length: Integer);
var
ap1, ap2 : PInteger;
i : Integer;
begin
ap1 := @Array1;
ap2 := @Array2;
i := Length;
while i > 0 do
begin
ap1^ := ap1^ - ap2^;
Inc(ap1);
Inc(ap2);
Dec(i);
end;
end;
// similar assembly version
procedure SubArrayEx(var Array1, Array2; const Length: Integer);
asm
// eax = @Array1
// edx = @Array2
// ecx = Length
// esi = temp register for array2^
push esi
cmp ecx, 0
jle @Exit
@Loop:
mov esi, [edx]
sub [eax], esi
add eax, 4
add edx, 4
dec ecx
jnz @Loop
@Exit:
pop esi
end;
procedure Test();
var
a1, a2 : array of Integer;
i : Integer;
begin
SetLength(a1, 3);
a1[0] := 3;
a1[1] := 1;
a1[2] := 2;
SetLength(a2, 3);
a2[0] := 4;
a2[1] := 21;
a2[2] := 2;
SubArray(a1[0], a2[0], Length(a1));
for i := 0 to Length(a1) - 1 do
Writeln(a1[i]);
Readln;
end;
答案 4 :(得分:2)
这不是你问题的真正答案,但是我会研究是否可以在某个时间进行减法,而用数值填充数组。我甚至可以选择在内存中考虑第三个数组来存储减法的结果。在现代计算中,内存的“成本”远低于对内存执行额外操作所花费的时间的“成本”。
理论上,当值仍然在寄存器或处理器缓存中时,您可以获得至少一点性能,但实际上您可能会偶然发现一些可以提高整个算法性能的技巧