我需要一些提示,了解如何用 SSE2 汇编(32 位)实现这个 Delphi 函数。也欢迎其他优化建议。也许有人可以告诉我可以使用哪些指令,这样我就有了进一步阅读的起点。
实际值:
const Precision = 10000; // fixed-point scale: a weight of Precision means 1.0

// Combines Size BGRA pixels into a single pixel. Each pixel's four channel
// bytes are multiplied by that pixel's weight, summed per channel, and the
// sums are divided (truncating) by Precision.
//
// Pixels  - pointer to Size * 4 bytes, channel order B, G, R, A.
// Weights - pointer to Size weights, one per pixel, each 0..Precision.
// Size    - number of pixels; typically 10 to 50. Size = 0 yields 0.
//
// Returns the combined pixel packed as A shl 24 or R shl 16 or G shl 8 or B.
function TFilter.Combine(Pixels: PByte; Weights: PCardinal; const Size: Cardinal): Cardinal;
var
  i, R, G, B, A: Cardinal;
begin
  // Start from zero and run the loop Size times. The previous version read
  // the first pixel unconditionally and looped "1 to Size - 1", which reads
  // out of bounds and wraps the Cardinal loop limit when Size = 0.
  B := 0;
  G := 0;
  R := 0;
  A := 0;
  for i := 1 to Size do
  begin
    Inc(B, Pixels^ * Weights^); Inc(Pixels);
    Inc(G, Pixels^ * Weights^); Inc(Pixels);
    Inc(R, Pixels^ * Weights^); Inc(Pixels);
    Inc(A, Pixels^ * Weights^); Inc(Pixels);
    Inc(Weights); // one weight per 4-byte pixel
  end;
  // Scale the fixed-point sums back to channel values.
  B := B div Precision;
  G := G div Precision;
  R := R div Precision;
  A := A div Precision;
  Result := (A shl 24) or (R shl 16) or (G shl 8) or B;
end;
预期:
function TFilter.Combine(Pixels: PByte; Weights: PCardinal; const Size: Cardinal): Cardinal;
asm
// Placeholder: the fast SSE2 implementation of the weighted pixel sum goes here.
// NOTE(review): the asm body is empty, so nothing sets the return value yet;
// this stub only shows the desired signature and is not usable as-is.
end;
答案 0 :(得分:10)
一个相当直接的实现。我修改了你的函数原型——改为普通函数(而不是对象方法)。
这段代码比逐字节处理的函数快 3 倍(在较旧的 Athlon XP 2.2 GHz 上,对 256 个元素的数组迭代 1000000 次耗时 1500 毫秒,约 0.7 GB/秒)。
// Combines Size BGRA pixels into one pixel using SSE2: every channel byte is
// multiplied by its pixel's weight, the products are summed per channel, and
// the sums are divided by Precision and truncated, like the Pascal "div".
//
// Pixels  - pointer to Size * 4 bytes, channel order B, G, R, A.
// Weights - pointer to Size 32-bit integer weights, one per pixel (0..10000).
// Size    - number of pixels. Size = 0 returns 0 without touching memory.
//
// Returns the combined pixel packed as A shl 24 or R shl 16 or G shl 8 or B.
// Each channel is saturated to 0..255 by the final pack instructions.
//
// Sums are accumulated in single precision, so they are exact only while a
// channel's weighted sum stays below 2^24 (always true when the weights add
// up to at most 10000).
function Combine(Pixels: PByte; Weights: PInteger; const Size: Cardinal): Integer;
//x86, register calling convention - three parameters in EAX, EDX, ECX
const
  // Must equal the fixed-point scale of the weights (a weight of 10000 = 1.0).
  Precision: Single = 10000.0;
asm
  pxor      XMM6, XMM6        // zero constant used for unpacking
  pxor      XMM4, XMM4        // zero the four channel accumulators
  test      ecx, ecx          // Size = 0: skip the loop, the result is 0
  jz        @@finish
@@cycle:
  movd      XMM1, [eax]       // load one pixel (4 channel bytes)
  movss     XMM3, [edx]       // load its weight (raw 32-bit integer bits)
  punpcklbw XMM1, XMM6        // bytes to words
  shufps    XMM3, XMM3, 0     // broadcast the weight to all 4 lanes
  punpcklwd XMM1, XMM6        // words to 32-bit ints
  cvtdq2ps  XMM2, XMM3        // weight ints to singles
  cvtdq2ps  XMM0, XMM1        // channel ints to singles
  mulps     XMM0, XMM2        // data * weight
  addps     XMM4, XMM0        // accum = accum + data * weight
  add       eax, 4            // next pixel
  add       edx, 4            // next weight
  dec       ecx               // dec/jnz instead of the slow LOOP instruction
  jnz       @@cycle
@@finish:
  movss     XMM5, Precision
  shufps    XMM5, XMM5, 0     // 4 x precision constant
  divps     XMM4, XMM5        // accum / precision
  cvttps2dq XMM2, XMM4        // truncate singles to ints, matching "div"
  packssdw  XMM2, XMM2        // 32-bit ints to signed 16-bit words (saturating)
  packuswb  XMM2, XMM2        // 16-bit words to unsigned bytes (saturating)
  movd      eax, XMM2         // result: B, G, R, A in the low four bytes
end;