我正在尝试在 Delphi(XE6)中,把两个 64 位的 UInt64 整数加载到一个 128 位的 xmm 寄存器中。
XMM寄存器为128位,可以加载多个独立的整数。然后,您可以让CPU并行地添加这些多个整数。
例如,您可以分别使用四个UInt32加载xmm0和xmm1,然后让CPU同时添加所有四对。
xmm0: $00001000 $00000100 $00000010 $00000001
+ + + +
xmm1: $00002000 $00000200 $00000020 $00000002
= = = =
xmm0: $00003000 $00000300 $00000030 $00000003
在加载xmm0和xmm1之后,您可以使用以下指令对这四对整数同时进行加法运算:
paddd xmm0, xmm1 //Add packed 32-bit integers (i.e. xmm0 := xmm0 + xmm1)
您还可以使用8 x 16位整数进行操作:
xmm0: $001F $0013 $000C $0007 $0005 $0003 $0002 $0001
+ + + + + + + +
xmm1: $0032 $001F $0013 $000C $0007 $0005 $0003 $0002
= = = = = = = =
xmm0: $0051 $0032 $001F $0013 $000C $0007 $0005 $0003
使用的指令是:
paddw xmm0, xmm1 //Add packed 16-bit integers
要将两个64位整数加载到xmm寄存器中,您必须使用以下两条指令之一:movdqa(要求内存操作数按16字节对齐)或 movdqu(不要求对齐)。
在这个简单的示例中,我们不去操心UInt64是否对齐,只使用不要求对齐(unaligned)的版本(movdqu)。
我们要处理的第一件事是:Delphi编译器知道 movdqu 需要一个128位的操作数来加载——它加载的是 double quadwords(双四字)。
为此,我们将创建一个128位结构,这也使我们能够很好地处理两个64位值:
// 128-bit record built from two 64-bit fields; it gives movdqu an operand of
// the size it expects and exposes the two UInt64 lanes individually.
TDoubleQuadword = packed record
v1: UInt64; //value 1
v2: UInt64; //value 2
end;
现在我们可以在测试控制台应用程序中使用此类型:
// Test routine: adds the two 64-bit lanes of x and y with a single paddq and
// prints both sums. x and y are record-typed locals (not pointers), so they are
// used directly as the 128-bit memory operands of movdqu.
procedure Main;
var
x, y: TDoubleQuadword;
begin
//[1,5] + [2,7] = ?
x.v1 := $0000000000000001;
x.v2 := $0000000000000005;
y.v1 := $0000000000000002;
y.v2 := $0000000000000007;
asm
movdqu xmm0, x //move unaligned double quadwords (xmm0 := x)
movdqu xmm1, y //move unaligned double quadwords (xmm1 := y)
paddq xmm0, xmm1 //add packed quadword integers (xmm0 := xmm0 + xmm1)
movdqu x, xmm0 //move unaligned double quadwords (x := xmm0)
end;
// Prints the two lane sums separated by ', '
WriteLn(IntToStr(x.v1)+', '+IntToSTr(x.v2));
end;
这可行,打印出来:
3, 12
为了最终能让x和y对齐(但这并不是我问题的必要部分),假设我们有一个指向 TDoubleQuadword 结构的指针(pointer):
// 128-bit record built from two 64-bit fields, plus a typed pointer to it.
TDoubleQuadword = packed record
v1: UInt64; //value 1
v2: UInt64; //value 2
end;
// Pointer to a TDoubleQuadword; used below for heap-allocated values.
PDoubleQuadword = ^TDoubleQuadword;
现在我们把这个假想的测试函数改为使用PDoubleQuadword:
// Pointer-based variant of the test routine. x and y are now pointers to
// heap-allocated TDoubleQuadword values, and the asm block still names them
// directly as movdqu operands. This snippet is shown as the failing case: the
// surrounding text reports E2107 (Operand size mismatch) on the first movdqu.
// The memory obtained from GetMemory is not released in this snippet.
procedure AlignedStuff;
var
x, y: PDoubleQuadword;
begin
x := GetMemory(sizeof(TDoubleQuadword));
x.v1 := $0000000000000001;
x.v2 := $0000000000000005;
y := GetMemory(sizeof(TDoubleQuadword));
y.v1 := $0000000000000002;
y.v2 := $0000000000000007;
asm
movdqu xmm0, x //intended: move unaligned double quadwords (xmm0 := x^)
movdqu xmm1, y //intended: move unaligned double quadwords (xmm1 := y^)
paddq xmm0, xmm1 //add packed quadword integers (xmm0 := xmm0 + xmm1)
movdqu x, xmm0 //intended: move unaligned double quadwords (x^ := xmm0)
end;
WriteLn(IntToStr(x.v1)+', '+IntToSTr(x.v2));
end;
现在,它无法编译,这是有道理的:
movdqu xmm0, x //E2107 Operand size mismatch
这很有道理:movdqu 的操作数 x 必须是128位,而编译器知道 x 实际上只是一个(32位)指针。
现在进入我的问题:那到底应该怎么写?我随意地在键盘上敲出各种写法,希望编译器之神能够接受我显而易见的意图。但是全都行不通。
//Don't try to pass the 32-bit pointer itself, pass the thing it points to:
movdqu xmm0, x^ //E2107 Operand size mismatch
//Try casting it
movdqu xmm0, TDoubleQuadword(x^) //E2105 Inline assembler error
//i've seen people using square brackets to mean "contents of":
movdqu xmm0, [x] //E2107 Operand size mismatch
现在我们放弃理性思考
movdqu xmm0, Pointer(x)
movdqu xmm0, Addr(x^)
movdqu xmm0, [Addr(x^)]
movdqu xmm0, [Pointer(TDoubleQuadword(x))^]
我确实让其中一种写法通过了编译:
movdqu xmm0, TDoubleQuadword(x)
但这当然是把x本身(也就是指针的值,即地址)加载到寄存器中,而不是x所指向的值。
所以我放弃了。
// Complete console program for the question: two heap-allocated 16-element
// UInt64 vectors, where elements 4 and 5 of each are meant to be added as one
// 128-bit packed operation.
program Project3;
{$APPTYPE CONSOLE}
{$R *.res}
uses
System.SysUtils;
type
// 128-bit record built from two 64-bit fields (operand type for movdqu).
TDoubleQuadword = packed record
v1: UInt64; //value 1
v2: UInt64; //value 2
end;
PDoubleQuadword = ^TDoubleQuadword;
// 16 x UInt64 = 128 bytes; accessed through the pointer type below.
TVectorUInt64 = array[0..15] of UInt64;
PVectorUInt64 = ^TVectorUInt64;
// Allocates two vectors, sets elements 4 and 5 of each, attempts the packed
// add in inline assembly, and prints x[4] and x[5].
// The GetMemory allocations are not released in this routine.
procedure AlignedStuff;
var
x, y: PVectorUInt64;
begin
x := GetMemory(sizeof(TVectorUInt64));
//x[0] := ...
//x[1] := ...
// ...
//x[3] := ...
x[4] := $0000000000000001;
x[5] := $0000000000000005;
y := GetMemory(sizeof(TVectorUInt64));
//y[0] := ...
//y[1] := ...
// ...
//y[3] := ...
y[4] := $0000000000000002;
y[5] := $0000000000000007;
// NOTE(review): x and y are pointer-typed locals. Whether the casts below make
// the inline assembler dereference them (rather than address the pointer
// variable itself) is exactly what the question asks - verify before relying on it.
asm
movdqu xmm0, TDoubleQuadword(x[4]) //intended: xmm0 := x[4],x[5] (unaligned double quadword load)
movdqu xmm1, TDoubleQuadword(y[4]) //intended: xmm1 := y[4],y[5] (unaligned double quadword load)
paddq xmm0, xmm1 //add packed quadword integers (xmm0 := xmm0 + xmm1)
movdqu TDoubleQuadword(x[4]), xmm0 //intended: x[4],x[5] := xmm0 (unaligned double quadword store)
end;
WriteLn(IntToStr(x[4])+', '+IntToSTr(x[5]));
end;
begin
try
AlignedStuff;
// Keep the console window open until the user presses Enter
Writeln('Press enter to close...');
Readln;
except
// Report any exception as "ClassName: Message"
on E: Exception do
Writeln(E.ClassName, ': ', E.Message);
end;
end.
我之所以针对指针提问,原因如下:
假设我有一段只涉及 UInt64 加法的代码示例:
TVectorUInt64 = array[0..15] of UInt64;
PVectorUInt64 = ^TVectorUInt64;
var
v: PVectorUInt64;
begin
v := GetMemoryAligned(sizeof(TVectorUInt64), 64); //64-byte alignment
//v is initalized
for i := 0 to 15 do
begin
v[0] := v[0] + v[4];
v[1] := v[1] + v[5];
v[2] := v[2] + v[6];
v[3] := v[3] + v[7];
//..and some more changes to v0..v3
//..and some more changes to v12..v15
v[8] := v[8] + v[12];
v[9] := v[9] + v[13];
v[10] := v[10] + v[14];
v[11] := v[11] + v[15];
//...and some more changes to v4..v7
v[0] := v[0] + v[4];
v[1] := v[1] + v[5];
v[2] := v[2] + v[6];
v[3] := v[3] + v[7];
//...and some more changes to v0..v3
//...and some more changes to v12..v15
v[8] := v[8] + v[12];
v[9] := v[9] + v[13];
v[10] := v[10] + v[14];
v[11] := v[11] + v[15];
//...and some more changes to v4..v7
v[0] := v[0] + v[4];
v[1] := v[1] + v[5];
v[2] := v[2] + v[6];
v[3] := v[3] + v[7];
//..and some more changes to v0..v3
//..and some more changes to v12..v15
v[8] := v[8] + v[12];
v[9] := v[9] + v[13];
v[10] := v[10] + v[14];
v[11] := v[11] + v[15];
//...and some more changes to v4..v7
v[0] := v[0] + v[4];
v[1] := v[1] + v[5];
v[2] := v[2] + v[6];
v[3] := v[3] + v[7];
//...and some more changes to v0..v3
//...and some more changes to v12..v15
v[8] := v[8] + v[12];
v[9] := v[9] + v[13];
v[10] := v[10] + v[14];
v[11] := v[11] + v[15];
//...and some more changes to v4..v7
end;
从概念上讲,我想把代码改成:
//v[0] := v[0] + v[4];
//v[1] := v[1] + v[5];
asm
movdqu xmm0, v[0]
movdqu xmm1, v[4]
paddq xmm0, xmm1
movdqu v[0], xmm0
end
//v[2] := v[2] + v[6];
//v[3] := v[3] + v[7];
asm
movdqu xmm0, v[2]
movdqu xmm1, v[6]
paddq xmm0, xmm1
movdqu v[2], xmm0
end
//v[8] := v[8] + v[12];
//v[9] := v[9] + v[13];
asm
movdqu xmm0, v[8]
movdqu xmm1, v[12]
paddq xmm0, xmm1
movdqu v[8], xmm0
end
//v[10] := v[10] + v[14];
//v[11] := v[11] + v[15];
asm
movdqu xmm0, v[10]
movdqu xmm1, v[14]
paddq xmm0, xmm1
movdqu v[10], xmm0
end
诀窍是让Delphi编译器接受它。
我原以为 [contentsOfSquareBrackets] 这种写法会起作用。
使用David的解决方案(带有函数调用开销)得到的性能“提升”是 -7%,也就是反而变慢了(算法吞吐量 90 MB/s -> 83 MB/s)。
在XE6编译器中,下面这种概念上的写法似乎是有效的:
movdqu xmm0, TPackedQuadword
但是编译器还没有聪明到允许您使用下面这种概念上的写法:
movdqu xmm0, PPackedQuadword^
或者与之等价的写法。
如果这就是答案,请不要害怕。坦然接受它,并把它写成答案:
“编译器不支持在 asm 块中对指针解引用,无论您使用尖号(^)还是方括号([...])都不行。就是做不到。”
如果情况并非如此,即编译器确实可以在 asm 块中支持指针,那么请把具体做法作为答案发布。
答案 0 :(得分:3)
Delphi中的内联汇编器文档不如应有的那样全面,许多功能根本没有文档记录。因此我不敢完全肯定,但据我所知,您想写的那种汇编语句——其中一个操作数是指针类型的局部变量——根本不受支持。
我强烈建议您避免在同一个函数中混用Pascal代码和汇编代码。在同一个函数内的Pascal代码和汇编代码之间来回切换时,很难生成高效的代码,也很难管理寄存器的使用。
我个人的原则是禁止混用Pascal和内联汇编,始终编写纯汇编函数。例如,对于32位代码,您可以编写一个像这样的完整程序:
POST <your_index_name>/_search
{
"query": {
"bool": {
"must": [
{
"script": {
"script": """
if(doc['books.name'].value=="book1" && doc['books.sale'].value==true)
return true;
"""
}
}
]
}
}
}
该程序输出:
3, 12
或者您可以使用带有运算符的记录:
{$APPTYPE CONSOLE}
type
PDoubleQuadword = ^TDoubleQuadword;
TDoubleQuadword = record
v1: UInt64;
v2: UInt64;
end;
// Returns the lane-wise sum of two TDoubleQuadword values: both 64-bit fields
// are added with one paddq.
// Pure assembler function written for 32-bit code: the body relies on EAX and
// EDX holding the addresses of dqw1 and dqw2, and on ECX holding the address
// where the result record must be written.
// NOTE(review): these register assignments are specific to the 32-bit register
// calling convention; do not compile this unchanged for a 64-bit target.
function AddDoubleQuadword(const dqw1, dqw2: TDoubleQuadword): TDoubleQuadword;
asm
movdqu xmm0, [eax] // xmm0 := 16 bytes at [eax] (unaligned load)
movdqu xmm1, [edx] // xmm1 := 16 bytes at [edx] (unaligned load)
paddq xmm0, xmm1 // add packed quadwords: xmm0 := xmm0 + xmm1
movdqu [ecx], xmm0 // store the 16-byte sum at [ecx] (unaligned store)
end;
// Allocates two TDoubleQuadword values on the heap, adds them lane-wise through
// AddDoubleQuadword, and prints both 64-bit sums as "v1, v2".
// Both records are released before returning, even if an exception is raised.
procedure AlignedStuff;
var
  x, y: PDoubleQuadword;
begin
  y := nil; // so the finally block can tell whether New(y) was reached
  New(x);
  try
    x.v1 := $0000000000000001;
    x.v2 := $0000000000000005;
    New(y);
    y.v1 := $0000000000000002;
    y.v2 := $0000000000000007;
    x^ := AddDoubleQuadword(x^, y^);
    Writeln(x.v1, ', ', x.v2);
  finally
    // Every New must be paired with a Dispose; without this both records leak.
    if y <> nil then
      Dispose(y);
    Dispose(x);
  end;
end;
begin
AlignedStuff;
Readln;
end.
然后在调用处就可以直接写 x^ := x^ + y^;,其中记录及其运算符的声明如下:
type
PDoubleQuadword = ^TDoubleQuadword;
// 128-bit record built from two 64-bit fields, with an overloaded "+" operator
// so that two values can be added with ordinary Pascal syntax.
TDoubleQuadword = record
v1: UInt64;
v2: UInt64;
class operator Add(const dqw1, dqw2: TDoubleQuadword): TDoubleQuadword;
end;
// Lane-wise sum of two TDoubleQuadword values, implemented as a pure assembler
// routine. The body relies on EAX and EDX holding the addresses of dqw1 and
// dqw2, and on ECX holding the address where the result record must be written.
// NOTE(review): these register assignments are specific to the 32-bit register
// calling convention; do not compile this unchanged for a 64-bit target.
class operator TDoubleQuadword.Add(const dqw1, dqw2: TDoubleQuadword): TDoubleQuadword;
asm
movdqu xmm0, [eax] // xmm0 := 16 bytes at [eax] (unaligned load)
movdqu xmm1, [edx] // xmm1 := 16 bytes at [edx] (unaligned load)
paddq xmm0, xmm1 // add packed quadwords: xmm0 := xmm0 + xmm1
movdqu [ecx], xmm0 // store the 16-byte sum at [ecx] (unaligned store)
end;
答案 1 :(得分:1)
工作代码:
// Inline asm block for pointer-typed x and y: the pointer values are first
// copied into general-purpose registers, and the registers are then
// dereferenced as 128-bit (DQWORD) memory operands.
asm
mov eax, x // eax := value of pointer x (address of the data)
mov edx, y // edx := value of pointer y (address of the data)
movdqu xmm0, DQWORD PTR [eax] //move unaligned double quadwords (xmm0 := x^)
movdqu xmm1, DQWORD PTR [edx] //move unaligned double quadwords (xmm1 := y^)
paddq xmm0, xmm1 //add packed quadword integers (xmm0 := xmm0 + xmm1)
movdqu DQWORD PTR [eax], xmm0 //move unaligned double quadwords (x^ := xmm0)
end;
输出 IntToStr(x.v1)+', '+IntToStr(x.v2) 会打印出 3, 12