以下代码:
while Assigned(p) do begin
np:= p^.next;
h:= leaf_hash(p^.data); <<-- inline routine
h:= h mod nhashprime;
p^.next:= nhashtab[h];
nhashtab[h]:= p;
p:= np;
end; { while }
生成以下程序集:
hlife.pas.605: h:= leaf_hash(p^.data);
00000000005D4602 498B4018 mov rax,[r8+$18]
00000000005D4606 48C1E830 shr rax,$30
00000000005D460A 498B5018 mov rdx,[r8+$18]
00000000005D460E 48C1EA20 shr rdx,$20
00000000005D4612 81E2FFFF0000 and edx,$0000ffff
00000000005D4618 4D8B5818 mov r11,[r8+$18]
00000000005D461C 49C1EB10 shr r11,$10
00000000005D4620 4181E3FFFF0000 and r11d,$0000ffff
00000000005D4627 418B7018 mov esi,[r8+$18]
00000000005D462B 81E6FFFF0000 and esi,$0000ffff
00000000005D4631 488D34F6 lea rsi,[rsi+rsi*8]
00000000005D4635 4403DE add r11d,esi
00000000005D4638 4F8D1CDB lea r11,[r11+r11*8]
00000000005D463C 4103D3 add edx,r11d
00000000005D463F 488D14D2 lea rdx,[rdx+rdx*8]
00000000005D4643 03C2 add eax,edx
hlife.pas.606: h:= h mod nhashprime;
00000000005D4645 8BC0 mov eax,eax <<--- Why is there a NOP here?
00000000005D4647 4C63DB movsxd r11,rbx
00000000005D464A 4899 cwd
00000000005D464C 49F7FB idiv r11
00000000005D464F 488BC2 mov rax,rdx
hlife.pas.607: p^.next:= nhashtab[h];
00000000005D4652 488B5538 mov rdx,[rbp+$38]
Delphi在nhashtab[h]:= p;
行之前插入NOP。
如果leaf_hash
函数是正常函数,则它是有意义的
(不是真的因为RET仍会返回[5D4645]执行nop)
但现在这不是跳跃目标。
所以我(只是)很好奇,为什么会这样做?
[编辑]:SSCCE
好的,我已经有了一个SSCCE {它不是很短,但它必须这样做。
请注意编译器设置(Debug + Win64)
unit Unit16;
interface
uses
Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls;
type
pnode = ^node;
tflavour = (tnode, tleaf, tleaf64);
node = record
case flavour: tflavour of
tnode: (next: pnode; (* hash link *)
nw, ne, sw, se: pnode; (* constant; nw not= 0 means nonleaf *)
res: pnode); (* cache *)
tleaf: (next1: pnode; (* hash link *)
isnode: pnode; (* must always be zero for leaves *)
nw1, ne1, sw1, se1: word; (* constant *)
res1, res2: word; (* constant *)
);
tleaf64: (next2: pnode; (* hash link *)
isnode1: pnode; (* must always be zero for leaves *)
data: Uint64; (* constant *)
res1a, res2a: word; (* constant *)
)
end;
ppnode = array of pnode;
THashBox = class(TPersistent)
strict private
leafhashpop: integer;
leafhashlimit: integer;
leafhashprime: integer;
leafhashtab: ppnode;
nodehashpop: integer;
nodehashlimit: integer;
nodehashprime: integer;
nodehashtab: ppnode;
private
TotalTime, Occurrences: Uint64;
StartTime, EndTime: Uint64;
procedure resize_leaves();
public
constructor Create;
end;
TForm16 = class(TForm)
Button1: TButton;
procedure Button1Click(Sender: TObject);
private
HashBox: THashBox;
public
end;
var
Form16: TForm16;
implementation
{$R *.dfm}
const
maxmem = 2000*1000*1000; {2GB}
var
alloced: cardinal;
function rdtsc: int64; assembler;
asm
{ xor eax,eax;
push rbx
cpuid
pop rbx }
rdtsc
end;
function node_hash(n: pnode): cardinal; { inline; } assembler; overload;
// var
// a: pnativeint;
// begin
// Result:= nativeint(n^.se) + 3 * (nativeint(n^.sw) + 3 * (nativeint(n^.ne) + 3 * nativeint(n^.nw) + 3));
asm
mov eax,[rcx+node.nw]
lea eax,[eax+eax*2+3]
add eax,[rcx+node.ne]
lea eax,[eax+eax*2]
add eax,[rcx+node.sw]
lea eax,[eax+eax*2]
add eax,[rcx+node.se]
end;
function leaf_hash(a, b, c, d: cardinal): cardinal; inline; overload;
begin
Result:= (d + 9 * (c + 9 * (b + 9 * a)))
end;
function leaf_hash(data: Uint64): cardinal; inline; overload;
begin
// Result:= d + 9 * (c + 9 * (b + 9 * a));
Result:= ((data shr 48) + 9 * (((data shr 32) and $FFFF) + 9 * (((data shr 16) and $FFFF) + 9 * (data and $FFFF))));
Inc(Result);
end;
procedure TForm16.Button1Click(Sender: TObject);
begin
HashBox:= THashBox.Create;
Hashbox.resize_leaves;
end;
function nextprime(old: integer): integer;
begin
Result:= 1009;
end;
constructor THashBox.Create;
begin
leafhashprime:= 7;
SetLength(leafhashtab, leafhashprime);
end;
procedure THashBox.resize_leaves();
var
i, i1, i2: integer;
nhashprime: Cardinal;
p: pnode;
nhashtab: ppnode;
np: pnode;
h: Integer;
n, n2: integer;
diff1, diff2: integer;
begin
nhashprime:= nextprime(4 * leafhashprime);
if (nhashprime * sizeof(pnode) > maxmem - alloced) then begin
leafhashlimit:= 2000 * 1000 * 1000;
exit;
end;
(*
* Don't let the hash table buckets take more than 4% of the
* memory. If we're starting to strain memory, let the buckets
* fill up a bit more.
*)
if (nhashprime > maxmem div 100) then begin
nhashprime:= nextprime(maxmem div 100);
if (nhashprime = leafhashprime) then begin
leafhashlimit:= 2000 * 1000 * 1000;
exit;
end;
end;
SetLength(nhashtab, nhashprime); //make a new table, do not resize the existing one.
alloced:= alloced + sizeof(pnode) * (nhashprime - leafhashprime);
diff1:= maxint;
for i1:= 0 to 100 do begin
n:= 0;
StartTime:= rdtsc;
for i:= 0 to leafhashprime - 1 do begin
p:= leafhashtab[i];
if Assigned(p) then begin
h:= node_hash(p);
h:= h mod nhashprime;
inc(n, h);
end;
end;
EndTime:= rdtsc;
if ((EndTime - StartTime) < diff1) then diff1:= (EndTime - StartTime);
end;
diff2:= maxint;
for i1:= 0 to 100 do begin
n2:= 0;
StartTime:= rdtsc;
for i:= 0 to leafhashprime - 1 do begin
p:= leafhashtab[i];
if Assigned(p) then begin
inc(n2);
end;
end;
EndTime:= rdtsc;
if (endtime - starttime) < diff2 then diff2:= endtime - starttime;
end;
TotalTime:= diff1 - diff2;
if n <> n2 then Occurrences:= nhashprime;
for i:= 0 to leafhashprime - 1 do begin
// for (p=hashtab[i]; p;) {
p:= leafhashtab[i];
while Assigned(p) do begin <<--- put a breakpoint here
np:= p^.next;
h:= leaf_hash(p^.data);
h:= h mod nhashprime;
p^.next:= nhashtab[h];
nhashtab[h]:= p;
p:= np;
end; { while }
end; { for i }
// free(hashtab);
leafhashtab:= nhashtab;
leafhashprime:= nhashprime;
leafhashlimit:= leafhashprime;
end;
end.
你会看到这个反汇编:
Unit16.pas.196: h:= h mod nhashprime;
000000000059CE4B 4863C0 movsxd rax,rax
000000000059CE4E 448B5528 mov r10d,[rbp+$28]
000000000059CE52 458BD2 mov r10d,r10d <<--- weird NOP here
000000000059CE55 4899 cwd
000000000059CE57 49F7FA idiv r10
000000000059CE5A 488BC2 mov rax,rdx
Unit16.pas.197: p^.next:= nhashtab[h];
000000000059CE5D 488B5538 mov rdx,[rbp+$38]
答案 0 :(得分:5)
答案是
MOV EAX,EAX
不是无操作
在x64上操作64位寄存器的低32位将使高32位归零 因此,上述说明应该理解为:
MOVZX RAX,EAX
根据AMD
32位操作的结果隐式零扩展为64位值。
答案 1 :(得分:4)
恕我直言,这不是nop
的对象,但听起来就像未经优化的生成代码,以及错误的自己变量签名。
h:= h mod nhashprime;
可分为:
mov eax,eax new h = eax, old h = eax // which does not mean anything
movsxd r11,rbx convert with sign nhashprime stored in rbx into temp registry r11
cwd signed rax into rdx:rax
idiv r11 signed divide rdx:rax by r11 -> rax=quotient, rdx=remainder
mov rax,rdx store remainder rdx into rax
您是否尝试启用代码生成优化?我想它会修复mov eax,eax
的内容。
但您的原始代码也进行了次优化。 在你的情况下你应该使用无符号算术。
并且,你应该更好地使用两个nhashprime
的幂,计算一个简单的and
二进制运算而不是慢速除法:
var h, nhashprimeMask: cardinal; // use UNSIGNED arithmetic here!
// here nhashprime is a POWER OF TWO (128,256,512,1024,2048...)
nhashprimeMask := nhashprime-1; // compute the mask
while Assigned(p) do begin
np:= p^.next;
h:= leaf_hash(p^.data) and nhashprimeMask;
p^.next:= nhashtab[h];
nhashtab[h]:= p;
p:= np;
end; { while }
这段代码会更快,并且应该更好地编译。
答案 2 :(得分:0)
这是一种优化,用于对齐代码,尤其是在循环中,以避免缓存行停滞等。