x86 assembly(MASM) - 64位整数的平方根?

时间:2012-01-10 19:02:28

标签: assembly x86 masm masm32

我正在用x86汇编语言(MASM32)为Windows编写一个简单的素性测试程序,它涉及计算(64位)整数的平方根。我的问题是:有没有简单的方法来获得平方根?我应该使用ADD / SUB / DIV / MUL指令的某些组合吗?

我发现了一些关于如何用C语言实现这一目标的information,但我只是想知道我是否在这里遗漏了什么?

6 个答案:

答案 0 :(得分:8)

我认为最简单的方法是使用FPU指令fsqrt,如下所示:

.data?
int64 dq ?
squareRoot dd ?

.code
fild int64        ;load the integer to ST(0)
fsqrt             ;compute square root and store to ST(0)
fistp squareRoot  ;store the result in memory (as a 32-bit integer) and pop ST(0)

答案 1 :(得分:2)

除了使用FSQRT指令之外,您的意思是什么?当然,它是浮点数,但它应该很容易进行转换。否则你必须使用像Newton's Method这样的东西。

答案 2 :(得分:2)

numerous algorithmstechniques用于计算数字的平方根。事实上,使用Newton-Raphson方法计算平方根是Numeric Analysis学生的标准作业,作为方程finding the root的一般情况的一个例子。

如果没有对代码进行分析和基准测试,并且知道您是否需要单个或多个平方根(SIMD计算,例如通过SSE / SSE2),我建议您从@ Smi的answer开始,它使用x87 FPU FSQRT指令,作为基线实现。这确实会导致load-store hit(快速摘要:在FPU和CPU的ALU之间移动中断缓存和管道),这可能会否定使用内置FPU指令的优势。

由于您提到了主要测试,我猜测每个候选数字仅使用一次sqrt来确定搜索范围(任何非平凡因素都在2< = f< = sqrt(n)之间,其中n是正在测试的数字)。如果你只是测试特定数字的素数,那也没关系,但是对于搜索大量数字你会为每个候选人做平方根。如果你正在做"classic" test(前椭圆曲线),可能不值得担心。

答案 3 :(得分:0)

最后,除了80386+微处理器的最后一个函数之外,我写了一个处理64位无符号整数的平方根,这是一个新的优化函数,适用于8086+微处理器,用于汇编。

我已经成功测试了所有这些例程。

它们是我编写的第一个例程(参见本文底部),但更加优化。因为很简单,我在Pascal中显示了一个16位算法的例子如下(半截面算法):

Function NewSqrt(X:Word):Word;

Var L,H,M:LongInt;

Begin
 L:=1;
 H:=X;
 M:=(X+1) ShR 1;
 Repeat
  If Sqr(M)>X Then
   H:=M
  Else
   L:=M;
  M:=(L+H) ShR 1;
 Until H-L<2;
 NewSqrt:=M;
End;

这个例程通过半截面算法工作,并支持64位无符号整数的平方根。遵循8086+ CPU新的实模式代码:

Function NewSqrt16(XLow,XHigh:LongInt):LongInt; Assembler;

Var X0,X1,X2,X3,
    H0,H1,H2,H3,
    L0,L1,L2,L3,
    M0,M1,M2,M3:Word;

Asm

     LEA   SI,XLow
     Mov   AX,[SS:SI]
     Mov   BX,[SS:SI+2]

     LEA   SI,XHigh
     Mov   CX,[SS:SI]
     Mov   DX,[SS:SI+2]
    {------------------------
     INPUT: DX:CX:BX:AX = X
     OUPUT: DX:AX= Sqrt(X).
     ------------------------
         X0    :    X1    :    X2    :    X3    -> X
         H0    :    H1    :    H2    :    H3    -> H
         L0    :    L1    :    L2    :    L3    -> L
         M0    :    M1    :    M2    :    M3    -> M
         AX    ,    BX    ,    CX    ,    DX    ,
         ES    ,    DI    ,    SI               -> Temp. reg.
     ------------------------}
     Mov   [X0],AX           {  ...}
     Mov   [X1],BX           {  ...}
     Mov   [X2],CX           {  ...}
     Mov   [X3],DX           {  Stack <- X}

     Mov   [H0],AX           {  ...}
     Mov   [H1],BX           {  ...}
     Mov   [H2],CX           {  ...}
     Mov   [H3],DX           {  Stack <- H= X}

     Mov   [L0],Word(1)      {  ...}
     Mov   [L1],Word(0)      {  ...}
     Mov   [L2],Word(0)      {  ...}
     Mov   [L3],Word(0)      {  Stack <- L= 1}

     Add   AX,1              {  ...}
     AdC   Bx,0              {  ...}
     AdC   Cx,0              {  ...}
     AdC   Dx,0              {  X= X+1}

     RCR   Dx,1              {  ...}
     RCR   Cx,1              {  ...}
     RCR   Bx,1              {  ...}
     RCR   Ax,1              {  X= (X+1)/2}

     Mov   [M0],AX           {  ...}
     Mov   [M1],BX           {  ...}
     Mov   [M2],CX           {  ...}
     Mov   [M3],DX           {  Stack <- M= (X+1)/2}
    {------------------------}
@@LoopBegin:                 {Loop restart label}

     Mov   AX,[M3]           {If M is more ...}
     Or    AX,[M2]           {... then 32 bit ...}
     JNE   @@LoadMid         {... then Square(M)>X, jump}
    {DX:AX:CX:SI= 64 Bit square(Low(M))}
     Mov   AX,[M0]           {AX <- A=Low(M)}
     Mov   CX,AX             {CX <- A=Low(M)}

     Mul   AX                {DX:AX <- A*A}

     Mov   SI,AX             {SI <- Low 16 bit of last mul.}
     Mov   BX,DX             {BX:SI <- A*A}

     Mov   AX,[M1]           {AX <- D=High(M)}
     XChg  CX,AX             {AX <- A=Low(M); CX <- D=High(M)}

     Mul   CX                {DX:AX <- A*D=Low(M)*High(M)}

     XOr   DI,DI             {...}
     ShL   AX,1              {...}
     RCL   DX,1              {...}
     RCL   DI,1              {DI:DX:AX <- A*D+D*A= 2*A*D (33 Bit}

     Add   AX,BX             {...}
     AdC   DX,0              {...}
     AdC   DI,0              {DI:DX:AX:SI <- A*(D:A)+(D*A) ShL 16 (49 Bit)}

     XChg  CX,AX             {AX <- D=High(M); CX <- Low 16 bit of last mul.}
     Mov   BX,DX             {DI:BX:CX:SI <- A*(D:A)+(D*A) ShL 16 (49 Bit)}

     Mul   AX                {DX:AX <- D*D}

     Add   AX,BX             {...}
     AdC   DX,DI             {DX:AX:CX:SI <- (D:A)*(D:A)}
    {------------------------}
     Cmp   DX,[X3]           {Compare High(Square(M)):High(X)}

@@LoadMid:                   {Load M in DX:BP:BX:DI}

     Mov   DI,[M0]           {...}
     Mov   BX,[M1]           {...}
     Mov   ES,[M2]           {...}
     Mov   DX,[M3]           {DX:ES:BX:DI <- M}

     JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))<High(X) then Square(M)<X, jump}

     Cmp   AX,[X2]           {Compare High(Square(M)):High(X)}
     JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))<High(X) then Square(M)<X, jump}

     Cmp   CX,[X1]           {Compare High(Square(M)):High(X)}
     JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))<High(X) then Square(M)<X, jump}

     Cmp   SI,[X0]           {Compare Low(Square(M)):Low(X)}
     JA    @@SqrMIsMoreThenX {If Low(Square(M))>Low(X) then Square(M)>X, jump}
    {------------------------}
@@SqrMIsLessThenX:           {Square(M)<=X}

     Mov   [L0],DI           {...}
     Mov   [L1],BX           {...}
     Mov   [L2],ES           {...}
     Mov   [L3],DX           {L= M}

     Jmp   @@ProcessMid      {Go to process the mid value}
    {------------------------}
@@SqrMIsMoreThenX:           {Square(M)>X}

     Mov   [H0],DI           {...}
     Mov   [H1],BX           {...}
     Mov   [H2],ES           {...}
     Mov   [H3],DX           {H= M}
    {------------------------}
@@ProcessMid:                {Process the mid value}

     Mov   SI,[H0]           {...}
     Mov   CX,[H1]           {...}
     Mov   AX,[H2]           {...}
     Mov   DX,[H3]           {DX:AX:CX:SI <- H}

     Mov   DI,SI             {DI <- H0}
     Mov   BX,CX             {BX <- H1}

     Add   SI,[L0]           {...}
     AdC   CX,[L1]           {...}
     AdC   AX,[L2]           {...}
     AdC   DX,[L3]           {DX:AX:CX:SI <- H+L}

     RCR   DX,1              {...}
     RCR   AX,1              {...}
     RCR   CX,1              {...}
     RCR   SI,1              {DX:AX:CX:SI <- (H+L)/2}

     Mov   [M0],SI           {...}
     Mov   [M1],CX           {...}
     Mov   [M2],AX           {...}
     Mov   [M3],DX           {M <- DX:AX:CX:SI}
    {------------------------}
     Mov   AX,[H2]           {...}
     Mov   DX,[H3]           {DX:AX:BX:DI <- H}

     Sub   DI,[L0]           {...}
     SbB   BX,[L1]           {...}
     SbB   AX,[L2]           {...}
     SbB   DX,[L3]           {DX:AX:BX:DI <- H-L}

     Or    BX,AX             {If (H-L) >= 65536 ...}
     Or    BX,DX             {...}
     JNE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}

     Cmp   DI,2              {If (H-L) >= 2 ...}
     JAE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
    {------------------------}    
     Mov   AX,[M0]           {...}
     Mov   DX,[M1]           {@Result <- Sqrt}

End;

输入中的此函数接收64位数字(XHigh:XLow)并返回它的32位平方根。使用四个局部变量:

X, the copy of input number, subdivided in four 16 Bit packages (X3:X2:X1:X0).
H, upper limit, subdivided in four 16 Bit packages (H3:H2:H1:H0).
L, lower limit, subdivided in four 16 Bit packages (L3:L2:L1:L0).
M, mid value, subdivided in four 16 Bit packages (M3:M2:M1:M0).

将下限L初始化为1;将上限H初始化为输入数X;将中间值M初始化为(H + 1)> 1。通过验证(M3 | M2)!= 0,测试M的平方是否长于64位;如果为真,则square(M)> X,将上限H设置为M. 如果不成立,则按如下方式处理M的下32位(M1:M0)的平方:

(M1:M0)*(M1:M0)=

M0*M0+((M0*M1)<<16)+((M1*M0)<<16)+((M1*M1)<<32)=
M0*M0+((M0*M1)<<17)+((M1*M1)<<32)

如果M的低32位的平方大于X,则将上限H设置为M;如果M的低32位的平方小于或等于X值,则将下限L设置为M. 处理中间值M,将其设置为(L + H)> 1。如果(H-L)<2则M是X的平方根,否则去测试M的平方是否长于64位并运行以下指令。

这个例程通过半截面算法工作,并支持64位无符号整数的平方根。遵循80386+ CPU新保护模式代码:

Procedure NewSqrt32(X:Int64;Var Y:Int64); Assembler;

{INPUT: EDX:EAX = X
  TEMP: ECX
 OUPUT: Y = Sqrt(X)}

Asm

     Push  EBX               {Save EBX register into the stack}
     Push  ESI               {Save ESI register into the stack}
     Push  EDI               {Save EDI register into the stack}
    {------------------------}
     Mov   EAX,Y             {Load address of output var. into EAX reg.}

     Push  EAX               {Save address of output var. into the stack}
    {------------------------}
     LEA   EDX,X             {Load address of input var. into EDX reg.}
     Mov   EAX,[EDX]         {    EAX <- Low 32 Bit of input value (X)}
     Mov   EDX,[EDX+4]       {EDX:EAX <- Input value (X)}
    {------------------------
     [ESP+ 4]:[ESP   ]        -> X
      EBX    : ECX            -> M
      ESI    : EDI            -> L
     [ESP+12]:[ESP+ 8]        -> H
      EAX    , EDX            -> Temporary registers
     ------------------------}
     Mov   EDI,1             {    EDI <-  Low(L)= 1}
     XOr   ESI,ESI           {    ESI <- High(L)= 0}

     Mov   ECX,EAX           {    ECX <- Low 32 Bit of input value (X)}
     Mov   EBX,EDX           {EBX:ECX <-       M= Input value (X)}

     Add   ECX,EDI           {    ECX <- Low 32 Bit of (X+1)}
     AdC   EBX,ESI           {EBX:ECX <-       M= X+1}

     RCR   EBX,1             {    EBX <- High 32 Bit of (X+1)/2}
     RCR   ECX,1             {EBX:ECX <-       M= (X+1)/2}
    {------------------------}
     Push  EDX               {  Stack <- High(H)= High(X)}
     Push  EAX               {  Stack <-  Low(H)=  Low(X)}

     Push  EDX               {  Stack <- High(X)}
     Push  EAX               {  Stack <-  Low(X)}
    {------------------------}    
@@LoopBegin:                 {Loop restart label}

     Cmp   EBX,0             {If M is more then 32 bit ...}
     JNE   @@SqrMIsMoreThenX {... then Square(M)>X, jump}

     Mov   EAX,ECX           {EAX     <- A= Low(M)}

     Mul   ECX               {EDX:EAX <- 64 Bit square(Low(M))}

     Cmp   EDX,[ESP+4]       {Compare High(Square(M)):High(X)}
     JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))<High(X) then Square(M)<X, jump}

     Cmp   EAX,[ESP]         {Compare Low(Square(M)):Low(X)}
     JA    @@SqrMIsMoreThenX {If Low(Square(M))>Low(X) then Square(M)>X, jump}
    {------------------------}
@@SqrMIsLessThenX:           {Square(M)<=X}

     Mov   EDI,ECX           {Set Low 32 Bit of L with Low 32 Bit of M}
     Mov   ESI,EBX           {ESI:EDI <- L= M}

     Jmp   @@ProcessMid      {Go to process the mid value}
    {------------------------}
@@SqrMIsMoreThenX:           {Square(M)>X}

     Mov   [ESP+8],ECX       {Set Low 32 Bit of H with Low 32 Bit of M}
     Mov   [ESP+12],EBX      {H= M}
    {------------------------}
@@ProcessMid:                {Process the mid value}

     Mov   EAX,[ESP+8]       {EAX     <- Low 32 Bit of H}
     Mov   EDX,[ESP+12]      {EDX:EAX <- H}

     Mov   ECX,EDI           {Set Low 32 Bit of M with Low 32 Bit of L}
     Mov   EBX,ESI           {EBX:ECX <- M= L}

     Add   ECX,EAX           {Set Low 32 Bit of M with Low 32 Bit of (L+H)}
     AdC   EBX,EDX           {EBX:ECX <- M= L+H}

     RCR   EBX,1             {Set High 32 Bit of M with High 32 Bit of (L+H)/2}
     RCR   ECX,1             {EBX:ECX <- M= (L+H)/2}
    {------------------------}
     Sub   EAX,EDI           {EAX     <- Low 32 Bit of (H-L)}
     SbB   EDX,ESI           {EDX:EAX <- H-L}

     Cmp   EDX,0             {If High 32 Bit of (H-L) aren't zero: (H-L) >= 2 ...}
     JNE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}

     Cmp   EAX,2             {If (H-L) >= 2 ...}
     JAE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
    {------------------------}
     Add   ESP,16            {Remove 4 local 32 bit variables from stack}
    {------------------------}
     Pop   EDX               {Load from stack the output var. address into EDX reg.}

     Mov   [EDX],ECX         {Set Low 32 Bit of output var. with Low 32 Bit of Sqrt(X)}
     Mov   [EDX+4],EBX       {Y= Sqrt(X)}
    {------------------------}
     Pop   EDI               {Retrieve EDI register from the stack}
     Pop   ESI               {Retrieve ESI register from the stack}
     Pop   EBX               {Retrieve EBX register from the stack}

End;

我的8086+ CPU实模式例程通过半截面算法和 支持32位无符号整数的平方根;不是太快,但可能会优化:

Procedure _PosLongIMul2; Assembler;

{INPUT:

 DX:AX-> First factor (destroyed).
 BX:CX-> Second factor (destroyed).

 OUTPUT:

 BX:CX:DX:AX-> Multiplication result.

 TEMP:

 BP, Di, Si}

Asm

     Jmp   @Go

 @VR:DD    0      {COPY of RESULT     (LOW)}
     DD    0      {COPY of RESULT    (HIGH)}

 @Go:Push  BP

     Mov   BP,20H {32 Bit Op.}

     XOr   DI,DI  {COPY of first op.  (LOW)}
     XOr   SI,SI  {COPY of first op. (HIGH)}

     Mov   [CS:OffSet @VR  ],Word(0)
     Mov   [CS:OffSet @VR+2],Word(0)
     Mov   [CS:OffSet @VR+4],Word(0)
     Mov   [CS:OffSet @VR+6],Word(0)

 @01:ShR   BX,1
     RCR   CX,1

     JAE   @00

     Add   [CS:OffSet @VR  ],AX
     AdC   [CS:OffSet @VR+2],DX
     AdC   [CS:OffSet @VR+4],DI
     AdC   [CS:OffSet @VR+6],SI

 @00:ShL   AX,1
     RCL   DX,1
     RCL   DI,1
     RCL   SI,1

     Dec   BP
     JNE   @01

     Mov   AX,[CS:OffSet @VR]
     Mov   DX,[CS:OffSet @VR+2]
     Mov   CX,[CS:OffSet @VR+4]
     Mov   BX,[CS:OffSet @VR+6]

     Pop   BP

End;

Function _Sqrt:LongInt; Assembler;

{INPUT:

 Di:Si-> Unsigned integer input number X.

 OUTPUT:

 DX:AX-> Square root Y=Sqrt(X).

 TEMP:

 BX, CX}

Asm

     Jmp   @Go

 @Vr:DD    0 {+0  LOW}
     DD    0 {+4  HIGH}
     DD    0 {+8  Mid}
     DB    0 {+12 COUNT}

 @Go:Mov   [CS:OffSet @Vr],Word(0)
     Mov   [CS:OffSet @Vr+2],Word(0)

     Mov   [CS:OffSet @Vr+4],SI
     Mov   [CS:OffSet @Vr+6],DI

     Mov   [CS:OffSet @Vr+8],Word(0)
     Mov   [CS:OffSet @Vr+10],Word(0)

     Mov   [CS:OffSet @Vr+12],Byte(32)

 @02:Mov   AX,[CS:OffSet @Vr]
     Mov   DX,[CS:OffSet @Vr+2]

     Mov   CX,[CS:OffSet @Vr+4]
     Mov   BX,[CS:OffSet @Vr+6]

     Add   AX,CX
     AdC   DX,BX

     ShR   DX,1
     RCR   AX,1

     Mov   [CS:OffSet @Vr+8],AX
     Mov   [CS:OffSet @Vr+10],DX

     Mov   CX,AX
     Mov   BX,DX

     Push  DI
     Push  SI

     Call  _PosLongIMul2

     Pop   SI
     Pop   DI

     Or    BX,CX
     JNE   @00

     Cmp   DX,DI
     JA    @00
     JB    @01

     Cmp   AX,SI
     JA    @00

 @01:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+2],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02
     JE    @03

 @00:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr+4],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+6],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02

 @03:Mov   AX,[CS:OffSet @Vr+8]
     Mov   DX,[CS:OffSet @Vr+10]

End;

这是我的8086+ CPU实模式例程返回一个32位定点数,它是16位无符号整数输入数的平方根;不是太快,但可能会优化:

Function _Sqrt2:LongInt; Assembler;

{INPUT:

 Si-> Unsigned integer number X (unaltered).

 OUTPUT:

 AX-> Integer part of Sqrt(X).
 DX-> Decimal part of Sqrt(X).

 TEMP:

 BX, CX, DX, Di}

Asm

     Jmp   @Go

 @Vr:DD    0 {+0  LOW}
     DD    0 {+4  HIGH}
     DD    0 {+8  Mid}
     DB    0 {+12 COUNT}

 @Go:Mov   [CS:OffSet @Vr],Word(0)
     Mov   [CS:OffSet @Vr+2],Word(0)

     Mov   [CS:OffSet @Vr+4],Word(0)
     Mov   [CS:OffSet @Vr+6],SI

     Mov   [CS:OffSet @Vr+8],Word(0)
     Mov   [CS:OffSet @Vr+10],Word(0)

     Mov   [CS:OffSet @Vr+12],Byte(32)

 @02:Mov   AX,[CS:OffSet @Vr]
     Mov   DX,[CS:OffSet @Vr+2]

     Mov   CX,[CS:OffSet @Vr+4]
     Mov   BX,[CS:OffSet @Vr+6]

     Add   AX,CX
     AdC   DX,BX

     ShR   DX,1
     RCR   AX,1

     Mov   [CS:OffSet @Vr+8],AX
     Mov   [CS:OffSet @Vr+10],DX

     Mov   CX,AX
     Mov   BX,DX

     Push  SI

     Call  _PosLongIMul2

     Pop   SI

     Or    BX,BX
     JNE   @00

     Cmp   CX,SI
     JB    @01
     JA    @00

     Or    DX,AX
     JNE   @00

 @01:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+2],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02
     JE    @03

 @00:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr+4],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+6],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02

 @03:Mov   DX,[CS:OffSet @Vr+8]
     Mov   AX,[CS:OffSet @Vr+10]

End;

嗨!

答案 4 :(得分:0)

这是我原来的64位Square root例程:

// x:int64 (XL:XH)   
// sqrt64(X)
....
  asm      
   push ESI  {preserve ESI,EDI and EBX}
   push EDI
   push EBX

   mov ESI,XL      
   mov EDI,XH

   xor Eax,Eax
   xor Ebx,Ebx
   mov Ecx,32
@next:
   Add ESI,ESI   //old RCL ESI,1  -  Peter Cordes suggestion 
   ADC EDI,EDI   //old RCL EDI,1     ~1.38x faster! 
   ADC EBX,EBX   //old RCL Ebx,1     

   Add ESI,ESI   //old RCL ESI,1
   ADC EDI,EDI   //old RCL EDI,1
   ADC EBX,EBX   //old RCL Ebx,1

   mov EDX,EAX
   Add EDX,EDX   //old shl Edx,1
   stc
   ADC EDX,EDX   //old RCL Edx,1    {+01}

   cmp EBX,EDX   {if BX>=DX -> BX-DX}
  JC @skip
   sub EBX,EDX
@skip:
   cmc           {invert C}
   ADC EAX,EAX   //old  RCL Eax,1
  LOOP @next

   pop EBX
   pop EDI
   pop ESI

   mov result,Eax
 end;
....

答案 5 :(得分:0)

对于幼稚的素数测试循环条件,您可以使用试验部门的商来确定除数何时通过sqrt:TensorShape([Dimension(3600), Dimension(2)])

Checking if a number is prime in NASM Win64 Assembly /具有更多细节,包括x86 asm中的实现。无论如何,您都要进行该除法来检查if (x / d > d) break;,因此您可以免费获得商。

在64位代码中,常规整数除法可以是64位。 (但是remainder == 0的64位操作数大小比Intel上的32位慢很多,因此请尽可能使用32位。)