64x64 => 128 位，在 32 位模式下，使用一些 SSE2

Question

这是我用来实现两个32位数字的扩展乘法的代码。有没有办法通过制作子程序并通过参数传递使用堆栈来实现类似的逻辑？使用MUL指令还是没有指令？有人可以帮忙吗？

[org 0x0100]
jmp start

multiplicand: dd 123122,0
multiplier:   dd 66341
result:       dd 0,0

start:
initialize:   mov cl,32 

              mov bl,1
checkbit:     test bl,[multiplier]
              jz skip

multiply:     mov ax, [multiplicand]
              add [result],ax
              mov ax, [multiplicand+2]
              adc [result+2], ax
              mov ax, [multiplicand+4]
              adc [result+4], ax
              mov ax, [multiplicand+6] 
              adc [result+6], ax      

skip:         shl bl,1               
              shr word [multiplier+2],1 
              rcr word [multiplier],1 

              shl word [multiplicand],1 
              rcl word [multiplicand+2],1 
              rcl word [multiplicand+4],1 
              rcl word [multiplicand+6],1 
              dec cl
              jnz checkbit

              mov ax, 0x4c00
              int 0x21

Answer 1

[org 0x0100]
jmp start

multiplicand: dd 123122
multiplier:   dd 66341
result:       dd 0

start:
    push word [multiplier+2]
    push word [multiplier]
    push word [multiplicand+2]
    push word [multiplicand]
    call multiply
    add sp, 8            ; free arguments
    mov [result], ax     ; expect result in dx:ax
    mov [result+2], dx

    mov ax, 0x4c00
    int 0x21

multiply:
    push bp
    mov bp, sp

    mov ax, [bp+4]
    mul word [bp+8]      ; xl * yl
    mov cx, [bp+4]
    imul cx, [bp+10]     ; xl * yh
    add dx, cx
    mov cx, [bp+6]
    imul cx, [bp+8]      ; xh * yl
    add dx, cx

    mov sp, bp
    pop bp
    ret

目前尚不清楚是否需要64位结果，上述代码产生32位。

64位版本可能如下所示：

[org 0x0100]
jmp start

multiplicand: dd 123122
multiplier:   dd 66341
result:       dd 0, 0

start:
    push word [multiplier+2]
    push word [multiplier]
    push word [multiplicand+2]
    push word [multiplicand]
    push result           ; pointer for result
    call multiply
    add sp, 10            ; free arguments

    mov ax, 0x4c00
    int 0x21

multiply:
    push bp
    mov bp, sp
    push bx

    mov bx, [bp+4]       ; result

    mov ax, [bp+6]
    mul word [bp+10]     ; xl * yl
    mov [bx], ax         ; r0
    mov [bx+2], dx       ; r1

    mov ax, [bp+6]
    mul word [bp+12]     ; xl * yh
    add [bx+2], ax       ; r1
    adc dx, 0
    mov [bx+4], dx       ; r2

    mov ax, [bp+8]
    mul word [bp+10]     ; xh * yl
    add [bx+2], ax
    adc [bx+4], dx

    mov ax, [bp+8]
    mul word [bp+12]     ; xh * yh
    add [bx+4], ax       ; r2
    adc dx, 0
    mov [bx+6], dx       ; r3

    mov ax, bx           ; return result
    pop bx
    mov sp, bp
    pop bp
    ret

免责声明：我刚刚向后移植了通常的32位约定，其中一个额外的隐藏参数用于指向调用者保留的结果位置，该指针也会返回。这段代码有效，但不知道16位编译器是否真的使用了这种约定。

Answer 2

我想，您的问题是SP缺少算术函数，例如[sp + 4]。您可以改用BP。在您自己的汇编函数中，您可以自由地传递参数和结果。我将向您展示一种通过堆栈传递参数并在堆栈上获取结果的方法：

BITS 16
ORG 0x0100

jmp start

multiplicand: dd 123122,0                   ; 0102 0x0001E0F2 -> 0x00000000
                                            ; 0106 0x00000000 -> 0x0001E0F2
multiplier:   dd 66341                      ; 010A 0x00010325 -> 0x00000000
result:       dd 0,0                        ; 010E 0x00000000 -> 0x0023B1F6
                                            ; 0112 0x00000000 -> 0x00000000

start:
            push word [multiplicand + 6]    ; bp + 22
            push word [multiplicand + 4]    ; bp + 20
            push word [multiplicand + 2]    ; bp + 18
            push word [multiplicand + 0]    ; bp + 16

            push word [multiplier + 2]      ; bp + 14
            push word [multiplier + 0]      ; bp + 12

            push word [result + 6]          ; bp + 10
            push word [result + 4]          ; bp + 8
            push word [result + 2]          ; bp + 6
            push word [result + 0]          ; bp + 4

            call sub_mul

            pop word [result + 0]           ; Pop stack into `result`
            pop word [result + 2]
            pop word [result + 4]
            pop word [result + 6]
            add sp, 12                      ; Clean up the rest of the stack                        ;

            mov ax, 0x4c00
            int 0x21

sub_mul:
            push bp                         ; Prolog
            mov bp, sp

initialize:   mov cl,32

              mov bl,1
checkbit:     test bl,[bp + 12]
              jz skip

multiply:     mov ax, [bp + 16]
              add [bp + 4],ax
              mov ax, [bp + 18]
              adc [bp + 6], ax
              mov ax, [bp + 20]
              adc [bp + 8], ax
              mov ax, [bp + 22]
              adc [bp + 10], ax

skip:         shl bl,1
              shr word [bp + 14],1
              rcr word [bp + 12],1

              shl word [bp + 16],1
              rcl word [bp + 18],1
              rcl word [bp + 20],1
              rcl word [bp + 22],1
              dec cl
              jnz checkbit

        leave                           ; Epilog
        ret

Answer 3

正如 Jester 指出的那样，您可以使用 4x mul 指令进行 32x32 => 64 位乘法，并使用适当的 add/adc 将部分乘积添加到 64 位结果中。（如果您没有对现有的非零 64 位值进行乘法累加，则在进位无法传播那么远的情况下，可以优化掉一些 adc [result+6], 0。）

您需要所有 4 个半组合的产品，
low * low、low * high /high * low 交叉产品和 high * high。（如果您只想要结果的低 32 位，与输入的宽度相同，则不需要 high * high 部分：那将完全位于高半部分。）

（在更现代的 x86-64 上，有关使用 64 位块的相同内容，请参阅 https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf，以及如何使用 mulx（保持 CF 不变）以及 ADOX/ADCX 提高效率对于具有 2 个以上输入块的大型产品变得有用。例如，512 位 x 512 位使用 64 位块又名四肢。白皮书提供了部分产品如何与重叠对齐的有用图表。）

Jester 的答案针对可读性进行了优化，每次乘法后都会将部分乘积存储/添加到内存中的结果中。

这是尝试将它们保存在寄存器中，并避免一些冗余加载。（尽管其中一些可能会以额外的机器代码大小为代价，这在原始 8086 上，尤其是 8088 上是昂贵的，除非在缓慢的 mul 指令的阴影下预取它。）我也进行了试验使用 stosw 存储结果并增加输出指针 (ES:DI)。我假设一个扁平/小内存模型，其中 DS=ES。另一种选择是将结果存储在堆栈上的输入参数上，如果您将存储延迟到每个输入的最后一次加载之后。

如果不需要保存/恢复寄存器，使用更多寄存器会变得更有吸引力。我决定让 BX 被这个版本的函数销毁，并取一个已经在 DI 中的结果指针。（寄存器arg）。在纯汇编语言中，使用自定义调用约定是有意义的，尤其是对于相对较小的辅助函数；显然可以添加更多的推送/弹出来保存/恢复这段代码使用的更多寄存器。

将第三个 add 的 adc/mul 工作延迟到最后一个 mul 之后似乎是一个很好的优化。

优化版本（63 字节：8088 / 8086 上的速度取决于大小）

我还尝试在 mul 之后安排快速指令（无内存访问），以清空将在非常慢的 mul 期间填满的预取缓冲区。（假设我们正在针对实际 8088 进行调整，其中总成本 ~= 存储/加载的总字节数之和，包括代码提取，除了像 mul 这样的慢指令需要额外的时间，并让代码提取有机会填满 4字节缓冲区。）

;;; inputs:  result pointer in ES/DS:DI (unmodified on return), DS=ES assumed.
;;;          dword X and Y on the stack.
;;;          (For ease of comparison / adaptation, I left an unused word below them, where the result pointer was passed before)
      ;;to only ever use ES:DI, use an ES prefix on the [DI] addr modes, or just avoid STOSW and use (DS:) [DI+0..6] addressing.
;;; outputs: qword [ES:DI] = X * Y
;;; clobbers: AX, CX, DX,  BX.
;;;           SI saved/restored, DI restored.
;;; precondition: DF=0  (cld) assumed as part of the calling convention
;%define bp ebp
;%define di edi
;%define sp esp

multiply_v2:
    push  bp
    mov   bp, sp
    push  si
;%define bp ebp+2      ; saved-EBP is wider than saved-BP

    ;mov   di, [bp+4]       ; result pointer.  Instead, caller passes it

    mov   ax, [bp+6]       ; xl
    mov   si, ax           ; xl
    mov   cx, [bp+10]      ; yl

    MUL   cx               ;; xl * yl
    mov   bx, dx           ; r1   ; eventually store to [di-2 + 2]
    stosw                  ; r0;  ; mov  [di], ax  /  add di, 2

    mov   ax, [bp+8]       ; xh
    MUL   cx               ;; xh * yl
    xor   cx, cx
    add   ax, bx
    stosw                       ; r1 in mem, BX free, DI pointing at result+4 = r2
    adc   cx, dx                ; r2 in CX   (adc dx,0 / mov cx,dx)
    ; carry into r3 is impossible here: 0xffff^2 has a high half of 0xfffe

    mov   ax, [bp+12]      ; yh
    xchg  si, ax             ; save yh; we're done with xl.  (xchg-with-AX saves a byte vs. mov: worth it on 8088)
    MUL   si               ;; xl * yh
    mov   bx, dx             ; save xlyh_hi (r2)
    xchg  si, ax             ; save xlyh_lo (r1), get yh into AX for next mul

    MUL   word [bp+8]      ;; xh * yh  => r3:r2
    add   bx, cx           ; r2: combine partial products from r2:r1 cross-multiplies
    adc   dx, 0            ; r3: which can carry-out, into a real r3 instead of needing to materialize a 0 / 1

    add   [di-4 + 2], si   ; r1  at result+2, from 2nd cross-multiply
    adc   ax, bx           ; r2, *can* carry farther
    adc   dx, 0            ; r3

    stosw                  ; r2
    mov   [di], dx         ; r3 at [result+6] = [di-6 + 6]

    pop   si
    sub   di, 6            ; 3 bytes.  Still pays for itself in size savings of earlier stosw instructions vs. mov [di+4], on 8086/8088

;%define bp ebp
    ; mov sp, bp   ; redundant
    pop   bp
    ret

注释掉的 %define 行来自 Linux 下 32 位模式下的测试，使用以下 C 调用程序。（它有效，包括一些极端情况，如 0xFFFFFFFF 平方）。 arg 空间未使用的低 2 字节（DI 寄存器 arg 的“home 空间”？）在 32 位模式下由返回地址的顶部填充，但 push ebp 也是 4 字节，而不是 2，所以偏移量必须改变。

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

__attribute__((regparm(1)))
extern void multest_32b(uint64_t *result, uint32_t x, uint32_t y);

int main(int argc, char **argv) {
    unsigned x = 0x12345, y = 0xffbcde98;
    if (argc>=3) {
        x = strtoll(argv[1], NULL, 0);
        y = strtoll(argv[2], NULL, 0);
    }
    uint64_t test;
    multest_32b(&test, x,y);
    printf ("%#x * %#x = %#llx.  (test ^ control = %#llx)\n", x, y, test, test ^ (x * (uint64_t)y)  );
}

64 位异或结果在它们匹配时当然是 0，而 GCC 知道如何用一条 mul 指令进行 32x32 => 64 位乘法。

我使用了一个包装函数来适应调用约定的差异。我希望我可以在不复制堆栈参数的情况下逃脱，所以我使用 XMM0 来保存/恢复 EDI，当我意识到这行不通时没有改变它。我也让它破坏了调用者的 BX，但似乎 GCC 制作的代码不依赖于 EBX，即使它是 i386 System V 调用约定中的调用保留寄存器。我可能刚刚完成了 mov edi, eax / jmp，因为这个 main 可能也不使用 EDI。

;; wrapper function adapter for 32-bit C callers
global multest_32b              ; __attribute__((regparm(1)))
multest_32b:
    movd  xmm0, edi
    mov   edi, eax
    push  dword [esp+8]    ; bleh, can't tail-call if we want to restore (E)DI
    push  dword [esp+8]
    call  multiply_v2
    movd  edi, xmm0
    add   esp, 8
    ret

$ nasm -felf32 mul-ext.asm && gcc -m32 mul-test.c mul-ext.o
$ ./a.out
$ ./a.out 0x7feffeff 0xffeffeff 
0x7feffeff * 0xffeffeff = 0x7fe7ff7ea0210201.  (test ^ control = 0)
$ ./a.out 0xffffffff 0xffffffff 
0xffffffff * 0xffffffff = 0xfffffffe00000001.  (test ^ control = 0)

$ ./a.out 
0x12345 * 0xffbcde98 = 0x122f89eeec6f8.  (test ^ control = 0)
$ while ./a.out $(( $RANDOM * $RANDOM )) $(( $RANDOM * $RANDOM )) ;do :;done | fgrep -v 'control = 0)'
 ... runs until you ^C with no output, i.e. no error

64x64 => 128 位，在 32 位模式下，使用一些 SSE2

我认为如果你用 SSE2 pmuludq 做两个部分积，看看它是什么样子会很有趣。此版本未经测试。 GCC 在 64 位目标上有一个 unsigned __int128，因此可以将相同的测试策略与从寄存器 args 改编为堆栈 args 的包装函数一起使用。

第 1 版使用标量进行交叉积，另外两个使用 SIMD，使用向量存储/标量重新加载而不是一堆洗牌，因为我们想用标量接触 4 个元素中的 3 个，以及低 dword那家商店的东西已经完成了。

未经测试，i386 System V 调用约定。（纯堆栈参数、EAX、ECX、EDX 和 XMM regs 调用破坏）。

global multiply_64x64_128_SSE2
multiply_64x64_128_SSE2:             ; (uint64_t x, uint64_t y,  uint128 *result)  stack args
    push  ebx
%define ARGS esp+4
    mov   eax,  [ARGS + 0]    ; Xl
    MUL   dword [ARGS + 12]   ;; Xl * Yh       ; first cross product

     ;; assume aligned args, like Linux's version of i386 System V
    pshufd  xmm0, [ARGS + 0], 0b11_01_10_00    ; [ Yh, Xh, Yl, Xl ]
    pshufd  xmm1, xmm0, 0b1111_0101            ; [ Yh, Yh, Yl, Yl ]
    pmuludq xmm0, xmm1                         ; [   Xh*Yh,   Xl*Yl ]  hi * hi and low x low 64-bit halves where they should be in result.

    mov   ebx, eax              ; r1
    mov   ecx, edx              ; r2
    mov   eax, [ARGS + 4]       ; Xh
    MUL   dword [ARGS + 12]     ;; Xh * Yh    r2:r1
    add   eax, ebx              ; r1
    adc   edx, ecx              ; r2
    mov   ecx, 0                      ; can't xor-zero to avoid partial-register stalls unless we had another free register. setc / movzx is better on P6-family, false dep otherwise.
                                      ; or mov ecx, [ebx+12] / adc ecx,0   now instead of memory-source adc later: good on SnB-family, except store-forwarding neds to be ready sooner.
    setc  cl                    ; r3 carry-out from r2 materialized without reloading SIMD result yet

    mov   ebx, [ARGS+16]        ; uint128 *result
    movdqu  [ebx], xmm0         ; vector store, then accumulate cross products into it.

    add   [ebx+4], eax          ; r1
    adc   edx, [ebx+8]          ; r2
     mov   [ebx+8], edx             ; memory-destination ADC is inefficient on Intel
    adc   ecx, [ebx+12]         ; r3
     mov   [ebx+12], ecx
    pop   ebx
    ret

备用 SSE2 版本：对包含 Yl 的两种产品使用 SIMD。与早期版本相比，需要一个额外的空闲寄存器，并在关键路径中更早地重新加载 SIMD 存储。但确实节省了 1 个标量负载。

global multiply_64x64_128_SSE2_v2
multiply_64x64_128_SSE2_v2:             ; (uint64_t x, uint64_t y,  uint128 *result)  stack args
    push  ebx
    push  edi
%define ARGS esp+8
    mov   eax,  [ARGS + 12]    ; Yh
    mov   edi,  eax
    MUL   dword [ARGS + 0]     ;; Xl * Yh     r2:r1 cross product

     ;; assume aligned args, like Linux's version of i386 System V
    movdqu  xmm0, [ARGS + 0]                   ; [ Yh, Yl, Xh, Xl ]
        ;  pshufd  xmm0, [ARGS + 0], 0b11'01'10'00    ; [ Yh, Xh, Yl, Xl ]
    pshufd  xmm1, xmm0, 0b11_01_00_10          ; [ Yh, Xh, Xl, Yl ]      ; do both partial products involving Yl, using only 1 shuffle
    pmuludq xmm0, xmm1                         ; [   Xh*Yl(r2:r1),   Xl*Yl (r1:r0)]   ; low dword fully finished, and one cross product out of place

    mov   ebx, eax              ; r1
    mov   eax, [ARGS + 4]               ; Xh.  mul [mem] micro-fuses on Intel SnB-family, so this is equal to mov eax,edi / mul [mem] only needing 1 free reg.  But we need more later.
    mov   ecx, edx              ; r2
    MUL   edi                   ;; Xh * Yh    r3:r2

     mov   edi, [ARGS+16]        ; uint128 *result
     movdqu  [edi], xmm0         ; vector store, then accumulate partial products into it.

    add   ebx, [edi+8]          ; r1   (from SIMD cross product)
    adc   eax, [edi+12]         ; r2
    adc   edx, 0

    add   [edi+4], ebx          ; r1   (from SIMD low * low)
    adc   eax, ecx              ; r2
     mov   [edi+8], eax             ; memory-destination ADC is inefficient on Intel
    adc   edx, 0                ; r3
     mov   [edi+12], edx

    pop   ebx
    ret

adc 在 Broadwell（或 Sandybridge for adc reg,0）之前在 Intel 上是 2 uops，所以我尝试在它之后安排其他指令以获得更好的解码器吞吐量。但任一版本都可以以牺牲可读性为代价进行更积极的调整（分散更多相关操作，而不是将它们分组）。

它们也没有真正针对 P6 系列进行调整，其中部分寄存器停顿是一个问题。

MULX 对于有两个显式输出而不是隐式 EDX:EAX 真的很好，就像不接触 FLAGS 一样。将输入保持在同一个寄存器中并将几个不同的东西乘以它到不同的输出寄存器中将节省大量的 mov 指令。我真的明白他们为什么这样设计，而不是让输出之一是隐式操作数。

通过堆栈进行32位扩展乘法

3 个答案:

优化版本（63 字节：8088 / 8086 上的速度取决于大小）

64x64 => 128 位，在 32 位模式下，使用一些 SSE2