Question

我正在尝试用汇编编写一个小程序，它接受三个char数组作为输入，计算前两个数组中对应元素的平均值，并将结果存储在第三个数组中，如下所示。

; prologue: function-entry boilerplate (takes no macro arguments).
; Sets up an rbp-based stack frame, then saves rbx and r12-r15 on the stack.
; The matching `epilogue` macro pops them in the reverse order.
%macro prologue 0
    push    rbp
    mov     rbp,rsp
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
%endmacro
; epilogue: function-exit boilerplate (takes no macro arguments).
; Pops r15..r12 and rbx (reverse of the push order used by `prologue`),
; then `leave` (mov rsp,rbp / pop rbp) tears down the frame and `ret` returns.
%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave
    ret
%endmacro

segment .data
    offset  db  1           ; per-element pointer step used by the loop; only ONE byte is defined
segment .bss
    ; Static scratch copies of the four incoming arguments (written at the top of avgArray).
    a1      resq    1       ; pointer to the first input array  (stored from rdi)
    a2      resq    1       ; pointer to the second input array (stored from rsi)
    avg     resq    1       ; pointer to the output array       (stored from rdx)
    avgL    resd    1       ; array length (stored from rcx); reserves 4 bytes, but avgArray stores all 8 bytes of rcx here
segment .text
    global  avgArray 
; avgArray: writes the byte-wise average of two input arrays into an output array.
; In (as used below): rdi = first input array, rsi = second input array,
;                     rdx = output array,      rcx = length in bytes.
; Each output byte is (in1[i] + in2[i]) >> 1, i.e. the average rounded down.
; The loop body runs before the bounds check, so it always executes at least once.
avgArray:
    prologue

    ; Spill the four arguments to static storage ...
    mov [a1], rdi
    mov [a2], rsi
    mov [avg], rdx
    mov [avgL], rcx     ; 64-bit store into a slot declared with resd (4 bytes)

    ; ... and reload them into the registers the loop works with.
    mov rsi, [a1]       ; rsi = read cursor in the first input array
    mov r9, [a2]        ; r9  = read cursor in the second input array
    mov rdi, [avg]      ; rdi = write cursor in the output array

    mov rcx, rsi
    add rcx, [avgL]    ; array length  -> rcx = one-past-the-end of the first input (64-bit read of avgL)

    ; Zero the full registers so that ah and dh are 0 when ax/dx are added below.
    xor rdx, rdx
    xor rax, rax
    xor rbx, rbx        ; rbx is not used again in this function
avgArray_loop:
    mov al, [rsi]       ; al = byte from the first input
    mov dl, [r9]        ; dl = byte from the second input
    add ax, dx          ; 16-bit add: the sum of two bytes (at most 510) cannot overflow
    shr ax, 1           ; halve the sum; the result fits in al and ah is 0 again
    mov [rdi], al       ; store the averaged byte

    ; Advance all three cursors. These are 64-bit adds, so each one reads 8 bytes
    ; starting at `offset`, of which only the first byte is defined by `db 1`.
    add rsi, [offset]
    add r9, [offset]
    add rdi, [offset]

    cmp rsi, rcx        ; reached the end of the first input?
    jb  avgArray_loop   ; unsigned compare: pointers are addresses
    epilogue

将[offset]替换为立即数1时，程序工作得很好。但是，当使用[offset]来前进到下一个数组元素时，它的值似乎没有被加到rsi、rdi和r9上。我已经用gdb检查过：执行add rsi, [offset]之后，rsi中存储的地址仍然不变。

有人可以告诉我，为什么使用[offset]不起作用，而直接加1却可以吗？

顺便说一句：Linux x86_64机器

Answer 1

所以我找到了解决这个问题的方法。

avgL和offset在内存中是紧挨着存放的。把rcx的值存入avgL时（mov [avgL], rcx会写入8个字节，而avgL只保留了4个字节），offset的值也被一并覆盖了。将avgL声明为QWORD而不是DWORD，就可以防止这条mov覆盖offset的数据。

新的data段和bss段如下所示：

segment .data
    offset  db  1           ; still a single byte; the 64-bit `add reg, [offset]` instructions in avgArray read 8 bytes from here
segment .bss
    a1      resq    1       ; pointer to the first input array
    a2      resq    1       ; pointer to the second input array
    avg     resq    1       ; pointer to the output array
    avgL    resq    1       ; widened from resd to resq: a full 8 bytes, matching the 64-bit store of rcx

Answer 2

能自己调试出问题所在，干得不错。既然我已经开始看这段代码了，就顺便以注释的形式补充一些关于效率和风格的意见：

%macro prologue 0
    push    rbp
    mov     rbp,rsp   ; You can drop this and the LEAVE in the epilogue
                      ; (if rbp is still pushed, the epilogue then needs a plain `pop rbp` instead).
; Stack frames were useful before debuggers could keep track of things without
; them, and as a convenience so that local variables stay at the same offset
; from the base pointer even while you push/pop other things on the stack.
; With the SysV ABI you can use the red zone for locals without touching RSP
; at all, as long as you don't push/pop or call anything.
    push    rbx
    push    r12       ; r12-r15 are never written by avgArray below, so saving them is not required there
    push    r13
    push    r14
    push    r15
%endmacro
; Restores the registers saved by `prologue` in reverse push order, then returns.
%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave             ; mov rsp,rbp / pop rbp: undoes the frame set up by the prologue
    ret
%endmacro

segment .data
    offset  db  1   ; one byte only, yet the loop reads it with 64-bit adds
segment .bss    ; These should really be locals on the stack (or in regs!), not globals
    a1      resq    1   ; pointer to the first input array
    a2      resq    1   ; pointer to the second input array
    avg     resq    1   ; pointer to the output array
    avgL    resd    1   ; array length; 4 bytes reserved, but avgArray stores 8 bytes (rcx) here

segment .text
; A comment with the C function prototype and a short description is usually a good idea for functions.
    global  avgArray
avgArray:
    prologue

    mov [a1], rdi     ; What is this silliness?  You have 16 registers for a reason.
    mov [a2], rsi     ; Shuffling values into the registers you want them in
    mov [avg], rdx    ; is best done with reg-reg moves.
    mov [avgL], rcx   ; I like to put a comment at the top of a block of code
                      ; to document what goes in which register.

    mov rsi, [a1]     ; rsi = first input array
    mov r9, [a2]      ; r9  = second input array
    mov rdi, [avg]    ; rdi = output array

    mov rcx, rsi
    add rcx, [avgL]    ; This could be lea rcx, [rsi+rcx]
              ;  (since avgL is still in rcx anyway as a function arg).

    xor rdx, rdx       ; zeroing makes dh = 0 for the 16-bit add below
    xor rax, rax       ; likewise ah = 0
    xor rbx, rbx       ; rbx is never used after this
avgArray_loop:   ; You can use a local label here, starting with a '.'
 ; You don't need a different name for each loop: NASM scopes a .label to the preceding non-local label.
    mov al, [rsi]        ; There is a data dependency on the old value of ax,
    mov dl, [r9]         ; since the CPU doesn't "know" that shr ax, 1 will always leave ah zeroed in this algorithm.

    add ax, dx           ; Avoid ALU ops on 16-bit regs whenever possible, to avoid decode stalls on Intel.
                         ; (8-bit is fine: those have different opcodes instead of a prefix.)
    shr ax, 1            ; Better to use 32-bit regs (load with movsx/movzx).
    mov [rdi], al

    add rsi, [offset]    ; These are 64-bit adds, so you're also reading the 7 bytes after the 1 you set with db.
    add r9, [offset]
    add rdi, [offset]

    cmp rsi, rcx         ; loop until the read cursor reaches the end pointer
    jb  avgArray_loop
    epilogue

你有大量的寄存器，为什么要把循环增量放在内存中？我希望这只是调试/尝试各种写法时留下的结果。

此外，单寄存器寻址模式只有在用作ALU指令的内存操作数时才更高效。当你有很多指针时，只需递增一个计数器，并使用base + offset * scale寻址即可（除非你要展开循环），尤其是当你用mov来加载它们时。

以下是我会采用的写法（附带针对Intel SnB及之后CPU的性能分析）：

标量

; no storage needed
segment .text
GLOBAL  avgArray
;-----------------------------------------------------------------------
; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i]) >> 1  for i in [0, len)   (unsigned bytes, rounds down)
; ABI:   SysV AMD64
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
; Out:   none
; Clobb: rax, rcx, rdx, rsi, rdi, r8, flags  (caller-saved registers only)
;-----------------------------------------------------------------------
avgArray:
    ; if you can choose your prototype, do it so args go where you want them anyway.

    ; mov    [rsp-8], rcx    ; if I wanted to spill  len  to memory

    ; Move all three pointers to one-past-the-end of their arrays ...
    add    rdi, rcx
    add    rsi, rcx
    add    rdx, rcx
    neg    rcx       ; ... so [rdi+rcx] is the start of dest, and we can count rcx upwards towards zero.
    jz     .done     ; len == 0: nothing to do (NEG sets ZF when its result is zero)
    ; We could also have just counted down towards zero
    ; but HW memory prefetchers have more stream slots for forward patterns than reverse.

ALIGN 16
.loop:
    ;  use movsx for signed char
    movzx  eax, byte [rsi+rcx]     ; dependency-breaker (NASM needs the explicit `byte` size)
    movzx  r8d, byte [rdx+rcx]     ; Using r8d to save push/pop of rbx
           ; on pre-Nehalem where insn decode can be a bottleneck even in tight loops
           ; using ebx or ebp would save a REX prefix (1 insn byte).
    add    eax, r8d                ; 32-bit add: the 9-bit sum cannot overflow
    shr    eax, 1
    mov    [rdi+rcx], al

    inc    rcx     ; No cmp needed: this is the point of counting up towards zero
    jnz    .loop   ; inc/jnz can macro-fuse into one uop; ZF-based, so it works for any len

.done:
    ; nothing to pop, we only used caller-saved regs.
    ret

在Intel上，这个循环是7个uop（存储是2个uop：存储地址和存储数据，并且不能微融合），因此每个周期可以发出4个uop的CPU将以每字节2个周期的速度运行。movzx（目标为32位或64位寄存器）无论如何都是1个uop，因为它没有可以与之微融合的端口0/1/5 uop。（它是一次纯读取，而不是读取-修改。）

7个uop需要分成2组发出（每组最多4个uop），因此循环的每次迭代可以在2个周期内发出。没有其他瓶颈会妨碍执行单元跟上这个速度，所以它应该每2个周期完成一次迭代。

矢量

有一条向量指令恰好可以完成这个操作：PAVGB对打包的（packed）无符号字节求平均值（使用9位临时值以避免溢出，与你的add / shr相同）。注意PAVGB计算的是(a + b + 1) >> 1，即向上舍入；而标量版本的add / shr是向下舍入，因此当两数之和为奇数时，两者的结果会相差1。

; no storage needed
segment .text
GLOBAL  avgArray
avgArray:
    ; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
    ; rdi = avg
    ; rsi = a1
    ; rdx = a2
    ; rcx = len

; same setup
; NOTE: the setup instructions are not shown in this sketch. The [reg+rcx]
; addressing with `add rcx, 16` / `jl` below expects rcx to be a negative index
; that counts up towards zero, with rdi/rsi/rdx pointing past the array ends.
; TODO: scalar loop here until [rdx+rcx] is aligned.
;       (PAVGB with a memory operand requires that operand to be 16-byte aligned.)
ALIGN 16
.loop:
    ; PAVGB treats the bytes as unsigned and rounds up: (a + b + 1) >> 1.
    movdqu    xmm0, [rsi+rcx]    ; 1 uop
    pavgb     xmm0, [rdx+rcx]    ; 2 uops (no micro-fusion)
    movdqu    [rdi+rcx], xmm0    ; 2 uops: no micro-fusion

    add    rcx, 16               ; each iteration handles 16 bytes
    jl     .loop          ; 1 macro-fused uop add/branch
    ; As written, the loop always processes whole 16-byte chunks, so a len that
    ; is not a multiple of 16 runs past the end of the arrays.
    ; TODO: scalar cleanup.
    ret

要把循环退出条件写对比较棘手，因为如果下一个16字节会越过数组末尾，就必须结束向量循环。大概最好的处理办法是：在把rcx加到各个指针上之前，先将rcx减去15左右。

所以同样是每次迭代6个uop / 2个周期，但每次迭代会处理16个字节。理想的做法是展开循环，使循环体的uop数是4的倍数，这样就不会因为循环末尾那个不足4个uop的周期而损失发射速率。此时的瓶颈是每周期2次加载 / 1次存储，因为PAVGB的吞吐量是每周期2条。

在Haswell及之后的CPU上，达到16B/周期应该不难。借助AVX2使用ymm寄存器，可以达到32B/周期。（SnB / IvB每个周期只能执行两次内存操作，其中最多一次是存储，除非使用256位的加载/存储。）无论如何，到这一步你已经通过向量化获得了16倍的加速，通常这已经足够好了。我只是喜欢通过计算uop数和展开循环来把代码调优到理论最大吞吐量。:)

如果你打算展开循环，那么就值得改为递增指针，而不是只递增一个索引。（这样就是两次使用[rdx]加上一条add，而不是两次使用[rdx + rcx]。）

无论哪种方式，清理循环设置并将所有内容保存在寄存器中都可以节省大量的指令字节和短数组的开销。

NASM - 使用标签作为阵列偏移

2 个答案:

标量

矢量