NASM - 使用标签作为数组偏移量

时间:2015-07-16 06:26:50

标签: arrays assembly nasm x86-64


;-----------------------------------------------------------------------
; Listing from the question (NASM syntax, x86-64 Linux, SysV AMD64 ABI).
;
; avgArray walks two byte arrays in parallel and stores
; (a1[i] + a2[i]) >> 1 into avg[i].
; In:    rdi = a1, rsi = a2, rdx = avg, rcx = length in bytes
; Clobb: rax, rcx, rdx, rsi, rdi, r9, flags
; The loop body runs before the bounds check, so length must be >= 1.
;-----------------------------------------------------------------------

; Save the frame pointer and the callee-saved general-purpose registers.
%macro prologue 0
    push    rbp
    mov     rbp, rsp
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
%endmacro

; Undo `prologue`: restore the registers in reverse push order.
%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave                       ; mov rsp, rbp / pop rbp
%endmacro

segment .data
    ; NOTE: `offset` is one byte, but the adds below use 64-bit registers and
    ; therefore read 8 bytes starting here. This size mismatch is exactly what
    ; the question is about, so it is left in place on purpose.
    offset  db  1
segment .bss
    a1      resq    1
    a2      resq    1
    avg     resq    1
    avgL    resq    1           ; qword: written from rcx, read with a 64-bit add
segment .text
    global  avgArray

avgArray:
    prologue

    ; Spill the four arguments to static storage.
    mov     [a1], rdi
    mov     [a2], rsi
    mov     [avg], rdx
    mov     [avgL], rcx

    ; Reload them into the registers used by the loop.
    mov     rsi, [a1]           ; rsi = read cursor in a1
    mov     r9, [a2]            ; r9  = read cursor in a2
    mov     rdi, [avg]          ; rdi = write cursor in avg

    mov     rcx, rsi
    add     rcx, [avgL]         ; rcx = a1 + length (one past the last element)

    xor     rdx, rdx
    xor     rax, rax
    xor     rbx, rbx
avgArray_loop:
    mov     al, [rsi]
    mov     dl, [r9]
    add     ax, dx              ; 16-bit sum so the carry out of 8 bits is kept
    shr     ax, 1
    mov     [rdi], al

    add     rsi, [offset]       ; qword load from a byte variable (see .data)
    add     r9, [offset]
    add     rdi, [offset]

    cmp     rsi, rcx
    jb      avgArray_loop       ; unsigned compare: these are addresses

    epilogue
    ret

把 [offset] 替换为 1 时,代码运行良好。但是,当使用 [offset] 来定位下一个数组元素时,它的值似乎没有被加到 rsi、rdi 和 r9 上。我已经用 gdb 检查过:执行 add rsi, [offset] 之后,rsi 中存储的地址仍然不变。


顺便说一句:Linux x86_64机器

; Static storage used by avgArray.
segment .data
    ; Step added to each array pointer per iteration. Declared as a qword
    ; because it is consumed by 64-bit adds (`add rsi, [offset]`), which read
    ; 8 bytes; with `db 1` the upper 7 bytes would come from whatever happens
    ; to follow `offset` in memory.
    offset  dq  1
segment .bss
    a1      resq    1           ; first source pointer
    a2      resq    1           ; second source pointer
    avg     resq    1           ; destination pointer
    avgL    resq    1           ; array length, stored and loaded as a qword

; Annotated copy of the question's listing. The instructions are kept as
; asked so that each review comment still points at the code it criticises.

%macro prologue 0
    push    rbp
    mov     rbp, rsp            ; You can drop this and the LEAVE in the epilogue.
    ; Stack frames were useful before debuggers could keep track of things
    ; without them, and as a convenience so that local variables stayed at
    ; the same offset from the base pointer even while you were
    ; pushing/popping stuff on the stack.
    ; With the SysV ABI you can use the red zone for locals without even
    ; touching RSP, as long as you don't push/pop or call anything.
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
%endmacro

%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave                       ; mov rsp, rbp / pop rbp
%endmacro

segment .data
    offset  db  1
segment .bss    ; These should really be locals on the stack (or in regs!), not globals.
    a1      resq    1
    a2      resq    1
    avg     resq    1
    avgL    resq    1           ; qword: it is stored from rcx and read by a 64-bit add

segment .text
; A comment with the C prototype and a short description is usually a good
; idea for a function.
; In: rdi = a1, rsi = a2, rdx = avg, rcx = length in bytes
    global  avgArray

avgArray:
    prologue

    mov     [a1], rdi           ; What is this silliness? You have 16 registers for a reason.
    mov     [a2], rsi           ; Shuffling values into the registers you want them in
    mov     [avg], rdx          ; is best done with reg-reg moves.
    mov     [avgL], rcx         ; A comment at the top of a block documenting which
                                ; register holds what works well instead.

    mov     rsi, [a1]
    mov     r9, [a2]
    mov     rdi, [avg]

    mov     rcx, rsi
    add     rcx, [avgL]         ; This could be lea rcx, [rsi+rcx]
                                ; (the length arrives in rcx as a function arg anyway).

    xor     rdx, rdx
    xor     rax, rax
    xor     rbx, rbx
avgArray_loop:                  ; You can use a local label here, starting with a dot.
    ; You don't need a different name for each loop: the assembler branches
    ; to the most recent instance of a local label.
    mov     al, [rsi]           ; There is a data dependency on the old value of ax,
    mov     dl, [r9]            ; since the CPU doesn't "know" that shr ax, 1 always
                                ; leaves ah zeroed in this algorithm.

    add     ax, dx              ; Avoid ALU ops on 16-bit regs whenever possible, to
                                ; avoid decode stalls on Intel. (8-bit is fine: it has
                                ; different opcodes instead of a prefix.)
    shr     ax, 1               ; Better to use 32-bit regs (movsx/movzx).
    mov     [rdi], al

    add     rsi, [offset]       ; These are 64-bit adds, so you're reading 7 bytes
    add     r9, [offset]        ; after the 1 you set with db.
    add     rdi, [offset]

    cmp     rsi, rcx
    jb      avgArray_loop

    epilogue
    ret


此外,单寄存器寻址模式只有在用作 ALU 指令的内存操作数时才更高效。当有多个指针时,只需递增一个计数器,并使用 base + index*scale 寻址(除非你要展开循环),尤其是当你用 mov 加载这些数据时。



; no storage needed
segment .text
GLOBAL  avgArray
;-----------------------------------------------------------------------
; void avgArray(uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
; Stores (a1[i] + a2[i]) >> 1 (truncating) into avg[i] for i in [0, len).
; If you can choose your prototype, do it so args arrive where you want them.
; ABI:   SysV AMD64
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
; Clobb: rax, rcx, rdx, rsi, rdi, r8, flags (caller-saved registers only)
;-----------------------------------------------------------------------
avgArray:
    ; mov    [rsp-8], rcx       ; if I wanted to spill  len  to memory

    ; Point all three registers one past the end of their arrays and index
    ; with a negative offset that counts up towards zero.
    add    rdi, rcx
    add    rsi, rcx
    add    rdx, rcx
    neg    rcx                  ; now [rdi+rcx] is the start of dest
    jz     .done                ; neg sets ZF when len == 0: nothing to do
    ; We could also have counted down towards zero, but HW memory
    ; prefetchers have more stream slots for forward patterns than reverse.

.loop:
    ; use movsx instead for signed char
    movzx  eax, byte [rsi+rcx]  ; zero-extending load: dependency-breaker
    movzx  r8d, byte [rdx+rcx]  ; r8d avoids a push/pop of rbx. On pre-Nehalem,
                                ; where insn decode can be a bottleneck even in
                                ; tight loops, ebx or ebp would save a REX
                                ; prefix (1 insn byte).
    add    eax, r8d             ; 9-bit sum, cannot overflow a 32-bit register
    shr    eax, 1
    mov    [rdi+rcx], al

    inc    rcx                  ; no cmp needed: that is the point of counting
    jnz    .loop                ; up towards zero; inc + jcc can macro-fuse

.done:
    ; nothing to pop, we only used caller-saved regs.
    ret

在 Intel 上,这个循环是 7 个 uop(存储占 2 个 uop:存储地址和存储数据,且无法微融合),因此每周期能发出 4 个 uop 的 CPU 每处理一个字节需要 2 个周期。movzx(目标为 32 或 64 位寄存器)无论如何都只有 1 个 uop,因为它不需要端口 0/1/5 上的 ALU uop,也就无所谓是否微融合。(它只是一次读取,而不是读取-修改。)

7 个 uop 需要分成 2 组(每组最多 4 个 uop)发出,因此循环的每次迭代可以在 2 个周期内发出。没有其他瓶颈会妨碍执行单元跟上这个速度,所以它应该每 2 个周期完成一次迭代。


有一条向量指令正好可以完成这个操作:PAVGB 对打包的无符号字节求平均值(使用 9 位临时值以避免溢出,这一点与上面的 add / shr 相同)。需要注意的是,PAVGB 计算的是 (a + b + 1) >> 1,即向上舍入,而 add / shr 是向下截断。

; no storage needed
segment .text
GLOBAL  avgArray
;-----------------------------------------------------------------------
; void avgArray(uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
; Stores the rounded-up average (a1[i] + a2[i] + 1) >> 1 into avg[i] for
; i in [0, len). PAVGB rounds up, so the scalar tail adds 1 before the
; shift to give the same result for every element.
; ABI:   SysV AMD64
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
; Clobb: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0, xmm1, flags
; No alignment requirement on any of the three buffers.
;-----------------------------------------------------------------------
avgArray:
    mov    r8, rcx
    and    ecx, 15              ; rcx = len % 16: bytes left for the scalar tail
    and    r8, -16              ; r8  = len rounded down to a multiple of 16
    jz     .tail                ; fewer than 16 bytes: skip the vector loop

    ; Same setup as the scalar version: point past the vector part and
    ; count a negative index up towards zero.
    add    rdi, r8
    add    rsi, r8
    add    rdx, r8
    neg    r8

.vec_loop:
    movdqu    xmm0, [rsi+r8]    ; 1 uop
    movdqu    xmm1, [rdx+r8]    ; 1 uop; a separate unaligned load, because
                                ; pavgb with a memory operand faults unless
                                ; the address is 16-byte aligned
    pavgb     xmm0, xmm1        ; 1 uop: 16 unsigned byte averages
    movdqu    [rdi+r8], xmm0    ; 2 uops: no micro-fusion

    add    r8, 16
    jnz    .vec_loop            ; 1 macro-fused uop add/branch

    ; rdi/rsi/rdx now point at the first byte not yet processed.
.tail:
    test   ecx, ecx
    jz     .done
    add    rdi, rcx
    add    rsi, rcx
    add    rdx, rcx
    neg    rcx

.tail_loop:
    ; use movsx instead for signed char
    movzx  eax, byte [rsi+rcx]
    movzx  r9d, byte [rdx+rcx]
    lea    eax, [rax+r9+1]      ; a + b + 1, to match PAVGB's rounding
    shr    eax, 1
    mov    [rdi+rcx], al

    inc    rcx
    jnz    .tail_loop

.done:
    ret


同样,每次迭代是 6 个 uop / 2 个周期,但每次迭代处理 16 个字节。最好展开循环,使循环体的 uop 数是 4 的倍数,这样就不会因为循环末尾那个不足 4 个 uop 的周期而损失发射带宽。每周期 2 次加载 / 1 次存储是这里的瓶颈,因为 PAVGB 的吞吐量是每周期 2 条。

在 Haswell 及之后的处理器上,达到 16B/周期应该不难。使用 AVX2 和 ymm 寄存器,可以达到 32B/周期。(SnB / IvB 每周期只能执行两次内存操作,其中最多一次是存储,除非使用 256 位的加载/存储。)无论如何,到这一步你已经通过向量化获得了 16 倍的加速,通常这已经足够了。我只是喜欢通过计算 uop 数和展开循环,把代码调到理论最大吞吐量。 :)

如果要展开循环,那么递增指针而不只是递增索引是值得的。(这样就是两次使用 [rdx] 加一条 add,而不是两次使用 [rdx + rcx]。)
