我正在尝试用汇编语言编写一个小程序,它接收三个char数组作为输入,
计算前两个数组中对应元素的平均值,并将结果存储在第三个数组中,如下所示。
; prologue: set up an rbp-based stack frame, then save rbx and r12-r15.
%macro prologue 0
push rbp
mov rbp,rsp
push rbx
push r12
push r13
push r14
push r15
%endmacro
; epilogue: restore r15..rbx in reverse push order, tear down the frame, return.
%macro epilogue 0
pop r15
pop r14
pop r13
pop r12
pop rbx
leave
ret
%endmacro
segment .data
offset db 1 ; per-iteration pointer increment; only ONE byte is defined here
segment .bss
a1 resq 1 ; saved copy of rdi (first input array pointer)
a2 resq 1 ; saved copy of rsi (second input array pointer)
avg resq 1 ; saved copy of rdx (output array pointer)
avgL resd 1 ; saved copy of rcx (array length); only FOUR bytes are reserved
segment .text
; avgArray
; In:  rdi = first input byte array, rsi = second input byte array,
;      rdx = output byte array, rcx = number of elements
; For each index, stores (first[i] + second[i]) >> 1 into the output array.
; The loop test is at the bottom, so the body runs at least once.
global avgArray
avgArray:
prologue
; Spill the four arguments to the globals declared above.
mov [a1], rdi
mov [a2], rsi
mov [avg], rdx
mov [avgL], rcx ; NOTE(review): 8-byte store into the 4-byte avgL reservation
; Reload them into the registers the loop uses.
mov rsi, [a1]
mov r9, [a2]
mov rdi, [avg]
mov rcx, rsi
add rcx, [avgL] ; array length
; rcx = address one past the last element of the first array (loop bound)
xor rdx, rdx ; clear upper bits so dx holds just the byte loaded into dl
xor rax, rax ; clear upper bits so ax holds just the byte loaded into al
xor rbx, rbx ; rbx is not referenced again in this function
avgArray_loop:
mov al, [rsi]
mov dl, [r9]
add ax, dx ; 16-bit add: the 9-bit sum of two bytes cannot overflow
shr ax, 1 ; halve (truncating); ah is zero again afterwards
mov [rdi], al
add rsi, [offset] ; NOTE(review): 64-bit operand, reads 8 bytes from the 1-byte `offset`
add r9, [offset]
add rdi, [offset]
cmp rsi, rcx
jb avgArray_loop ; unsigned compare: continue while rsi is below the end address
epilogue
将[offset]
替换为1
时,它的效果非常好。但是,当使用[offset]
确定下一个数组元素时,它似乎不会将其值添加到rsi
,rdi
和r9
。
我已经用gdb检查过了:执行
add rsi, [offset]
之后,rsi 中存储的地址仍然没有变化。
有人可以告诉我为什么使用[offset]
不会工作但添加简单的1会吗?
顺便说一句:Linux x86_64机器
答案 0 :(得分:3)
所以我找到了解决这个问题的方法。
avgL
和offset
的地址直接存放在彼此之后。从rcx
读取并将其存储到avgL
时,它也覆盖了offset
的值。将avgL
声明为QWORD而不是DWORD会阻止mov
覆盖offset
数据。
新数据和bss细分如下所示
segment .data
; The loop uses `add rsi, [offset]` with a 64-bit register, which reads a
; full qword. Define all 8 bytes so the add never picks up whatever happens
; to follow a single byte in memory.
offset dq 1
segment .bss
a1 resq 1      ; first input array pointer (from rdi)
a2 resq 1      ; second input array pointer (from rsi)
avg resq 1     ; output array pointer (from rdx)
avgL resq 1    ; array length (from rcx); qword so `mov [avgL], rcx` fits
答案 1 :(得分:1)
自己调试问题的好工作。由于我已经开始查看代码,我会给你一些效率/风格评论作为补充评论:
; prologue: set up an rbp-based stack frame, then save rbx and r12-r15.
%macro prologue 0
push rbp
mov rbp,rsp ; You can drop this and the LEAVE in the epilogue.
; Stack frames were useful before debuggers could keep track of things without them, and as a
; convenience so that locals stay at a fixed offset from the base pointer even while you push/pop.
; With the SysV ABI, a leaf function can keep locals in the red zone (the 128 bytes below RSP)
; without adjusting RSP at all, as long as it does not push/pop or call anything.
push rbx
push r12
push r13
push r14
push r15
%endmacro
; epilogue: restore r15..rbx in reverse push order, tear down the frame, return.
%macro epilogue 0
pop r15
pop r14
pop r13
pop r12
pop rbx
leave
ret
%endmacro
segment .data
offset db 1
segment .bss ; These should really be locals on the stack (or better, in registers), not globals.
a1 resq 1
a2 resq 1
avg resq 1
avgL resd 1 ; Only 4 bytes reserved, but `mov [avgL], rcx` below stores 8.
segment .text
; A comment with the C prototype and a short description is a good idea for every function.
global avgArray
avgArray:
prologue
mov [a1], rdi ; There is no need to bounce the arguments through memory:
mov [a2], rsi ; you have 16 registers. Shuffling values into the registers
mov [avg], rdx ; you want them in is best done with reg-reg moves.
mov [avgL], rcx ; A comment at the top of a block of code
; documenting which register holds what is usually enough.
mov rsi, [a1]
mov r9, [a2]
mov rdi, [avg]
mov rcx, rsi
add rcx, [avgL] ; This mov/add pair could be a single lea rcx, [rsi+rcx]
; (the length is still in rcx as a function argument before the mov above).
xor rdx, rdx
xor rax, rax
xor rbx, rbx
avgArray_loop: ; You can use a local label here (one starting with a '.').
; You don't need a different name for each loop: a NASM local label is scoped to the preceding
; non-local label, so every function can have its own .loop.
mov al, [rsi] ; Writing al leaves the rest of rax unchanged, so there is a data dependency on the old value of ax:
mov dl, [r9] ; the CPU doesn't "know" that shr ax, 1 always leaves ah zeroed in this algorithm.
add ax, dx ; Avoid 16-bit operand size where possible (operand-size prefix, partial-register writes).
; 8-bit operations have their own opcodes instead of a prefix.
shr ax, 1 ; Better to zero/sign-extend into 32-bit registers (movzx/movsx) and work there.
mov [rdi], al
add rsi, [offset] ; These are 64-bit adds, so each one reads 7 bytes beyond the single byte defined with db.
add r9, [offset]
add rdi, [offset]
cmp rsi, rcx
jb avgArray_loop
epilogue
你有大量的寄存器,为什么还要把循环增量放在内存里?我猜这只是在调试/尝试各种写法的过程中留下来的。
此外,单寄存器寻址模式只有在用作ALU指令的内存操作数时才更高效。当有多个指针时(除非你展开循环),只需递增一个计数器,并使用 base + index*scale 寻址,
尤其是在用 mov 加载它们的情况下。
以下是我的工作方式(使用英特尔SnB及更高版本的性能分析):
; no storage needed
segment .text
GLOBAL avgArray
;-----------------------------------------------------------------------
; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
; ABI:   SysV AMD64 (if you can choose your prototype, pick it so the
;        args already arrive in the registers you want)
; In:    rdi = avg (destination), rsi = a1, rdx = a2, rcx = len
; Out:   avg[i] = (a1[i] + a2[i]) >> 1 for 0 <= i < len (truncating)
; Clobb: rax, rcx, rdx, rsi, rdi, r8, flags (all caller-saved)
; Note:  len must be below 2^63 because the index is counted as a
;        negative signed value.
;-----------------------------------------------------------------------
avgArray:
    ; mov [rsp-8], rcx      ; if I wanted to spill len to memory (red zone)
    test    rcx, rcx
    jz      .done               ; len == 0: nothing to read or write

    ; Point every register one past the end of its array and make rcx = -len.
    ; Now [rdi+rcx] is the start of dest, and rcx counts upwards towards zero.
    ; We could also have counted down towards zero, but HW memory prefetchers
    ; have more stream slots for forward patterns than reverse.
    add     rdi, rcx
    add     rsi, rcx
    add     rdx, rcx
    neg     rcx

ALIGN 16
.loop:
    ; use movsx for signed char
    movzx   eax, byte [rsi+rcx] ; zero-extending load: no dependency on old rax
    movzx   r8d, byte [rdx+rcx] ; r8d is caller-saved, so no push/pop of rbx
    ; On pre-Nehalem, where insn decode can be a bottleneck even in tight
    ; loops, using ebx or ebp would save a REX prefix (1 insn byte).
    add     eax, r8d            ; 9-bit sum, cannot overflow a 32-bit register
    shr     eax, 1
    mov     [rdi+rcx], al
    inc     rcx                 ; no cmp needed: that is the point of counting up towards zero
    jl      .loop               ; inc/jl can macro-fuse into one uop
.done:
    ; nothing to pop, we only used caller-saved regs.
    ret
在Intel上,循环是7 uops,(存储是2 uops:存储地址和存储数据,并且不能微熔合),因此每个周期可以发出4 uop的CPU将在2个周期完成每字节。 movzx
(到32或64位reg)无论如何都是1 uop,因为没有端口0/1/5 uop用于微熔合与否。 (这是一个读取,而不是读取 - 修改)。
7 uops需要2个最多4个uop的块,因此循环可以在2个周期内发出。没有其他瓶颈可以阻止执行单元跟上它,所以它应该每2个周期运行一次。
有一条向量指令可以完成这个操作:PAVGB
对打包的无符号字节求平均(使用9位临时值以避免溢出,这一点与add / shr相同)。注意PAVGB计算的是 (a + b + 1) >> 1,即向上舍入,而上面的标量add / shr是向下截断。
; no storage needed
segment .text
GLOBAL avgArray
;-----------------------------------------------------------------------
; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
; ABI:   SysV AMD64; requires SSE2
; In:    rdi = avg (destination), rsi = a1, rdx = a2, rcx = len
; Out:   avg[i] = (a1[i] + a2[i] + 1) >> 1 for 0 <= i < len
;        (PAVGB rounds up; the scalar tail rounds the same way so every
;        element is computed identically)
; Clobb: rax, rcx, rdx, rsi, rdi, r8, xmm0, xmm1, flags (all caller-saved)
; Note:  no alignment is required of any array; len must be below 2^63.
;-----------------------------------------------------------------------
avgArray:
    ; Bias the index by 15: afterwards element i lives at [ptr + rcx] with
    ; rcx = i + 15 - len. rcx < 0 then means "at least 16 bytes remain",
    ; and rcx == 15 means "past the last element".
    sub     rcx, 15             ; rcx = len - 15 (may be negative for short arrays)
    add     rdi, rcx
    add     rsi, rcx
    add     rdx, rcx
    neg     rcx                 ; rcx = 15 - len; flags reflect the result
    jns     .tail               ; fewer than 16 bytes in total: scalar only

ALIGN 16
.loop:
    movdqu  xmm0, [rsi+rcx]     ; 1 uop
    movdqu  xmm1, [rdx+rcx]     ; 1 uop; separate load because a legacy-SSE
                                ; memory operand to pavgb must be 16B-aligned
    pavgb   xmm0, xmm1          ; 1 uop: 16 rounded unsigned byte averages
    movdqu  [rdi+rcx], xmm0     ; 2 uops: no micro-fusion
    add     rcx, 16
    jl      .loop               ; 1 macro-fused uop add/branch

.tail:
    ; Here 0 <= rcx <= 15 and 15 - rcx bytes remain.
    cmp     rcx, 15
    jae     .done
.tail_loop:
    ; use movsx for signed char
    movzx   eax, byte [rsi+rcx]
    movzx   r8d, byte [rdx+rcx]
    lea     eax, [rax+r8+1]     ; +1 so the tail rounds up exactly like PAVGB
    shr     eax, 1
    mov     [rdi+rcx], al
    inc     rcx
    cmp     rcx, 15
    jb      .tail_loop
.done:
    ret
要把循环的退出条件写对有点棘手,因为如果接下来的16字节会越过数组末尾,就必须结束向量循环。最好的处理办法可能是先从rcx中减去15左右,再把它加到各个指针上。
同样是每次迭代6个uop / 2个周期,但每次迭代处理16个字节。最好进行展开,使循环体的uop数是4的倍数,这样就不会在循环末尾因为某个周期发出少于4个uop而损失发射吞吐量。每个周期2次加载 / 1次存储是我们的瓶颈,因为PAVGB
每个周期的吞吐量为2。
如果要展开循环,那么递增指针而不是只递增索引就是值得的。(这样就是两次使用 [rdx] 加一条 add,而不是两次使用 [rdx + rcx]。)
无论哪种方式,清理循环设置并将所有内容保存在寄存器中都可以节省大量的指令字节和短数组的开销。