
时间:2016-09-14 03:29:47

标签: arrays caching assembly x86 simd




我的问题是:尽管下面这个例子只是对四元素向量做一次计算,并在退出前输出四个数字,我仍然不禁想知道,像这样把数字以相反的顺序放回一个严格来说是数组的内存区域,是否会对现实世界中的缓存未命中产生影响——比如在一个典型的生产级程序中,每秒要执行数十万次 SIMD 向量计算(更具体地说,是把计算结果写回内存)?

以下是完整的代码(linux 64位NASM),其中包含原始注释,这些注释促使我将我的好奇心带到了stackexchange:

; Multiply two 4-float vectors with SSE and print the four products via printf.
; Syntax: NASM (Intel operand order).  Target: Linux x86-64, System V AMD64 ABI.
; Entered directly at _start; printf and fflush are external (libc) symbols.

extern printf
extern fflush

global _start
section .data
outputText:     db '[%f, %f, %f, %f]',10,0

align 16
vec1:    dd 1.0, 2.0, 3.0, 4.0
vec2:    dd 10.0,10.0,10.0,50.0

section .bss
alignb 16              ; result is written with movaps, which faults on a non-16-byte-aligned address
result:  resd 4       ; four 32-bit single-precision floats

section .text
_start:                      ; entry point declared by `global _start` above
    sub rsp,16               ; reserves 16 bytes (never used below); released before exit

    movaps xmm0,[vec1]
    movaps xmm1,[vec2]

    mulps xmm0,xmm1          ; xmm0 = (vec1 * vec2)

    movaps [result],xmm0     ; copy 4 floats back to result[]

    ; printf only accepts 64-bit floats for some dumb reason,
    ; so convert these 32-bit floats packed within the 128-bit xmm0
    ; register into four 64-bit floats, each in a separate xmm* reg
    movss xmm0,[result+12]   ; result[3]
    unpcklps xmm0,xmm0       ; 32-->64 bit
    cvtps2pd xmm3,xmm0       ; put double in 4th XMM

    movss xmm0,[result+8]    ; result[2]
    unpcklps xmm0,xmm0       ; 32-->64 bit
    cvtps2pd xmm2,xmm0       ; put double in 3rd XMM

    movss xmm0,[result+4]    ; result[1]
    unpcklps xmm0,xmm0       ; 32-->64 bit
    cvtps2pd xmm1,xmm0       ; put double in 2nd XMM

    movss xmm0,[result]      ; result[0]
    unpcklps xmm0,xmm0       ; 32-->64 bit
    cvtps2pd xmm0,xmm0       ; put double in 1st XMM

    ; *****************
    ; That was done backwards, going from highest element 
    ; of what is technically an array down to the lowest.
    ; This is because when it was done from lowest to
    ; highest, this garbled bird poop was the answer:
    ; [13510801139695616.000000, 20.000000, 30.000000, 200.000000]
    ; (xmm0 is the scratch register for every conversion, so writing the
    ; 1st argument into xmm0 first gets it overwritten by the later loads.)
    ; HOWEVER, if the correct way is this way, in which
    ; it traipses through an array backwards...
    ; is that not cache-unfriendly?  Or is it too tiny and
    ; minuscule to have any impact with cache misses?

    mov rdi, outputText     ; tells printf where is format string

    mov rax,4               ; tells printf to print 4 XMM regs
    call printf

    mov rdi,0
    call fflush             ; ensure we see printf output b4 exit

    add rsp,16

    mov eax,1            ; syscall id for sys_exit
    mov ebx,0            ; exit with ret of 0 (no error)
    int 80h

1 个答案:

答案 0 :(得分:0)



有关更多链接,请参阅标签 wiki,尤其是 Agner Fog 的《Optimizing Assembly》指南,从中可以学习如何写出不比编译器生成的代码更慢的汇编。该标签 wiki 还包含指向英特尔手册的链接。


printf 只接受 double,这是 C 语言对可变参数函数的参数提升规则所致(float 参数会被提升为 double)。是的,这有点愚蠢,但与 FP->十进制文本转换的开销相比,额外的 float->double 转换微不足道。如果你需要高性能的 FP->字符串转换,或许应该避免使用每次调用都必须解析格式字符串的函数。



  • 这是64位代码,因此请勿使用32位int 0x80 ABI退出。
  • UNPCKLPS 指令毫无意义,因为你只关心低位元素。CVTPS2PD 会生成两个结果,但你是在并行转换两个相同的数字,而不是先转换两个不同的数字、然后再解包。调用带标量参数的函数时,只有 XMM 寄存器中的低位 double 才有意义,所以高位可以留着垃圾数据不管。
  • 存储/重新加载也毫无意义

; Multiply two 4-float vectors with SSE, widen the products to double and
; print them with printf, without any store/reload through memory.
; Syntax: NASM (Intel operand order).  Target: Linux x86-64, System V AMD64 ABI.
; Entered directly at _start; printf and exit are external (libc) symbols.

DEFAULT REL            ; use RIP-relative addressing for [vec1]

extern printf
;extern fflush         ; just call exit(3) instead of manual fflush
extern exit

SYS_EXIT_GROUP equ 231 ; Linux x86-64 syscall number for exit_group

section .rodata        ; read-only data can be part of the text segment
outputText:     db '[%f, %f, %f, %f]',10,0

align 16
vec1:    dd 1.0, 2.0, 3.0, 4.0
vec2:    dd 10.0,10.0,10.0,50.0

section .bss
;; static scratch space is unwise.  Use the stack to reduce cache misses, and for thread safety
; result:  resd 4       ; four 32-bit single-precision floats

section .text
global _start
_start:                      ; entry point declared by `global _start` above
    ;; sub rsp,16            ; What was this for?  We have a red-zone in x86-64 SysV, and this code
    ;;                       ; doesn't use any stack scratch space anyway.

    movaps    xmm2, [vec1]
    ; fold the load into the mulps
    mulps     xmm2, [vec2]   ; (vec1 * vec2)

    ; printf only accepts 64-bit doubles, because it's a C variadic function.
    ; so convert these 32-bit floats packed within the 128-bit xmm2
    ; register into four 64-bit floats, each in a separate xmm* reg.
    ; Only the low double of each arg register matters to printf.

    ; xmm2 = [f0,f1,f2,f3]
    cvtps2pd  xmm0, xmm2     ; xmm0=[d0,d1]
    movaps    xmm1, xmm0
    unpckhpd  xmm1, xmm1     ; xmm1=[d1,d1]

    unpckhpd  xmm2, xmm2     ; xmm2=[f2,f3, f2,f3]

    cvtps2pd  xmm2, xmm2     ; xmm2=[d2,d3]
    movaps    xmm3, xmm2     ; copy [d2,d3] so d3 can be moved into the low half
    unpckhpd  xmm3, xmm3     ; xmm3=[d3,d3]

    lea       rdi, [outputText]   ; RIP-relative (DEFAULT REL): works in both PIE and non-PIE links
    ;mov      edi, outputText     ; non-PIE only: static data is in the low 2G, so a 32-bit absolute address fits

    mov       eax,4               ; tells printf to print 4 XMM regs
    call      printf

    xor       edi, edi
    ;call      fflush              ; flush before _exit()
    call      exit                 ; exit(3) flushes stdio, like if you returned from main().
                                   ; call (not jmp): _start is entered with rsp 16-byte aligned, and a
                                   ; function expects the return address to have been pushed on top of that.

    ; add rsp,16

;; this is how you would exit if you didn't use the libc function.
;; (not reached in normal operation: exit(3) does not return)
    xor       edi, edi
    mov       eax, SYS_EXIT_GROUP  ;  exit_group(0)
    syscall                        ; 64-bit code should use the 64-bit ABI


    ; Alternative to the movaps + unpckhpd pair for getting d1 into the low
    ; half of xmm1: a single movhlps copies the high 64 bits of xmm0 there.
    ; (xmm2 presumably holds the packed float products, as in the full listing.)
    cvtps2pd  xmm0, xmm2     ; xmm0=[d0,d1]

    ; the two-instruction version this replaces:
    ;movaps    xmm1, xmm0
    ;unpckhpd  xmm1, xmm1     ; xmm1=[d1,d1]

    ; optional: zero xmm1 first so movhlps does not wait on its previous contents
    ;xorps     xmm1, xmm1    ; break the false dependency
    movhlps   xmm1, xmm0     ; xmm1=[d1,??]  ; upper half of xmm1 is left unchanged, hence the false dependency on its old value

在 Sandybridge 上,xorps 加 movhlps 的组合会更高效,因为该微架构可以不占用执行单元就处理 xor 清零(xor-zeroing)。IvyBridge 及更高版本以及 AMD CPU 也能以同样的方式消除 MOVAPS:延迟为零,但仍然需要一个 uop,并占用一些前端吞吐量资源。

如果您确实要先存储再重新加载,并单独转换每个浮点数,可以使用 CVTSS2SD:要么让它带内存操作数直接完成加载(cvtss2sd xmm2, [result + 12]),要么在 movss 之后对寄存器使用它。
