我对性能测量还很陌生。我看到了这个问题,决定自己动手验证一下。以下是我的基准测试代码:
栈(stack)版本:
; Stack-load micro-benchmark (NASM syntax, x86-64 Linux, raw syscalls).
; Pushes one qword and then reloads it from [rsp] ITERATIONS times.
SYS_exit        equ     60                      ; Linux x86-64 exit syscall number
ITERATIONS      equ     0xFFFFFFFF              ; number of loads performed
TEST_VALUE      equ     0xFFFFFF                ; qword placed on the stack

section .text
global _start

_start:
        mov     r12d, ITERATIONS                ; 32-bit write zero-extends into r12
        push    TEST_VALUE                      ; [rsp] = the qword the loop reloads
mov_loop:
        mov     rax, [rsp]                      ; the load under measurement
        dec     r12
        jnz     mov_loop

        ; Set the exit status explicitly instead of relying on whatever
        ; rdi happened to contain at process entry.
        xor     edi, edi                        ; status = 0
        mov     eax, SYS_exit
        syscall
堆(heap)版本:
; Heap-load micro-benchmark (NASM syntax, x86-64 Linux, raw syscalls).
; Grows the program break by ALLOC_SIZE bytes, stores one qword there and
; reloads it ITERATIONS times, then restores the break and exits.
; Exit status: 0 on success, 1 if the break could not be grown.
SYS_brk         equ     0x0C                    ; Linux x86-64 brk syscall number
SYS_exit        equ     60                      ; Linux x86-64 exit syscall number
ALLOC_SIZE      equ     0x08                    ; bytes requested from brk
ITERATIONS      equ     0xFFFFFFFF              ; number of loads performed
TEST_VALUE      equ     0xFFFFFF                ; qword stored in the allocation

section .text
global _start

_start:
        ; brk(0) is an invalid request, so the kernel answers with the
        ; current program break.
        mov     eax, SYS_brk
        xor     edi, edi
        syscall
        mov     r10, rax                        ; r10 = start of our allocation

        ; Allocate ALLOC_SIZE bytes by moving the break up.
        lea     rdi, [r10 + ALLOC_SIZE]         ; rdi = requested new break
        mov     eax, SYS_brk
        syscall
        cmp     rax, rdi                        ; on failure the old break comes back
        jb      alloc_failed

        ; Store a full qword so the 8-byte load below reads only bytes
        ; this program wrote.
        mov     qword [r10], TEST_VALUE

        ; rcx is loaded after the syscalls because syscall clobbers rcx.
        mov     ecx, ITERATIONS                 ; 32-bit write zero-extends into rcx
heap_loop:
        mov     rax, [r10]                      ; the load under measurement
        dec     rcx
        jnz     heap_loop

        ; Release memory: move the break back to where it started.
        mov     eax, SYS_brk
        mov     rdi, r10
        syscall

        ; rdi still holds the break address here, so clear it to report
        ; success instead of an address-derived status.
        xor     edi, edi                        ; status = 0
        mov     eax, SYS_exit
        syscall

alloc_failed:
        mov     edi, 1                          ; status = 1: brk did not grow
        mov     eax, SYS_exit
        syscall
使用 perf stat -d -r 10 运行基准测试,
结果表明在这两种情况下,我实际测量到的都只是命中 L1 数据缓存的加载(L1-dcache-loads):
4,295,747,868 L1-dcache-loads # 2996.483 M/sec ( +- 0.00% )
48,316 L1-dcache-load-misses # 0.00% of all L1-dcache hits ( +- 18.42% )
有没有办法在每次迭代开始之前使缓存行无效?