计算受标量积的启发。 C版本:
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <time.h>
int main() {
clock_t t0, t1;
int64_t n = 10000000; // 10 million
int64_t *m = (int64_t*) malloc ((20*n) * sizeof(int64_t));
for (int64_t i = 0; i < n; i++) {
m[i] = rand() % 1000;
}
int64_t sum = 0;
int64_t k = 0;
t0 = clock();
for (int64_t i = 0; i < n; i++) {
sum += m[k]*m[k+1] + m[k+2]*m[k+3] + m[k+4]*m[k+5] + m[k+6]*m[k+7] + m[k+8]*m[k+9] + m[k+10]*m[k+11] + m[k+12]*m[k+13] + m[k+14]*m[k+15] + m[k+16]*m[k+17] + m[k+18]*m[k+19];
k += 20;
}
t1 = clock();
printf("%" PRId64 "\n", sum);
float diff = ((float) (t1 - t0) / 1000000.0F ) * 1000;
printf("%f", (float) diff);
}
常用版本:
主文件:
format PE64 console
entry prog
include "win64ax.inc"
section '.idata' import data readable writeable
library kernel32, 'kernel32.dll', msvcrt, 'msvcrt.dll'
import kernel32, ExitProcess, 'ExitProcess', GetTickCount, 'GetTickCount'
import msvcrt, printf, 'printf'
section '.text' code readable executable
macro now
{
cinvoke GetTickCount
cinvoke printf, <"%lld", 13, 10>, rax
}
prog:
mov r14, temp
mov r15, 10000000 ; 10 million
now
lbl:
dec r15
include "temp_code.asm"
add r14, 80
cmp r15, 0
jne lbl
now
end_prog:
invoke ExitProcess, 0
section '.data' data readable writeable
align 8
temp dq 100000200 dup(0)
temp_code.asm:
mov rbx, 0
mov rax, [r14 + 0 * 8]
mov rcx, [r14 + 1 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 2 * 8]
mov rcx, [r14 + 3 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 4 * 8]
mov rcx, [r14 + 5 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 6 * 8]
mov rcx, [r14 + 7 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 8 * 8]
mov rcx, [r14 + 9 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 10 * 8]
mov rcx, [r14 + 11 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 12 * 8]
mov rcx, [r14 + 13 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 14 * 8]
mov rcx, [r14 + 15 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 16 * 8]
mov rcx, [r14 + 17 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 18 * 8]
mov rcx, [r14 + 19 * 8]
imul rax, rcx
add rbx, rax
mov rax, rbx
fasm的最佳时间是93毫秒,带有编译选项“ gcc.exe -std = c99 -g 1.c -O3 -o 1.exe”的c的最佳时间是710毫秒。这慢了7.63倍。 我听说过数百次手写程序集无法击败c编译器,而现在……这是超级跑车的速度对赛跑者的速度。您的解释?