为什么C比汇编这么慢?

时间:2019-11-05 19:08:08

标签: c performance assembly

计算受标量积的启发。 C版本:

#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <time.h>


int main() {
    clock_t t0, t1;
    int64_t n = 10000000; // 10 million
    int64_t *m = (int64_t*) malloc ((20*n) * sizeof(int64_t));
    for (int64_t i = 0; i < n; i++) {
        m[i] = rand() % 1000;
    }
    int64_t sum = 0;
    int64_t k = 0;
    t0 = clock();
    for (int64_t i = 0; i < n; i++) {
        sum += m[k]*m[k+1] + m[k+2]*m[k+3] + m[k+4]*m[k+5] + m[k+6]*m[k+7] + m[k+8]*m[k+9] + m[k+10]*m[k+11] + m[k+12]*m[k+13] + m[k+14]*m[k+15] + m[k+16]*m[k+17] + m[k+18]*m[k+19];
        k += 20;
    }
    t1 = clock();
    printf("%" PRId64 "\n", sum);
    float diff = ((float) (t1 - t0) / 1000000.0F ) * 1000;
    printf("%f", (float) diff);
}

常用版本:

主文件:

format PE64 console
entry prog

include "win64ax.inc"


section '.idata' import data readable writeable

library kernel32, 'kernel32.dll', msvcrt, 'msvcrt.dll'

import kernel32, ExitProcess, 'ExitProcess', GetTickCount, 'GetTickCount'
import msvcrt, printf, 'printf'


section '.text' code readable executable

macro now
{
    cinvoke GetTickCount
    cinvoke printf, <"%lld", 13, 10>, rax
}

prog:

mov r14, temp
mov r15, 10000000 ; 10 million
now
lbl:
dec r15

include "temp_code.asm"
add r14, 80

cmp r15, 0
jne lbl
now

end_prog:
    invoke ExitProcess, 0


section '.data' data readable writeable
    align 8
    temp dq 100000200 dup(0)

temp_code.asm:

mov rbx, 0
mov rax, [r14 + 0 * 8]
mov rcx, [r14 + 1 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 2 * 8]
mov rcx, [r14 + 3 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 4 * 8]
mov rcx, [r14 + 5 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 6 * 8]
mov rcx, [r14 + 7 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 8 * 8]
mov rcx, [r14 + 9 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 10 * 8]
mov rcx, [r14 + 11 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 12 * 8]
mov rcx, [r14 + 13 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 14 * 8]
mov rcx, [r14 + 15 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 16 * 8]
mov rcx, [r14 + 17 * 8]
imul rax, rcx
add rbx, rax
mov rax, [r14 + 18 * 8]
mov rcx, [r14 + 19 * 8]
imul rax, rcx
add rbx, rax
mov rax, rbx

fasm的最佳时间是93毫秒,带有编译选项“ gcc.exe -std = c99 -g 1.c -O3 -o 1.exe”的c的最佳时间是710毫秒。这慢了7.63倍。 我听说过数百次手写程序集无法击败c编译器,而现在……这是超级跑车的速度对赛跑者的速度。您的解释?

0 个答案:

没有答案