为什么GCC交换rax和xmm0寄存器?

时间:2015-11-16 21:37:33

标签: c gcc assembly compilation cpu-registers

我正在检查由 gcc 5.2.1 20151010(Ubuntu 5.2.1-22ubuntu2)生成的汇编代码,发现其中生成了以下指令:

movq    %xmm0, %rax
movq    %rax, %xmm0

我想知道这些指令的目的是什么——它们看起来什么也没做。这是某种优化吗?就像我们用下面的写法把寄存器清零一样:

xor ax, ax

需要说明的是,这段代码是在使用选项 -mtune=native、且我的 CPU 为 Intel Core i5 4200U 时出现的。

以下是我的源代码:

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "print.h"

/*
 * Element-wise product of two unsigned arrays.
 *
 * For every index i in [0, array_size), stores array1[i] * array2[i]
 * into array3[i]. The multiplication is done in unsigned int, so the
 * result wraps modulo 2^(width of unsigned int).
 *
 * array1, array2: input arrays, each with at least array_size elements.
 * array3:         output array with room for at least array_size elements.
 * array_size:     number of elements to process; 0 leaves array3 untouched.
 */
void multiply(const unsigned int* array1, const unsigned int* array2, unsigned int* array3, const unsigned int array_size)
{
    /* The index is zeroed here and again in the for-initializer below. */
    unsigned int i = 0;

    for (i = 0; i < array_size; i++)
    {
        array3[i] = array1[i] * array2[i];
    }
}

/*
 * Benchmark driver: fills two arrays of 1024*1024 unsigned ints with
 * rand() values, calls multiply() twice on them, and prints the elapsed
 * clock() time in seconds.
 */
int main()
{   
    const unsigned int array_size = 1024*1024;

    /* NOTE(review): the malloc results are used without a NULL check
       and are never passed to free() in this function. */
    unsigned int* array1 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
    unsigned int* array2 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
    unsigned int* array3 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);

    /* NOTE(review): i is a signed int but is compared against the
       unsigned array_size in the loop below. */
    int i = 0;

    /* Seed the generator from the current time. */
    srand(time(NULL));

    for (i = 0; i < array_size; i++)
    {
        array1[i] = rand();
        array2[i] = rand();
    }

    /* Timed region: two identical multiply() calls. */
    clock_t t0 = clock();

    multiply(array1,array2,array3, array_size);
    multiply(array1,array2,array3, array_size);

    clock_t t1 = clock();

    /* Convert the tick difference to seconds and print it. */
    printf("\nTempo: %f\n", ((double)(t1 - t0)) / CLOCKS_PER_SEC);
}

这是 GCC 使用以下命令生成的汇编代码:gcc -S -mtune=native Main.c

# Compiler-generated listing, x86-64, AT&T (GAS) syntax.
.file   "Main.c"
.text
.globl  multiply
.type   multiply, @function
# multiply: for each index below the count passed in %ecx, stores the
# 32-bit product of the elements read through %rdi and %rsi into the
# array addressed by %rdx.
# In:    %rdi = first source array, %rsi = second source array,
#        %rdx = destination array, %ecx = element count
# Frame: -24(%rbp) = %rdi, -32(%rbp) = %rsi, -40(%rbp) = %rdx,
#        -44(%rbp) = %ecx, -4(%rbp) = loop index
# %rsp is not adjusted for these slots; they sit below %rbp, which equals
# %rsp after the prologue (presumably relying on the SysV red zone).
multiply:
.LFB2:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    # Spill all four register arguments to the frame.
    movq    %rdi, -24(%rbp)
    movq    %rsi, -32(%rbp)
    movq    %rdx, -40(%rbp)
    movl    %ecx, -44(%rbp)
    # The loop index is stored as zero twice in a row.
    movl    $0, -4(%rbp)
    movl    $0, -4(%rbp)
    # Enter the loop at its condition check.
    jmp .L2
.L3:
    # rdx = destination base + index * 4
    movl    -4(%rbp), %eax
    leaq    0(,%rax,4), %rdx
    movq    -40(%rbp), %rax
    addq    %rax, %rdx
    # ecx = element at first source base + index * 4
    movl    -4(%rbp), %eax
    leaq    0(,%rax,4), %rcx
    movq    -24(%rbp), %rax
    addq    %rcx, %rax
    movl    (%rax), %ecx
    # eax = element at second source base + index * 4
    movl    -4(%rbp), %eax
    leaq    0(,%rax,4), %rsi
    movq    -32(%rbp), %rax
    addq    %rsi, %rax
    movl    (%rax), %eax
    # Multiply the two elements (32-bit) and store at the destination slot.
    imull   %ecx, %eax
    movl    %eax, (%rdx)
    # index += 1
    addl    $1, -4(%rbp)
.L2:
    # Loop while index < count, compared as unsigned (jb).
    movl    -4(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jb  .L3
    nop
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE2:
    .size   multiply, .-multiply
    # Read-only format string whose address is passed to printf in main.
    .section    .rodata
.LC1:
    .string "\nTempo: %f\n"
    .text
    .globl  main
    .type   main, @function
# main: allocates three buffers with malloc, fills two of them with rand()
# results, calls multiply twice between two clock() calls, and passes the
# clock difference divided by the .LC0 constant to printf as a double.
# Frame: -60(%rbp) = element count (1048576)
#        -56/-48/-40(%rbp) = pointers returned by the three malloc calls
#        -64(%rbp) = loop index
#        -32(%rbp) = first clock() result, -24(%rbp) = second clock() result
# %rbx is pushed in the prologue and popped in the epilogue; it holds a
# store address across each call to rand.
main:
.LFB3:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %rbx
    subq    $56, %rsp
    .cfi_offset 3, -24
    movl    $1048576, -60(%rbp)
    # First allocation: malloc(count << 2), i.e. count * 4 bytes.
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -56(%rbp)
    # Second allocation, same size.
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -48(%rbp)
    # Third allocation, same size.
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -40(%rbp)
    movl    $0, -64(%rbp)
    # srand(time(0)): the low 32 bits of time's result become the seed.
    movl    $0, %edi
    call    time
    movl    %eax, %edi
    call    srand
    # Fill loop: index = 0, then jump to the condition check.
    movl    $0, -64(%rbp)
    jmp .L5
.L6:
    # rbx = first buffer + index * 4 (index sign-extended by cltq).
    movl    -64(%rbp), %eax
    cltq
    leaq    0(,%rax,4), %rdx
    movq    -56(%rbp), %rax
    leaq    (%rdx,%rax), %rbx
    call    rand
    movl    %eax, (%rbx)
    # rbx = second buffer + index * 4.
    movl    -64(%rbp), %eax
    cltq
    leaq    0(,%rax,4), %rdx
    movq    -48(%rbp), %rax
    leaq    (%rdx,%rax), %rbx
    call    rand
    movl    %eax, (%rbx)
    addl    $1, -64(%rbp)
.L5:
    # Loop while index < count, compared as unsigned (jb).
    movl    -64(%rbp), %eax
    cmpl    -60(%rbp), %eax
    jb  .L6
    # First timestamp.
    call    clock
    movq    %rax, -32(%rbp)
    # multiply(first buffer, second buffer, third buffer, count)
    movl    -60(%rbp), %ecx
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    multiply
    # Same call again with identical arguments.
    movl    -60(%rbp), %ecx
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    multiply
    # Second timestamp.
    call    clock
    movq    %rax, -24(%rbp)
    # rax = second timestamp - first timestamp
    movq    -24(%rbp), %rax
    subq    -32(%rbp), %rax
    # Zero xmm0, convert the 64-bit difference to double, divide by .LC0.
    pxor    %xmm0, %xmm0
    cvtsi2sdq   %rax, %xmm0
    movsd   .LC0(%rip), %xmm1
    divsd   %xmm1, %xmm0
    # Round trip: the low 64 bits of xmm0 are copied to rax and straight
    # back into xmm0, so the double in xmm0 is unchanged. rax is
    # overwritten by the movl $1, %eax below before printf is called.
    movq    %xmm0, %rax
    movq    %rax, %xmm0
    # printf(.LC1, xmm0); eax = 1 before the call (one vector-register
    # argument for the variadic callee).
    movl    $.LC1, %edi
    movl    $1, %eax
    call    printf
    # Return value 0.
    movl    $0, %eax
    addq    $56, %rsp
    popq    %rbx
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE3:
    .size   main, .-main
    .section    .rodata
    .align 8
# 8-byte divisor loaded by movsd in main: low dword 0, high dword
# 1093567616 (0x412E8480), which is the double 1000000.0.
.LC0:
    .long   0
    .long   1093567616
    .ident  "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
    .section    .note.GNU-stack,"",@progbits

这是使用 gcc -S Main.c 生成的汇编代码:

# Compiler-generated listing, x86-64, AT&T (GAS) syntax.
.file   "Main.c"
.text
.globl  multiply
.type   multiply, @function
# multiply: for each index below the count passed in %ecx, stores the
# 32-bit product of the elements read through %rdi and %rsi into the
# array addressed by %rdx.
# In:    %rdi = first source array, %rsi = second source array,
#        %rdx = destination array, %ecx = element count
# Frame: -24(%rbp) = %rdi, -32(%rbp) = %rsi, -40(%rbp) = %rdx,
#        -44(%rbp) = %ecx, -4(%rbp) = loop index
# %rsp is not adjusted for these slots; they sit below %rbp, which equals
# %rsp after the prologue (presumably relying on the SysV red zone).
multiply:
.LFB2:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    # Spill all four register arguments to the frame.
    movq    %rdi, -24(%rbp)
    movq    %rsi, -32(%rbp)
    movq    %rdx, -40(%rbp)
    movl    %ecx, -44(%rbp)
    # The loop index is stored as zero twice in a row.
    movl    $0, -4(%rbp)
    movl    $0, -4(%rbp)
    # Enter the loop at its condition check.
    jmp .L2
.L3:
    # rdx = destination base + index * 4
    movl    -4(%rbp), %eax
    leaq    0(,%rax,4), %rdx
    movq    -40(%rbp), %rax
    addq    %rax, %rdx
    # ecx = element at first source base + index * 4
    movl    -4(%rbp), %eax
    leaq    0(,%rax,4), %rcx
    movq    -24(%rbp), %rax
    addq    %rcx, %rax
    movl    (%rax), %ecx
    # eax = element at second source base + index * 4
    movl    -4(%rbp), %eax
    leaq    0(,%rax,4), %rsi
    movq    -32(%rbp), %rax
    addq    %rsi, %rax
    movl    (%rax), %eax
    # Multiply the two elements (32-bit) and store at the destination slot.
    imull   %ecx, %eax
    movl    %eax, (%rdx)
    # index += 1
    addl    $1, -4(%rbp)
.L2:
    # Loop while index < count, compared as unsigned (jb).
    movl    -4(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jb  .L3
    nop
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE2:
    .size   multiply, .-multiply
    # Read-only format string whose address is passed to printf in main.
    .section    .rodata
.LC1:
    .string "\nTempo: %f\n"
    .text
    .globl  main
    .type   main, @function
# main: allocates three buffers with malloc, fills two of them with rand()
# results, calls multiply twice between two clock() calls, and passes the
# clock difference divided by the .LC0 constant to printf as a double.
# Frame: -60(%rbp) = element count (1048576)
#        -56/-48/-40(%rbp) = pointers returned by the three malloc calls
#        -64(%rbp) = loop index
#        -32(%rbp) = first clock() result, -24(%rbp) = second clock() result
# %rbx is pushed in the prologue and popped in the epilogue; it holds a
# store address across each call to rand.
main:
.LFB3:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %rbx
    subq    $56, %rsp
    .cfi_offset 3, -24
    movl    $1048576, -60(%rbp)
    # First allocation: malloc(count << 2), i.e. count * 4 bytes.
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -56(%rbp)
    # Second allocation, same size.
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -48(%rbp)
    # Third allocation, same size.
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -40(%rbp)
    movl    $0, -64(%rbp)
    # srand(time(0)): the low 32 bits of time's result become the seed.
    movl    $0, %edi
    call    time
    movl    %eax, %edi
    call    srand
    # Fill loop: index = 0, then jump to the condition check.
    movl    $0, -64(%rbp)
    jmp .L5
.L6:
    # rbx = first buffer + index * 4 (index sign-extended by cltq).
    movl    -64(%rbp), %eax
    cltq
    leaq    0(,%rax,4), %rdx
    movq    -56(%rbp), %rax
    leaq    (%rdx,%rax), %rbx
    call    rand
    movl    %eax, (%rbx)
    # rbx = second buffer + index * 4.
    movl    -64(%rbp), %eax
    cltq
    leaq    0(,%rax,4), %rdx
    movq    -48(%rbp), %rax
    leaq    (%rdx,%rax), %rbx
    call    rand
    movl    %eax, (%rbx)
    addl    $1, -64(%rbp)
.L5:
    # Loop while index < count, compared as unsigned (jb).
    movl    -64(%rbp), %eax
    cmpl    -60(%rbp), %eax
    jb  .L6
    # First timestamp.
    call    clock
    movq    %rax, -32(%rbp)
    # multiply(first buffer, second buffer, third buffer, count)
    movl    -60(%rbp), %ecx
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    multiply
    # Same call again with identical arguments.
    movl    -60(%rbp), %ecx
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    multiply
    # Second timestamp.
    call    clock
    movq    %rax, -24(%rbp)
    # rax = second timestamp - first timestamp
    movq    -24(%rbp), %rax
    subq    -32(%rbp), %rax
    # Zero xmm0, convert the 64-bit difference to double, divide by .LC0.
    pxor    %xmm0, %xmm0
    cvtsi2sdq   %rax, %xmm0
    movsd   .LC0(%rip), %xmm1
    divsd   %xmm1, %xmm0
    # printf(.LC1, xmm0); eax = 1 before the call (one vector-register
    # argument for the variadic callee).
    movl    $.LC1, %edi
    movl    $1, %eax
    call    printf
    # Return value 0.
    movl    $0, %eax
    addq    $56, %rsp
    popq    %rbx
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE3:
    .size   main, .-main
    .section    .rodata
    .align 8
# 8-byte divisor loaded by movsd in main: low dword 0, high dword
# 1093567616 (0x412E8480), which is the double 1000000.0.
.LC0:
    .long   0
    .long   1093567616
    .ident  "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
    .section    .note.GNU-stack,"",@progbits

差异位于 .L5 标签之后那段代码的末尾:使用 -mtune=native 的版本在调用 printf 之前多出了两条 movq 指令。

0 个答案:

没有答案