GCC生成SSE指令而不是AVX

时间:2014-02-17 22:36:03

标签: c gcc assembly sse avx

我这样调用GCC:

$ gcc -I/usr/include/SDL2 -D_REENTRANT -Ibuild -I. -S -fverbose-asm -O2 -m64 -mpc64 -mfpmath=both -fipa-pta -ftree-loop-linear -floop-interchange -floop-strip-mine -floop-block -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops -ftree-vectorize -march=core-avx-i -c algo/collision.c -o build/collision.s

其中重要的选项是:

-S                      : output assembly
-ftree-vectorize        : vectorize loops
-march=core-avx-i       : enable "MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2,
                        : AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C
                        : instruction set support."

这是编译成汇编之前的源代码:

#include "collision.h"

int8_t currentField[FIELD_W][FIELD_H];

// Clear and rebuild the field based on the objects with a gravity well.
//
// body: supplies `object_count` and the `stuff` array; each entry's `pos`
//       gives a centre (x, y) and a size (w, h).
//
// Side effects: overwrites the global currentField. Returns nothing.
//
// NOTE(review): no index used on currentField is range-checked in this
// function, so it presumably relies on every object lying entirely inside
// the field -- confirm against the callers.
void buildField (const gravityWell *body) {
    int x, y;
    int w, h, Cx, Cy;
    // Vx/Vy hold the rectangle's extent along each axis:
    // [0] is the low edge, [1] ends up one past the high edge.
    int Vx[2], Vy[2];

    // Clear the field, one row of FIELD_H bytes at a time
    for (x = 0; x < FIELD_W; x++) {
        memset (currentField[x], 0x00, FIELD_H);
    }

    // Rebuild the field
    for (x = 0; x < body->object_count; x++) {
        // Fetch the position and dimensions of the object as ints.
        // NOTE(review): in the assembly listing pos.x / pos.y are converted
        // with vcvttss2si, i.e. truncated toward zero rather than rounded
        // to nearest -- confirm that truncation is what is wanted.
        Cx =    body->stuff[x].pos.x;
        Cy =    body->stuff[x].pos.y;
        w = body->stuff[x].pos.w;
        h = body->stuff[x].pos.h;

        // Calculate the lower left and upper right edges of a
        // rectangle encompassing the object (centre +/- half the size;
        // the integer division discards any odd remainder)
        w = w / 2;
        h = h / 2;
        Vx[0] = Cx - w;
        Vx[1] = Cx + w;
        Vy[0] = Cy - h;
        Vy[1] = Cy + h;

        // Add in the offset for array accesses, so that coordinate 0
        // lands on the middle index of each dimension
        Vx[0] += FIELD_W / 2;
        Vx[1] += FIELD_W / 2;
        Vy[0] += FIELD_H / 2;
        Vy[1] += FIELD_H / 2;

        // Turn the inclusive upper edges into exclusive bounds
        Vx[1]++;
        Vy[1]++;

        // Set the area occupied by the object to ones.
        // Despite its name, `y` walks the first (FIELD_W) index here.
        // Each memset starts at column 0 of row currentField[y] and writes
        // (Vy[1] - Vy[0]) bytes; Vy[0] only contributes to the length.
        // NOTE(review): if the intent is to fill the object's rectangle,
        // the destination probably should start at column Vy[0] -- confirm.
        for (y = Vx[0]; y < Vx[1]; y++) {
            memset (currentField[y], 0x01, (Vy[1] - Vy[0]));
        }
    }

    return;
}

这是生成的汇编代码(GAS语法):

    .file   "collision.c"
# GNU C (Ubuntu/Linaro 4.8.1-10ubuntu9) version 4.8.1 (x86_64-linux-gnu)
#   compiled by GNU C version 4.8.1, GMP version 5.1.2, MPFR version 3.1.1-p2, MPC version 1.0.1
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  -I /usr/include/SDL2 -I build -I .
# -imultiarch x86_64-linux-gnu -D _REENTRANT algo/collision.c -m64 -mpc64
# -mfpmath=both -march=core-avx-i -auxbase-strip build/collision.s -O2
# -fverbose-asm -fipa-pta -floop-interchange -floop-strip-mine -floop-block
# -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops
# -ftree-vectorize -fstack-protector -Wformat -Wformat-security
# options enabled:  -faggressive-loop-optimizations
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fcombine-stack-adjustments -fcommon -fcompare-elim
# -fcprop-registers -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm
# -fearly-inlining -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-lm -fgnu-runtime
# -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion
# -fif-conversion2 -findirect-inlining -finline -finline-atomics
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-profile -fipa-pta -fipa-pure-const -fipa-reference -fipa-sra
# -fira-hoist-pressure -fira-share-save-slots -fira-share-spill-slots
# -fivopts -fkeep-static-consts -fleading-underscore -floop-block
# -floop-interchange -floop-strip-mine -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeephole -fpeephole2 -fprefetch-loop-arrays -free
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fshrink-wrap -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-wide-types -fstack-protector -fstrict-aliasing -fstrict-overflow
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-distribution -ftree-loop-if-convert -ftree-loop-im
# -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops=
# -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc -ftree-scev-cprop
# -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fverbose-asm -fzero-initialized-in-bss
# -m128bit-long-double -m64 -m80387 -maccumulate-outgoing-args -maes
# -malign-stringops -mavx -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mcx16 -mf16c -mfancy-math-387
# -mfp-ret-in-387 -mfsgsbase -mfxsr -mglibc -mieee-fp -mlong-double-80
# -mmmx -mpc64 -mpclmul -mpopcnt -mpush-args -mrdrnd -mred-zone -msahf
# -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mssse3
# -mtls-direct-seg-refs -mvzeroupper -mxsave -mxsaveopt

    .text
    .p2align 4,,15
    .globl  buildField
    .type   buildField, @function
#-----------------------------------------------------------------------
# void buildField (const gravityWell *body)
# Compiler output (GCC 4.8.1, AT&T syntax, x86_64-linux-gnu).
# In:     %rdi = body
# Out:    nothing
# Saved:  %r14, %r13, %r12, %rbp, %rbx (pushed on entry, popped before ret)
# Calls:  memset
# Register roles:
#   %r13  = body
#   %r12d = x, the object index
#   %r14  = byte offset of object x inside body->stuff (128 per object)
#   %rbx  = row cursor while clearing; end-of-rows address while filling
#   %rbp  = fill length passed to memset, Vy[1] - Vy[0]
#   %rcx  = address of the row currently being filled
# Stack:  the five pushes plus the return address total 48 bytes, so
#         %rsp is 16-byte aligned at both memset call sites.
# Vector: the only vector-register instructions are the scalar
#         vmovss / vcvttss2si pairs (the v-prefixed, AVX forms, on %xmm);
#         no packed or %ymm instruction is emitted.
#-----------------------------------------------------------------------
buildField:
.LFB24:
    .cfi_startproc
    pushq   %r14    #
    .cfi_def_cfa_offset 16
    .cfi_offset 14, -16
    pushq   %r13    #
    .cfi_def_cfa_offset 24
    .cfi_offset 13, -24
    movq    %rdi, %r13  # body, body
    pushq   %r12    #
    .cfi_def_cfa_offset 32
    .cfi_offset 12, -32
    pushq   %rbp    #
    .cfi_def_cfa_offset 40
    .cfi_offset 6, -40
    pushq   %rbx    #
    .cfi_def_cfa_offset 48
    .cfi_offset 3, -48
    # Clear loop: %rbx starts at currentField and advances one row
    # (4000 bytes) per iteration until it reaches the end of the array.
    movl    $currentField, %ebx #, ivtmp.26
    .p2align 4,,10
    .p2align 3
.L3:
    # memset (row, 0, 4000)
    xorl    %esi, %esi  #
    movq    %rbx, %rdi  # ivtmp.26,
    movl    $4000, %edx #,
    call    memset  #
    addq    $4000, %rbx #, ivtmp.26
    cmpq    $currentField+16000000, %rbx    #, ivtmp.26
    jne .L3 #,
    # Rebuild loop setup: skip everything when object_count <= 0.
    movl    8(%r13), %eax   # body_11(D)->object_count,
    xorl    %r14d, %r14d    # ivtmp.19
    xorl    %r12d, %r12d    # x
    testl   %eax, %eax  #
    jle .L12    #,
    .p2align 4,,10
    .p2align 3
.L11:
    # %rax = &body->stuff[x]
    movq    %r14, %rax  # ivtmp.19, D.2657
    addq    0(%r13), %rax   # body_11(D)->stuff, D.2657
    # pos.w / pos.h are loaded as 32-bit integers; pos.x / pos.y are
    # loaded as floats and converted to int with truncation.
    movl    96(%rax), %edx  # _16->pos.w, w
    vmovss  88(%rax), %xmm0 # _16->pos.x,
    vmovss  92(%rax), %xmm1 # _16->pos.y,
    movl    100(%rax), %eax # _16->pos.h, h
    vcvttss2si  %xmm0, %esi #, Cx
    # Signed divide by two: add the sign bit, then arithmetic shift
    # right, so the quotient rounds toward zero (%edx = w/2, %eax = h/2).
    movl    %edx, %edi  # w, tmp125
    vcvttss2si  %xmm1, %ecx #, Cy
    shrl    $31, %edi   #, tmp125
    addl    %edi, %edx  # tmp125, tmp127
    movl    %eax, %edi  # h, tmp128
    sarl    %edx    # tmp127
    shrl    $31, %edi   #, tmp128
    movl    %ecx, %r8d  # Cy, D.2655
    addl    %edi, %eax  # tmp128, tmp130
    movl    %esi, %edi  # Cx, D.2655
    sarl    %eax    # tmp130
    # Rectangle bounds, with the 2000 array offset and the +1 folded in:
    #   %esi = Vx[0] = Cx - w/2 + 2000 (initial y)
    #   %eax = Vx[1] = Cx + w/2 + 2001
    #   %r8d = Vy[0] = Cy - h/2 + 2000
    #   %ebp = Vy[1] = Cy + h/2 + 2001
    subl    %edx, %edi  # tmp127, D.2655
    addl    %esi, %edx  # Cx, D.2655
    leal    2001(%rcx,%rax), %ebp   #, D.2655
    subl    %eax, %r8d  # tmp130, D.2655
    leal    2000(%rdi), %esi    #, y
    addl    $2000, %r8d #, D.2655
    leal    2001(%rdx), %eax    #, D.2655
    # Skip the fill loop entirely when y >= Vx[1].
    cmpl    %eax, %esi  # D.2655, y
    jge .L8 #,
    # Fill loop setup:
    #   %rbp = Vy[1] - Vy[0], the memset length
    #   %rcx = &currentField[y], the first row to fill
    #   %rbx = address one row past the last row to fill
    movslq  %esi, %rax  # y, D.2660
    subl    %edi, %edx  # D.2655, D.2654
    subl    %r8d, %ebp  # D.2655, D.2655
    leaq    (%rdx,%rax), %rbx   #, D.2654
    movslq  %ebp, %rbp  # D.2655, D.2661
    imulq   $4000, %rax, %rcx   #, D.2660, D.2660
    imulq   $4000, %rbx, %rbx   #, D.2654, D.2654
    addq    $currentField, %rcx #, ivtmp.12
    addq    $currentField+4000, %rbx    #, D.2654
    .p2align 4,,10
    .p2align 3
.L9:
    # memset (row, 1, Vy[1] - Vy[0]); the destination is the start of
    # the row. The row pointer is recovered from memset's return value
    # in %rax, since %rcx is not preserved across the call.
    movq    %rcx, %rdi  # ivtmp.12,
    movq    %rbp, %rdx  # D.2661,
    movl    $1, %esi    #,
    call    memset  #
    movq    %rax, %rcx  #, ivtmp.12
    addq    $4000, %rcx #, ivtmp.12
    cmpq    %rbx, %rcx  # D.2654, ivtmp.12
    jne .L9 #,
.L8:
    # Next object: x++, and advance the object offset by 128 bytes
    # (written as subtracting -128). object_count is re-read from
    # memory on every iteration.
    addl    $1, %r12d   #, x
    subq    $-128, %r14 #, ivtmp.19
    cmpl    %r12d, 8(%r13)  # x, body_11(D)->object_count
    jg  .L11    #,
.L12:
    # Epilogue: restore the saved registers in reverse push order.
    popq    %rbx    #
    .cfi_def_cfa_offset 40
    popq    %rbp    #
    .cfi_def_cfa_offset 32
    popq    %r12    #
    .cfi_def_cfa_offset 24
    popq    %r13    #
    .cfi_def_cfa_offset 16
    popq    %r14    #
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE24:
    .size   buildField, .-buildField
    # currentField: common symbol, 16000000 bytes, 32-byte aligned
    .comm   currentField,16000000,32
    .ident  "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1"
    .section    .note.GNU-stack,"",@progbits

GCC使用的是SSE指令而不是AVX指令——判断依据是它用的是SSE的128位%xmm寄存器,而不是AVX的256位%ymm寄存器。

为什么会这样?更重要的是,我该如何强制GCC使用AVX而不是SSE?

1 个答案:

答案 0 :(得分:11)

您的代码执行的全部是整数运算,而AVX扩展中没有整数操作。整数操作是在AVX2中才加入的,而您并没有启用AVX2。

在您动手把所有代码改写成使用float、或者去买一块支持AVX2的处理器之前,我应该指出:您看起来使用的是“结构体数组”(array-of-structures)内存布局,这种布局会让许多自动向量化器无能为力,所以即使有整数操作可用,您的代码也未必能够利用AVX。您可以考虑改用“数组结构体”(structure-of-arrays)布局,不过这同样可能是一项侵入性较大的改动。