我是这样调用 GCC 的:
$ gcc -I/usr/include/SDL2 -D_REENTRANT -Ibuild -I. -S -fverbose-asm -O2 -m64 -mpc64 -mfpmath=both -fipa-pta -ftree-loop-linear -floop-interchange -floop-strip-mine -floop-block -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops -ftree-vectorize -march=core-avx-i -c algo/collision.c -o build/collision.s
其中重要的选项是:
-S : output assembly
-ftree-vectorize : vectorize loops
-march=core-avx-i : enable "MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2,
: AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C
: instruction set support."
这是编译成汇编之前的 C 源代码:
#include "collision.h"
// Occupancy grid written by buildField(): every byte is reset to 0x00 and
// bytes covered by an object are then set to 0x01.  The first index is the
// one offset by FIELD_W / 2, the second the one offset by FIELD_H / 2.
// FIELD_W and FIELD_H are not defined in this file (presumably they come
// from collision.h -- confirm).
int8_t currentField[FIELD_W][FIELD_H];
// Clear currentField, then mark the cells covered by each object that
// belongs to the given gravity well.
//
// body: source of the objects.  This function reads body->object_count and,
//       for every index below it, body->stuff[i].pos.{x, y, w, h}.
//
// Object coordinates are shifted by FIELD_W / 2 and FIELD_H / 2 before being
// used as array indices, so position (0, 0) maps to the middle of the field.
//
// NOTE(review): the rectangle is never clamped to the array bounds, so an
// object that reaches past the edge of the field indexes currentField out of
// range -- confirm that callers keep every object inside the field.
void buildField (const gravityWell *body) {
// x indexes the objects (and the rows while clearing); y is the row index
// used while filling -- note that it walks the Vx range, not the Vy range.
int x, y;
int w, h, Cx, Cy;
// [0] is the lower bound, [1] the (exclusive, after the ++ below) upper bound.
int Vx[2], Vy[2];
// Clear the field, one row of FIELD_H bytes at a time
for (x = 0; x < FIELD_W; x++) {
memset (currentField[x], 0x00, FIELD_H);
}
// Rebuild the field
for (x = 0; x < body->object_count; x++) {
// Fetch the centre position and dimensions of the object as ints.
// The conversion is a plain assignment, so a floating-point member is
// truncated toward zero rather than rounded to nearest.
Cx = body->stuff[x].pos.x;
Cy = body->stuff[x].pos.y;
w = body->stuff[x].pos.w;
h = body->stuff[x].pos.h;
// Calculate the lower left and upper right edges of a
// rectangle encompassing the object: centre -/+ half the size
// (integer division, so an odd size loses its last unit)
w = w / 2;
h = h / 2;
Vx[0] = Cx - w;
Vx[1] = Cx + w;
Vy[0] = Cy - h;
Vy[1] = Cy + h;
// Add in the offset for array accesses
Vx[0] += FIELD_W / 2;
Vx[1] += FIELD_W / 2;
Vy[0] += FIELD_H / 2;
Vy[1] += FIELD_H / 2;
// Make the upper bounds exclusive so the far edge itself is included
Vx[1]++;
Vy[1]++;
// Set the area occupied by the object to ones, one row per iteration.
// NOTE(review): each memset starts at the beginning of row y
// (currentField[y][0]) and only its length depends on Vy, so the ones land
// in columns 0 .. Vy[1]-Vy[0]-1 rather than Vy[0] .. Vy[1]-1; presumably
// &currentField[y][Vy[0]] was intended -- confirm.
// NOTE(review): if pos.h is negative, Vy[1] - Vy[0] is negative and becomes
// a very large size_t when passed to memset -- confirm h is never negative.
for (y = Vx[0]; y < Vx[1]; y++) {
memset (currentField[y], 0x01, (Vy[1] - Vy[0]));
}
}
return;
}
这是生成的汇编代码(GAS 语法):
.file "collision.c"
# GNU C (Ubuntu/Linaro 4.8.1-10ubuntu9) version 4.8.1 (x86_64-linux-gnu)
# compiled by GNU C version 4.8.1, GMP version 5.1.2, MPFR version 3.1.1-p2, MPC version 1.0.1
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed: -I /usr/include/SDL2 -I build -I .
# -imultiarch x86_64-linux-gnu -D _REENTRANT algo/collision.c -m64 -mpc64
# -mfpmath=both -march=core-avx-i -auxbase-strip build/collision.s -O2
# -fverbose-asm -fipa-pta -floop-interchange -floop-strip-mine -floop-block
# -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops
# -ftree-vectorize -fstack-protector -Wformat -Wformat-security
# options enabled: -faggressive-loop-optimizations
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fcombine-stack-adjustments -fcommon -fcompare-elim
# -fcprop-registers -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm
# -fearly-inlining -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-lm -fgnu-runtime
# -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion
# -fif-conversion2 -findirect-inlining -finline -finline-atomics
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-profile -fipa-pta -fipa-pure-const -fipa-reference -fipa-sra
# -fira-hoist-pressure -fira-share-save-slots -fira-share-spill-slots
# -fivopts -fkeep-static-consts -fleading-underscore -floop-block
# -floop-interchange -floop-strip-mine -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeephole -fpeephole2 -fprefetch-loop-arrays -free
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fshrink-wrap -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-wide-types -fstack-protector -fstrict-aliasing -fstrict-overflow
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-distribution -ftree-loop-if-convert -ftree-loop-im
# -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops=
# -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc -ftree-scev-cprop
# -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fverbose-asm -fzero-initialized-in-bss
# -m128bit-long-double -m64 -m80387 -maccumulate-outgoing-args -maes
# -malign-stringops -mavx -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mcx16 -mf16c -mfancy-math-387
# -mfp-ret-in-387 -mfsgsbase -mfxsr -mglibc -mieee-fp -mlong-double-80
# -mmmx -mpc64 -mpclmul -mpopcnt -mpush-args -mrdrnd -mred-zone -msahf
# -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mssse3
# -mtls-direct-seg-refs -mvzeroupper -mxsave -mxsaveopt
# ----------------------------------------------------------------------
# void buildField(const gravityWell *body)
# Compiler output (GCC 4.8.1, x86-64, AT&T syntax: op src, dst).
# ABI:  System V AMD64 -- body arrives in %rdi; no return value.
# Register roles after the prologue:
#   %r13  = body
#   %r12d = x, the object index
#   %r14  = byte offset of stuff[x] inside the stuff array (+128 per object)
#   %rbx  = row cursor while clearing; end-of-rows pointer while filling
#   %rbp  = byte count handed to memset while filling
# The only instructions touching SIMD registers are the scalar, VEX-encoded
# vmovss and vcvttss2si; there are no packed (vector) operations here.
# ----------------------------------------------------------------------
.text
.p2align 4,,15
.globl buildField
.type buildField, @function
buildField:
.LFB24:
.cfi_startproc
# Prologue: save the five callee-saved registers used below.  Together with
# the return address that makes 48 bytes, so %rsp is 16-byte aligned at both
# calls to memset.
pushq %r14 #
.cfi_def_cfa_offset 16
.cfi_offset 14, -16
pushq %r13 #
.cfi_def_cfa_offset 24
.cfi_offset 13, -24
# Keep body in a callee-saved register so it survives the memset calls.
movq %rdi, %r13 # body, body
pushq %r12 #
.cfi_def_cfa_offset 32
.cfi_offset 12, -32
pushq %rbp #
.cfi_def_cfa_offset 40
.cfi_offset 6, -40
pushq %rbx #
.cfi_def_cfa_offset 48
.cfi_offset 3, -48
# Row cursor = &currentField[0], loaded as a 32-bit absolute address
# (non-position-independent code); the write to %ebx zero-extends into %rbx.
movl $currentField, %ebx #, ivtmp.26
.p2align 4,,10
.p2align 3
# Clear loop: memset(row, 0, 4000) for each 4000-byte row, until the cursor
# reaches the end of the 16000000-byte array.
.L3:
xorl %esi, %esi #
movq %rbx, %rdi # ivtmp.26,
movl $4000, %edx #,
call memset #
addq $4000, %rbx #, ivtmp.26
cmpq $currentField+16000000, %rbx #, ivtmp.26
jne .L3 #,
# Object loop setup: x = 0, offset = 0; skip everything when
# body->object_count <= 0.
movl 8(%r13), %eax # body_11(D)->object_count,
xorl %r14d, %r14d # ivtmp.19
xorl %r12d, %r12d # x
testl %eax, %eax #
jle .L12 #,
.p2align 4,,10
.p2align 3
# Per-object body.  %rax = body->stuff + offset, i.e. &body->stuff[x].
.L11:
movq %r14, %rax # ivtmp.19, D.2657
addq 0(%r13), %rax # body_11(D)->stuff, D.2657
# Load pos.w / pos.h as 32-bit integers and pos.x / pos.y as single-precision
# floats; vcvttss2si converts the floats to int with truncation (Cx, Cy).
movl 96(%rax), %edx # _16->pos.w, w
vmovss 88(%rax), %xmm0 # _16->pos.x,
vmovss 92(%rax), %xmm1 # _16->pos.y,
movl 100(%rax), %eax # _16->pos.h, h
vcvttss2si %xmm0, %esi #, Cx
movl %edx, %edi # w, tmp125
vcvttss2si %xmm1, %ecx #, Cy
# Signed w / 2 and h / 2: add the sign bit (shrl $31) before the arithmetic
# shift so the result rounds toward zero, as C division requires.  The two
# computations are interleaved by the scheduler.
shrl $31, %edi #, tmp125
addl %edi, %edx # tmp125, tmp127
movl %eax, %edi # h, tmp128
sarl %edx # tmp127
shrl $31, %edi #, tmp128
movl %ecx, %r8d # Cy, D.2655
addl %edi, %eax # tmp128, tmp130
movl %esi, %edi # Cx, D.2655
sarl %eax # tmp130
# Rectangle bounds, with the 2000 array offset and the +1 on the upper
# bounds folded into the constants:
#   %edi = Cx - w/2          %edx = Cx + w/2
#   %esi = Cx - w/2 + 2000   (Vx[0], first row)
#   %eax = Cx + w/2 + 2001   (Vx[1], one past the last row)
#   %r8d = Cy - h/2 + 2000   (Vy[0])
#   %ebp = Cy + h/2 + 2001   (Vy[1])
subl %edx, %edi # tmp127, D.2655
addl %esi, %edx # Cx, D.2655
leal 2001(%rcx,%rax), %ebp #, D.2655
subl %eax, %r8d # tmp130, D.2655
leal 2000(%rdi), %esi #, y
addl $2000, %r8d #, D.2655
leal 2001(%rdx), %eax #, D.2655
# Skip the fill entirely when the row range is empty (Vx[0] >= Vx[1]).
cmpl %eax, %esi # D.2655, y
jge .L8 #,
# Fill loop setup:
#   %rbp = Vy[1] - Vy[0], sign-extended to 64 bits (the memset length)
#   %rcx = &currentField[Vx[0]]   (first row; no Vy[0] column offset is added)
#   %rbx = &currentField[Vx[1]]   (end pointer: last row index is
#          Vx[0] + (Vx[1] - 1 - Vx[0]), then one more row of 4000 bytes)
movslq %esi, %rax # y, D.2660
subl %edi, %edx # D.2655, D.2654
subl %r8d, %ebp # D.2655, D.2655
leaq (%rdx,%rax), %rbx #, D.2654
movslq %ebp, %rbp # D.2655, D.2661
imulq $4000, %rax, %rcx #, D.2660, D.2660
imulq $4000, %rbx, %rbx #, D.2654, D.2654
addq $currentField, %rcx #, ivtmp.12
addq $currentField+4000, %rbx #, D.2654
.p2align 4,,10
.p2align 3
# Fill loop: memset(row, 1, length) for each row in [Vx[0], Vx[1]).
# %rcx is caller-saved, so the row pointer is recovered from memset's return
# value (its destination argument) in %rax instead of being kept across the
# call.
.L9:
movq %rcx, %rdi # ivtmp.12,
movq %rbp, %rdx # D.2661,
movl $1, %esi #,
call memset #
movq %rax, %rcx #, ivtmp.12
addq $4000, %rcx #, ivtmp.12
cmpq %rbx, %rcx # D.2654, ivtmp.12
jne .L9 #,
# Next object: x++, offset += 128 (written as subtracting -128, which fits a
# sign-extended 8-bit immediate whereas +128 does not).  object_count is
# re-read from memory on every iteration.
.L8:
addl $1, %r12d #, x
subq $-128, %r14 #, ivtmp.19
cmpl %r12d, 8(%r13) # x, body_11(D)->object_count
jg .L11 #,
# Epilogue: restore the callee-saved registers in reverse push order.
.L12:
popq %rbx #
.cfi_def_cfa_offset 40
popq %rbp #
.cfi_def_cfa_offset 32
popq %r12 #
.cfi_def_cfa_offset 24
popq %r13 #
.cfi_def_cfa_offset 16
popq %r14 #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size buildField, .-buildField
# currentField: common symbol, 16000000 bytes (4000 rows of 4000 bytes),
# 32-byte aligned.
.comm currentField,16000000,32
.ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1"
# Empty .note.GNU-stack section: marks the object as not needing an
# executable stack.
.section .note.GNU-stack,"",@progbits
GCC 使用了 SSE 指令而不是 AVX 指令;具体来说,它使用的是 SSE 的 128 位 %xmm 寄存器,而不是 AVX 的 256 位 %ymm 寄存器。
为什么会这样?更重要的是,我怎样才能强制 gcc 使用 AVX 而不是 SSE?
答案 0 :(得分:11)
您的代码执行的全部是整数运算,而 AVX 扩展本身并不提供 256 位的整数向量运算;这些运算是在 AVX2 中才加入的,而您并没有启用 AVX2。
在您把所有代码改写成使用 float,或者去购买支持 AVX2 的处理器之前,我要指出:您使用的看起来是“结构体数组”(array-of-structures)内存布局,这种布局会让许多自动向量化器无能为力,所以即使有可用的整数向量指令,您的代码也未必能用上 AVX。您可以考虑改用“数组结构体”(structure-of-arrays)布局,不过这可能是一项侵入性比较大的改动。