通常我会让编译器做优化复杂逻辑表达式的神奇之处,但是,在这种情况下我必须使用的编译器并不是很擅长(基本上所有它可以做的就是用bit替换像/ 64这样的东西-shifts和%512 with bitwise-and。
是否有任何可用的工具可以分析并提供表达式的优化版本(即优化编译器的优化方式相同)?
e.g。我想优化以下内容:
int w = 2 - z/2;
int y0 = y + (((v % 512) / 64) / 4) * 8 + ((v / 512) / mb)*16;
int x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;
int i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
答案 0 :(得分:1)
这是一个测试:
typedef int MyInt; // or unsigned int
MyInt get(MyInt x, MyInt y, MyInt z, MyInt v, MyInt mb)
{
MyInt w = 2 - z/2;
MyInt y0 = y + (((v % 512) / 64) / 4) * 8 + ((v / 512) / mb)*16;
MyInt x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;
MyInt i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
return i;
}
我使用-O3
编译了GCC 4.7.0。
使用int
:
.LFB0:
movl %ecx, %eax
movq %r12, -24(%rsp)
.LCFI0:
movl %edx, %r12d
sarl $31, %eax
shrl $31, %r12d
movq %r13, -16(%rsp)
shrl $23, %eax
addl %edx, %r12d
movq %rbx, -40(%rsp)
leal (%rcx,%rax), %r9d
movl %r12d, %r11d
movq %r14, -8(%rsp)
sarl %r11d
movq %rbp, -32(%rsp)
.LCFI1:
movl %edx, %ebp
andl $511, %r9d
negl %r11d
subl %eax, %r9d
leal 511(%rcx), %eax
testl %ecx, %ecx
leal 2(%r11), %r13d
leal 63(%r9), %ebx
cmovns %ecx, %eax
sarl $9, %eax
movl %r13d, %r14d
xorl $3, %r14d
movl %eax, %edx
testl %r9d, %r9d
cmovns %r9d, %ebx
sarl $31, %edx
addl $1, %r11d
idivl %r8d
movl %ebx, %r10d
sarl $31, %ebx
shrl $30, %ebx
sarl $6, %r10d
addl %ebx, %r10d
andl $3, %r10d
subl %ebx, %r10d
movq -40(%rsp), %rbx
sall $3, %r10d
sall $3, %edx
imull %r11d, %r10d
imull %r13d, %edx
movq -16(%rsp), %r13
addl %edi, %r10d
addl %edx, %r10d
leal 255(%r9), %edx
imull %r10d, %r14d
testl %r9d, %r9d
cmovs %edx, %r9d
sall $4, %eax
sarl %r12d
sarl $8, %r9d
leal (%rsi,%r9,8), %ecx
addl %eax, %ecx
leal -3(%rbp,%rbp), %eax
movq -32(%rsp), %rbp
imull %r8d, %ecx
imull %r12d, %eax
movq -24(%rsp), %r12
sall $4, %ecx
addl %r14d, %ecx
movq -8(%rsp), %r14
leal (%rax,%rcx,2), %eax
ret
使用unsigned int
:
.LFB0:
movl %ecx, %eax
movq %rbp, -16(%rsp)
movl %edx, %r11d
.LCFI0:
movl %edx, %ebp
shrl $9, %eax
xorl %edx, %edx
divl %r8d
movq %r12, -8(%rsp)
.LCFI1:
movl %ecx, %r12d
shrl %r11d
andl $511, %r12d
movq %rbx, -24(%rsp)
.LCFI2:
movl $2, %r10d
movl %r12d, %r9d
movl $1, %ebx
subl %r11d, %r10d
shrl $6, %r9d
subl %r11d, %ebx
shrl $8, %r12d
andl $3, %r9d
sall $4, %r8d
imull %ebx, %r9d
leal (%r12,%rax,2), %eax
movq -24(%rsp), %rbx
imull %r10d, %edx
xorl $3, %r10d
movq -8(%rsp), %r12
leal (%rsi,%rax,8), %eax
addl %edx, %r9d
leal (%rdi,%r9,8), %edi
imull %eax, %r8d
leal -3(%rbp,%rbp), %eax
movq -16(%rsp), %rbp
imull %r10d, %edi
imull %r11d, %eax
addl %edi, %r8d
leal (%rax,%r8,2), %eax
ret
通过手动折叠常数进一步“优化”(可预测)没有进一步的效果。
答案 1 :(得分:1)
当我想要优化时,我倾向于检查Clang生成的LLVM IR。它比纯组装更具可读性(我发现)。
int foo(int v, int mb, int x, int y, int z) {
int w = 2 - z/2;
// When you have specific constraints, tell the optimizer about it !
if (w < 0 || w > 2) { return 0; }
int y0 = y + (((v % 512) / 64) / 4) * 8 + ((v / 512) / mb)*16;
int x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;
int i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
return i;
}
转化为:
define i32 @foo(i32 %v, i32 %mb, i32 %x, i32 %y, i32 %z) nounwind uwtable readnone {
%1 = sdiv i32 %z, 2
%2 = sub nsw i32 2, %1
%3 = icmp slt i32 %2, 0
%4 = icmp slt i32 %z, -1
%or.cond = or i1 %3, %4
br i1 %or.cond, label %31, label %5
; <label>:5 ; preds = %0
%6 = srem i32 %v, 512
%7 = sdiv i32 %6, 64
%8 = sdiv i32 %6, 256
%9 = shl i32 %8, 3
%10 = sdiv i32 %v, 512
%11 = sdiv i32 %10, %mb
%12 = shl i32 %11, 4
%13 = add i32 %9, %y
%14 = add i32 %13, %12
%15 = srem i32 %7, 4
%16 = add nsw i32 %2, -1
%17 = mul i32 %16, %15
%18 = srem i32 %10, %mb
%19 = mul i32 %2, %18
%tmp = add i32 %19, %17
%tmp2 = shl i32 %tmp, 3
%20 = add nsw i32 %tmp2, %x
%21 = shl i32 %2, 1
%22 = xor i32 %21, 6
%23 = mul i32 %22, %20
%24 = shl i32 %mb, 5
%25 = mul i32 %24, %14
%26 = shl i32 %z, 1
%27 = add nsw i32 %26, -3
%28 = mul nsw i32 %1, %27
%29 = add i32 %25, %28
%30 = add i32 %29, %23
br label %31
; <label>:31 ; preds = %5, %0
%.0 = phi i32 [ %30, %5 ], [ 0, %0 ]
ret i32 %.0
}
我不知道它是否是最佳的,但它确实相对可读。
如果您可以在输入上指出所有约束(必要时全部五个),那将是很好的,因为优化器可能能够使用它们。