Question

我一直在尝试优化一个由一系列 if 语句构成的函数。我找到了用 switch 语句作为替代的方案，但在测试和反汇编之后，我发现它只是把问题复杂化了。下面用一些代码来演示：

 /* Selects a positive argument, preferring C, then B, then A.
    Returns 0 when none of the three is positive. */
 int ifFunc(int A, int B, int C){
   int fallback;              /* what to return when C is not positive */

   if (B > 0){
     fallback = B;
   }
   else{
     fallback = (A > 0) ? A : 0;
   }

   return (C > 0) ? C : fallback;
 }

 /* Switch-based variant: packs the three sign tests into a 3-bit code
    (bit 2: A > 0, bit 1: B > 0, bit 0: C > 0) and dispatches on it once. */
 int swFunc(int A, int B, int C){
   int ret;
   const int code = ((A > 0) << 2) | ((B > 0) << 1) | (C > 0);

   switch(code){
   case 1: /* 001 */
   case 3: /* 011 */
   case 5: /* 101 */
   case 7: /* 111 */
     ret = C;   /* C is positive, so it always wins */
     break;
   case 2: /* 010 */
   case 6: /* 110 */
     ret = B;
     break;
   case 4: /* 100 */
     ret = A;
     break;
   default: /* 000: nothing is positive */
     ret = 0;
     break;
   }
   return ret;
 }
 // All these functions do is select a number that is positive,
 // Giving preference to C, then B, then A

我可能犯了一些错误，所以这两个函数的行为未必完全相同，但这不是重点。我想用 switch 语句写出一个只含一次跳转的 ifFunc 版本：把每个 if 条件的结果转换成数字代码中对应的一位，这样每个可能的终点都有一个唯一的数字代码。

然而，由于比较运算（B > 0）等在内部也会用到跳转，这个办法落空了。最终，函数的 switch 版本比 if 版本慢了一个数量级。

我想知道是否有办法执行比较语句，让它输出 0 表示 false、输出 1 表示 true，而不使用（内部或其他形式的）if 语句或跳转。

Answer 1

我希望这段代码可以帮助您去掉汇编代码中的跳转……

/* Maps the 3-bit code built in ifFunc (bit 2: c > 0, bit 1: b > 0,
   bit 0: a > 0) to a slot of ret[]: 0 -> 0, 1 -> a, 2 -> b, 3 -> c.
   Any code with bit 2 set selects c; otherwise bit 1 selects b. */
static const int index[8] = {0, 1, 2, 2, 3, 3, 3, 3};

/* Returns 1 when x > 0 and 0 otherwise, without a comparison.
   Works on the unsigned image of x so that nothing overflows:
   negating INT_MIN as a signed int is undefined, and right-shifting a
   negative signed value is implementation-defined.
   The top bit of ~u is set when x >= 0; the top bit of (0 - u) is set when
   x is non-zero and not a non-INT_MIN negative; both hold only for x > 0. */
static unsigned is_positive(int x)
{
    /* Index of the top bit; assumes 8-bit bytes, as the original code did. */
    const unsigned bits = sizeof(unsigned) * 8 - 1;
    const unsigned u = (unsigned)x;

    return (~u & (0u - u)) >> bits;
}

/* Selects a positive argument, preferring c, then b, then a.
   Returns 0 when none of the three is positive. */
int ifFunc(int a, int b, int c)
{
    const int ret[4] = {0, a, b, c};
    const unsigned i = is_positive(c) * 4u +
                       is_positive(b) * 2u +
                       is_positive(a);

    return ret[index[i]];
}

Answer 2

可能会这样做......？

/* Returns the first positive value among C, B, A (in that order), or 0. */
int ifFunc(int A, int B, int C){
  return ( C > 0 ) ? C
       : ( B > 0 ) ? B
       : ( A > 0 ) ? A
       : 0;
}

修改

对不起，我一定是误解了您的意思——我以为您只需要减少条件分支的数量，而不是完全去掉它们。下面是一个解决方案（除非我哪里弄错了……），它基于您的系统采用二进制补码（two's complement）表示的假设：

static const unsigned iMIN = ~(~0u >> 1);   // binary 1000...0 (only the top bit set)
static const int BITS_PER_BYTE = 8;

/* Returns a value whose sign bit is clear for positive x and set for
   negative or zero x. Only the sign bit of the result is meaningful.
   Relies on the out-of-range unsigned-to-int conversion wrapping
   (two's complement), as stated in the accompanying text. */
static inline int partialmask( int x )
{
  // returns positive (0....) for positive x,
  // negative (1....) for negative or zero x
  return x | (int)(iMIN - (unsigned)x);
}

/* Spreads the sign bit of partialmask(x) over the whole word.
   Relies on >> of a negative int being an arithmetic shift, which is
   implementation-defined in C. */
static inline int fullmask( int x )
{
  // extends the sign bit so that
  // positive becomes binary 0000...0 for positive x
  // negative becomes binary 1111...1 for negative or zero x
  return partialmask( x ) >> (BITS_PER_BYTE * sizeof(int) - 1);
}

/* Selects a positive argument, preferring C, then B, then A; returns 0
   when none is positive. Written with masks only, no conditionals. */
int noIfFunc(int A, int B, int C){
  int res = 0, mask;

  mask = fullmask( A );   // negative or zero A causes an all-ones mask
  res &= mask;            // to preserve the res value
  res |= A & ~mask;       // and keep it from overwriting with A

  mask = fullmask( B );   // positive B causes
  res &= mask;            // res to be cleared with all-zeros mask
  res |= B & ~mask;       // then overwritten with B

  mask = fullmask( C );
  res &= mask;
  res |= C & ~mask;

  return res;             // finally res == most recent positive value (else zero)
}

这可能不是最短的代码。但是，只要经过适当的优化（函数被内联），它应该不包含分支。

Answer 3

不确定它会好得多，但您可以尝试使用位域：

/* Overlays eight one-bit condition flags with a single byte so that the
   flags can be set individually and then read together via `aggregate`.
   The flags are unsigned: a signed 1-bit field cannot represent the value 1.
   NOTE(review): which bit of `aggregate` each flag lands in depends on the
   compiler's bit-field layout — confirm before relying on specific values. */
union SOMECONDTIONS {
  unsigned char aggregate;
  struct {
    unsigned int c1:1;
    unsigned int c2:1;
    unsigned int c3:1;
    unsigned int c4:1;
    unsigned int c5:1;
    unsigned int c6:1;
    unsigned int c7:1;
    unsigned int c8:1;
  } conditions;
};

SOMECODITIONS c;
c.aggregate = 0;
c.conditions.c1 = A > 0;
c.conditions.c2 = B > 0;
c.conditions.c3 = C > 0;
switch(c.aggregate) {
...

Answer 4

怎么样？

/* Highest priority first: C, then B, then A; 0 when none is positive. */
if (C > 0) {
    return C;
}
if (B > 0) {
    return B;
}
if (A > 0) {
    return A;
}
return 0;

或

/* Same priority chain (C, then B, then A, else 0) as a single expression. */
return C > 0 ? C : (B > 0 ? B : (A > 0 ? A : 0));

此外，如果编译器实现了条件赋值，

/* Start from the fallback value 0, then let each positive argument
   overwrite it. The last assignment wins, so C takes priority over B,
   and B over A. */
R= 0;
if (A > 0) R= A;
if (B > 0) R= B;
if (C > 0) R= C;

在最后一种写法中，MSVC 确实使用了条件赋值（cmovg），但第三个 if 除外——那里它仍然生成了条件跳转，因为它更喜欢直接返回那个先前清零的寄存器：（

; Compiler output for the conditional-assignment version; the numbered
; "; N :" lines are the C source lines the instructions were generated from.
; eax carries R throughout.
; 7    :     int R= 0;
; 8    :     if (A > 0) R= A;

    mov ecx, DWORD PTR _A$[esp-4]
; xor of a register with itself clears it: R = 0
    xor eax, eax
    test    ecx, ecx
; conditional move: eax = ecx only when the test found ecx > 0 (no branch)
    cmovg   eax, ecx

; 9    :     if (B > 0) R= B;

    mov ecx, DWORD PTR _B$[esp-4]
    test    ecx, ecx
    cmovg   eax, ecx

; 10   :     if (C > 0) R= C;

    mov ecx, DWORD PTR _C$[esp-4]
    test    ecx, ecx
; here a conditional jump is emitted instead of cmovg:
; when C <= 0 the mov below is skipped and eax is returned unchanged
    jle SHORT $LN1@f1

; 12   :    return R;

    mov eax, ecx
$LN1@f1:

    ret 0

Answer 5

这是一个使用内联汇编、几乎能胜过编译器版本的解决方案。如果有不妥之处，还请见谅——我昨晚才刚学汇编。

 /* Inline-assembly variant of swFunc: builds the 3-bit code
    (4 * (A > 0) + 2 * (B > 0) + (C > 0)) with cmp/setg instead of C
    comparisons, then dispatches on it with the usual switch (elided here).
    NOTE(review): `code` is never initialized in C, yet the `+` modifier
    declares the operand read-write; the first instruction overwrites it,
    so a write-only output was presumably intended — confirm.
    NOTE(review): %edx is listed as a clobber while `d` is also offered as a
    location for [out]; "cc" is not listed although cmp/add change the
    flags — confirm against the GCC extended-asm rules. */
 int swFunc(int A, int B, int C){
   int ret; char code;
   // C equivalent of the asm below:
   //code += (A > 0) * 4, code += (B > 0) * 2, code += (C > 0) * 1
   asm("movb $0, %[out];"      /* code = 0 */
     "cmpl $0, %[C];"
     "setg %%al;"              /* al = (C > 0) */
     "addb %%al, %[out];"      /* code += al */
     "cmpl $0, %[B];"
     "setg %%al;"              /* al = (B > 0) */
     "shlb $1, %%al;"          /* al *= 2 */
     "addb %%al, %[out];"
     "cmpl $0, %[A];"
     "setg %%al;"              /* al = (A > 0) */
     "shlb $2, %%al;"          /* al *= 4 */
     "addb %%al, %[out];"
   : [out] "+dl" (code)
   : [A] "m" (A), [B] "m" (B), [C] "m" (C)
   : "%eax", "%edx");

  switch(code){
   ...
    //same old switch statement
   ...
  }
   return ret;
 }

有趣（也非常烦人）的是，当我把 asm 改成下面这样时：

   /* Shorter variant: the first setg writes (C > 0) straight into [out],
      so the explicit "movb $0" and the first add are gone.
      NOTE(review): [out] is still declared read-write (`+`), which
      presumably is why the compiler loads `code` before the asm in the
      listing that follows — confirm. */
   asm ("cmpl $0, %[C];"
     "setg %[out];"            /* code = (C > 0) */
     "cmpl $0, %[B];"
     "setg %%al;"              /* al = (B > 0) */
     "shlb $1, %%al;"          /* al *= 2 */
     "addb %%al, %[out];"
     "cmpl $0, %[A];"
     "setg %%al;"              /* al = (A > 0) */
     "shlb $2, %%al;"          /* al *= 4 */
     "addb %%al, %[out];"
   : [out] "+dl" (code)
   : [A] "m" (A), [B] "m" (B), [C] "m" (C)
   : "%eax", "%edx");

它本应以更简单、更优雅的方式完成同样的事情，但性能却没有变化，因为编译器决定像下面这样引入 %cl 寄存器：

 movzbl -0x5(%ebp),%eax // load 'code' from the stack into %eax (not needed!)
 mov    %eax,%ecx   // copy it into %ecx, whose low byte %cl stands in for 'code'
 cmpl   $0x0,0x10(%ebp) // compare 'C' and 0
 setg   %cl         // set the byte for 'code' immediately, skipping the zeroing of 'code'
 cmpl   $0x0,0xc(%ebp)  // compare 'B' and 0
 setg   %al         // %al = 1 if B > 0, else 0
 shl    %al         // shift left by one: %al is now 2 if it was 1, else still 0
 add    %al,%cl     // add it to 'code'
 cmpl   $0x0,0x8(%ebp)  // compare 'A' and 0
 setg   %al         // %al = 1 if A > 0, else 0
 shl    $0x2,%al    // shift left by two: %al is now 4 if it was 1, else still 0
 add    %al,%cl     // add it to 'code'
 mov    %cl,-0x5(%ebp)  // move the final value back onto the stack

应该注意的是，如果我要求它使用 %cl，编译器就会改用 %dl，反之亦然。这台顽固的机器似乎不想让自己的纪录被打破。

直接执行条件似乎是一个很好的方法，因为这种方法非常接近于击败编译器

if 条件树与 switch 语句的效率问题

5 个答案: