Question

我一直在尝试以下内容，并注意到此处定义的无分支“if”（现在使用&-!!替换*!!）可以加快某些瓶颈代码的速度（几乎是64位英特尔目标上的2x with clang：

// Produces x if f is true, else 0 if f is false.
#define  BRANCHLESS_IF(f,x)          ((x) & -((typeof(x))!!(f)))

// Produces x if f is true, else y if f is false.
#define  BRANCHLESS_IF_ELSE(f,x,y)  (((x) & -((typeof(x))!!(f))) | \
                                     ((y) & -((typeof(y)) !(f))))

请注意f应该是一个相当简单的表达式，没有副作用，因此编译器可以进行最佳的优化。

性能高度依赖于CPU和编译器。 clang的无分支'if'表现非常出色;我还没有找到任何无分支的“if / else”更快的情况。

我的问题是：这些是安全的，可移植的吗（意味着可以保证在所有目标上都能得到正确的结果），并且可以更快地制作出来吗？

无分支if / else

的示例用法

这些计算64位最小值和最大值。

inline uint64_t uint64_min(uint64_t a, uint64_t b)
{
  return BRANCHLESS_IF_ELSE((a <= b), a, b);
}

inline uint64_t uint64_max(uint64_t a, uint64_t b)
{
  return BRANCHLESS_IF_ELSE((a >= b), a, b);
}

无分支的示例用法

这是64位模块化添加 - 它计算(a + b) % n。分支版本（未显示）受到分支预测失败的严重影响，但无分支版本非常快（至少与clang一样）。

inline uint64_t uint64_add_mod(uint64_t a, uint64_t b, uint64_t n)
{
  assert(n > 1); assert(a < n); assert(b < n);

  uint64_t c = a + b - BRANCHLESS_IF((a >= n - b), n);

  assert(c < n);
  return c;
}

更新：无分支的完整具体工作示例

下面是一个完整的C11程序，它演示了分支和简单if条件的无分支版本之间的速度差异，如果您想在系统上尝试它。该程序计算模幂运算，即(a ** b) % n，用于非常大的值。

要编译，请在命令行中使用以下命令：

-O3（或您喜欢的任何高优化级别）
-DNDEBUG（禁用断言，速度）
-DBRANCHLESS=0或-DBRANCHLESS=1分别指定分支或无分支行为

在我的系统上，发生了什么：

$ cc -DBRANCHLESS=0 -DNDEBUG -O3 -o powmod powmod.c && ./powmod
BRANCHLESS = 0
CPU time:  21.83 seconds
foo = 10585369126512366091

$ cc -DBRANCHLESS=1 -DNDEBUG -O3 -o powmod powmod.c && ./powmod
BRANCHLESS = 1
CPU time:  11.76 seconds
foo = 10585369126512366091

$ cc --version
Apple LLVM version 6.0 (clang-600.0.57) (based on LLVM 3.5svn)
Target: x86_64-apple-darwin14.1.0
Thread model: posix

因此，无分支版本几乎是我系统上分支版本的两倍（3.4 GHz。英特尔酷睿i7）。

// SPEED TEST OF MODULAR MULTIPLICATION WITH BRANCHLESS CONDITIONALS

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <time.h>
#include <assert.h>

typedef  uint64_t  uint64;

//------------------------------------------------------------------------------
#if BRANCHLESS
  // Actually branchless.
  #define  BRANCHLESS_IF(f,x)          ((x) & -((typeof(x))!!(f)))
  #define  BRANCHLESS_IF_ELSE(f,x,y)  (((x) & -((typeof(x))!!(f))) | \
                                       ((y) & -((typeof(y)) !(f))))
#else
  // Not actually branchless, but used for comparison.
  #define  BRANCHLESS_IF(f,x)          ((f)? (x) : 0)
  #define  BRANCHLESS_IF_ELSE(f,x,y)   ((f)? (x) : (y))
#endif

//------------------------------------------------------------------------------
// 64-bit modular multiplication.  Computes (a * b) % n without division.

static uint64 uint64_mul_mod(uint64 a, uint64 b, const uint64 n)
{
  assert(n > 1); assert(a < n); assert(b < n);

  if (a < b) { uint64 t = a; a = b; b = t; }  // Ensure that b <= a.

  uint64 c = 0;
  for (; b != 0; b /= 2)
  {
    // This computes c = (c + a) % n if (b & 1).
    c += BRANCHLESS_IF((b & 1), a - BRANCHLESS_IF((c >= n - a), n));
    assert(c < n);

    // This computes a = (a + a) % n.
    a += a - BRANCHLESS_IF((a >= n - a), n);
    assert(a < n);
  }

  assert(c < n);
  return c;
}

//------------------------------------------------------------------------------
// 64-bit modular exponentiation.  Computes (a ** b) % n using modular
// multiplication.

static
uint64 uint64_pow_mod(uint64 a, uint64 b, const uint64 n)
{
  assert(n > 1); assert(a < n);

  uint64 c = 1;

  for (; b > 0; b /= 2)
  {
    if (b & 1)
      c = uint64_mul_mod(c, a, n);

    a = uint64_mul_mod(a, a, n);
  }

  assert(c < n);
  return c;
}

//------------------------------------------------------------------------------
int main(const int argc, const char *const argv[const])
{
  printf("BRANCHLESS = %d\n", BRANCHLESS);

  clock_t clock_start = clock();

  #define SHOW_RESULTS 0

  uint64 foo = 0;  // Used in forcing compiler not to throw away results.

  uint64 n = 3, a = 1, b = 1;
  const uint64 iterations = 1000000;
  for (uint64 iteration = 0; iteration < iterations; iteration++)
  {
    uint64 c = uint64_pow_mod(a%n, b, n);

    if (SHOW_RESULTS)
    {
      printf("(%"PRIu64" ** %"PRIu64") %% %"PRIu64" = %"PRIu64"\n",
             a%n, b, n, c);
    }
    else
    {
      foo ^= c;
    }

    n = n * 3 + 1;
    a = a * 5 + 3;
    b = b * 7 + 5;
  }

  clock_t clock_end = clock();
  double elapsed = (double)(clock_end - clock_start) / CLOCKS_PER_SEC;
  printf("CPU time:  %.2f seconds\n", elapsed);

  printf("foo = %"PRIu64"\n", foo);

  return 0;
}

第二次更新：英特尔与ARM性能

测试32位ARM目标（iPhone 3GS / 4S，iPad 1/2/3/4，由Xcode 6.1和clang编译）显示无分支“if”在这里实际上是2-3次?:更慢。因此，如果需要最大速度，这些无分支宏似乎不是一个好主意，尽管它们在需要恒定速度的极少数情况下可能很有用。
在64位ARM目标（iPhone 6 +，iPad 5）上，无分支“if”的运行速度与三元?:相同 - 再次由Xcode 6.1与clang编译。
对于Intel和ARM（由clang编译），无分支“if / else”的速度大约是三元?:的两倍，用于计算最小值/最大值。

Answer 1

确保这是可移植的，!运算符可以保证提供0或1。然后将其提升为另一个操作数所需的任何类型。

正如其他人所观察到的那样，你的if-else版本有两次评估的缺点，但你已经知道了，如果没有副作用你就可以了。

令我惊讶的是，你说这更快。我原以为现代编译器本身会进行这种优化。

编辑：所以我用两个编译器（gcc和clang）和配置的两个值测试了这个。

事实上，如果您不忘记设置-DNDEBUG=1，那么0 ?:版本的imul对于gcc来说要好得多，并且会做我希望做的事情。它基本上使用条件移动使循环无分支。在那种情况下，clang找不到这种优化并进行一些条件跳转。

对于具有算术运算的版本，gcc的性能会恶化。事实上，看到他做了什么，这并不奇怪。它确实使用char *myhtml = NULL; // the accumulated lines char *line = NULL; // the current line int len; // set by the readline() function // read and discard first two lines getline(&line, &len, fp); free(line); line = NULL; getline(&line, &len, fp); free(line); line = NULL; while ((getline(&line, &len, fp)) != -1) { char *temp = NULL; // so do not lose pointer to already allocated memory if( NULL ==(temp=realloc(myhtml,strlen(myhtml)+len+1) ) ) ) { // realloc failed perror( "realloc failed"); free( myhtml ); // cleanup free( line ); // cleanup exit( EXIT_FAILURE); } // implied else, realloc successful myhtml = temp; strcat(myhtml,line); free(line); // getline() performs a hidden malloc(), so have to free()) line = NULL; // reset for next iteration }指令，而且速度很慢。 clang在这里下车得更好。 “算术”实际上已经优化了乘法，并用条件移动取而代之。

总而言之，是的，这是可移植的，但如果这带来性能提升或恶化将取决于您的编译器，其版本，您正在应用的编译标志，您的处理器的潜力......

整数上的无分支条件 - 速度快，但是它们可以更快吗？

更新：无分支的完整具体工作示例

第二次更新：英特尔与ARM性能

1 个答案: