Question

我有一个uint64数组，对于所有未设置的位（0），我做了一些评估。

评估本身的开销并不大，但未设置的位很少。性能分析显示，大量时间都花在了“查找下一个未设置位”的逻辑上。

是否有更快的方式（在Core2duo上）？

我目前的代码可以跳过很多高1：

// Baseline scan: visit every clear bit of every row of board[], one bit position at a time.
for(int y=0; y<height; y++) {
  // Invert the row so that clear bits in board[y] become 1s in xbits.
  uint64_t xbits = ~board[y];
  // Index of the bit currently sitting at position 0 of xbits.
  int x = 0;
  // The loop ends as soon as no 1s remain, so trailing high bits that are
  // all set in board[y] are never examined individually.
  while(xbits) {
    if(xbits & 1) {
      ... with x and y
    }
    x++;
    xbits >>= 1;
  }
}

（以及关于如何/如果使用SIMD / CUDA的任何讨论，这将是一个有趣的切线！）

Answer 1

《Hacker's Delight》建议使用循环展开的二分查找。代码不漂亮，但在未设置位稀疏时很快，因为它可以整块跳过所有位都已设置的 dword / 字节 / 半字节等。

如果你用的是支持 SSE4a 的 Phenom（可惜 Core2 Duo 不支持），可以用 POPCNT 写一个快速的置位计数函数。这样就能得到下一个未设置位的索引：

pop(x & (~x-1))

x & (~x-1) 会清除最低零位以上的所有置位；然后只需用 POPCNT 统计剩下的置位个数即可。

这是一个带有字节的工作示例：

    01101111 x
    10010000 ~x
    10001111 ~x-1
    00001111 x & ~x-1
pop(00001111) => 4

Answer 2

您是否考虑过使用查找表，这样可以一次处理一个字节？基本上，只需一次下标操作，就能取出该字节中未设置位的“x”值列表（再加上 8 * 该字节在 uint64 中的序号，即可得到真正的“x”）。

用一个字节来存储一个 1 到 8 的位号（我们可以把它压缩得更紧凑一些，但那样就多少失去了“取出即可用”的好处），并假设每个字节最多有 4 个 0 位（0 位更多的字节值可以用转义码编码，转而触发常规的位运算逻辑；由于这种情况出现的概率很低，这是可以接受的），我们需要一张 256 * 4 字节 = 1k 的表。

Answer 3

如果您愿意使用汇编，那么 BSF（Bit Scan Forward，正向位扫描）就是应该使用的指令。它查找的是 1 位，所以你必须先把位掩码取反。IIRC，如果结果为 0，XOR 会设置零标志，因此可以在执行 BSF 之前先测试该标志。在 x86 上，BSF 作用于 32 位寄存器，因此你必须把值拆成两半。（不过我认为你本来就应该先用 32 位整数。）

Answer 4

我可以想到一些优化点，比如循环展开，你可以尝试类似下面的写法：

// Unrolled scan: test eight bits of the inverted row per loop iteration.
// x is the index of bit 0 of the byte currently in the low end of xbits, so
// the bit tested with (1 << k) corresponds to position x+k.
for(int y=0; y < height; y++) {

    uint64_t xbits = ~board[y];   // clear bits of board[y] become 1s
    int x = 0;

    while(xbits) {
        if(xbits & (1 << 0)) {
          ... with x and y
        }
        if(xbits & (1 << 1)) {
          ... with x+1 and y
        }
        if(xbits & (1 << 2)) {
          ... with x+2 and y
        }
        if(xbits & (1 << 3)) {
          ... with x+3 and y
        }
        if(xbits & (1 << 4)) {
          ... with x+4 and y
        }
        if(xbits & (1 << 5)) {
          ... with x+5 and y
        }
        if(xbits & (1 << 6)) {
          ... with x+6 and y
        }
        if(xbits & (1 << 7)) {
          ... with x+7 and y
        }
        // Move on to the next byte.
        x+=8;
        xbits >>= 8;
    }
}

这样每处理 8 个位，就能省去 7 次循环条件检查、7 次加法和 7 次移位……

我能想到的另一种方法是：如果连续的若干位都已设置（都是 1），就把它们整体跳过，例如：

// Sketch: examine xbits four bits at a time and only do per-bit work for
// nibbles that contain at least one 1.
while (xbits) {

    // Non-zero low nibble: at least one of the low four bits of xbits is 1.
    if (xbits & 0xF) {

          // Process for the four bits !!!
    }

    // Drop the nibble just examined and bring the next one into the low bits.
    xbits >>= 4;
}

警告：如果位分散太多，则上述方法可能会使事情变慢： - （

Answer 5

一种方法：把字拆成半字节，然后用 switch 从半字节中选出各个位。使用模板，使被选中的位在编译期即已知，并帮助展开代码。

// Primary template, used for every bit flag other than 1: apply() does
// nothing. The <1, x> specialisation is the one that performs an evaluation.
template < int bit_flag, int column >
struct process_bit {
    static void apply ( int /* row */ ) {}
};

// Specialisation chosen when the bit flag is 1: forwards the compile-time
// column together with the run-time row to evaluate().
template < int column >
struct process_bit < 1, column > {
    static void apply ( int row ) {
        evaluate ( column, row );
    }
};

// Expands one compile-time 4-bit pattern into its per-bit handlers.
// Bit k of `pattern` selects process_bit<1, first_bit + k> (which evaluates)
// or the do-nothing primary template, in order from bit 0 to bit 3.
template < int pattern, int first_bit >
inline void process_nibble_bits ( int y ) {
    process_bit < ( pattern >> 0 ) & 1, first_bit + 0 > ::apply( y );
    process_bit < ( pattern >> 1 ) & 1, first_bit + 1 > ::apply( y );
    process_bit < ( pattern >> 2 ) & 1, first_bit + 2 > ::apply( y );
    process_bit < ( pattern >> 3 ) & 1, first_bit + 3 > ::apply( y );
}


// Handles the four bits of xbits that start at bit position n.
// The nibble value is turned into a compile-time constant through the switch
// so that process_nibble_bits can be instantiated with the exact bit pattern.
//   xbits - word whose 1 bits are to be processed
//   y     - row index forwarded unchanged to the per-bit handlers
template < int n >
inline void process_nibble ( uint64_t xbits, int y ) {
    const uint64_t nibble = ( xbits >> n ) & 0xf;

    // Nothing to do when none of the four bits is set. Because of this early
    // exit a `case 0` label could never be reached, so the switch has none.
    if ( !nibble )
        return;

    switch ( nibble ) {
        case 1:
            process_nibble_bits < 1, n > ( y );
            break;
        case 2:
            process_nibble_bits < 2, n > ( y );
            break;
        case 3:
            process_nibble_bits < 3, n > ( y );
            break;
        case 4:
            process_nibble_bits < 4, n > ( y );
            break;
        case 5:
            process_nibble_bits < 5, n > ( y );
            break;
        case 6:
            process_nibble_bits < 6, n > ( y );
            break;
        case 7:
            process_nibble_bits < 7, n > ( y );
            break;
        case 8:
            process_nibble_bits < 8, n > ( y );
            break;
        case 9:
            process_nibble_bits < 9, n > ( y );
            break;
        case 10:
            process_nibble_bits < 10, n > ( y );
            break;
        case 11:
            process_nibble_bits < 11, n > ( y );
            break;
        case 12:
            process_nibble_bits < 12, n > ( y );
            break;
        case 13:
            process_nibble_bits < 13, n > ( y );
            break;
        case 14:
            process_nibble_bits < 14, n > ( y );
            break;
        case 15:
            process_nibble_bits < 15, n > ( y );
            break;
    }
}

// Unguarded binary split of a bit range.
// bit_tree<i, n> stands for a scan of the bits in [ n, n + 2i ): it recurses
// into the lower half [ n, n + i ) and then the upper half [ n + i, n + 2i ),
// each handled by a bit_tree with half the step.
//   xbits - word whose 1 bits are to be processed
//   y     - row index passed through unchanged
template < int i, int n >
struct bit_tree {
    inline static void apply ( uint64_t xbits, int y ) {
        // The shift must be parenthesised: since C++11 an unparenthesised
        // `>>` inside a template argument list closes the list instead of
        // being parsed as the shift operator.
        bit_tree < ( i >> 1 ), n > ::apply(xbits, y);
        bit_tree < ( i >> 1 ), n + i > ::apply(xbits, y);
    }
};


// Guarded binary split of a bit range.
// Covers the bits in [ n, n + 2i ) like bit_tree<i, n>, but descends into a
// half only when xbits has at least one 1 bit inside that half.
//   xbits - word whose 1 bits are to be processed
//   y     - row index passed through unchanged
template < int i, int n >
struct bit_tree_with_guard {
    inline static void apply ( uint64_t xbits, int y ) {
        // ( 1 << i ) - 1 is a run of i ones; shifted by n it masks [ n, n + i ).
        // The `i >> 1` template arguments are parenthesised because since
        // C++11 a bare `>>` would close the template argument list.
        if ( xbits & ( ( ( ( ( uint64_t ) 1LL ) << i ) - 1 ) << n ) )
            bit_tree < ( i >> 1 ), n > ::apply(xbits, y);

        // Same mask moved up by i: covers [ n + i, n + 2i ).
        if ( xbits & ( ( ( ( ( uint64_t ) 1LL ) << i ) - 1 ) << ( n + i) ) )
            bit_tree < ( i >> 1 ), n + i > ::apply(xbits, y);
    }
};

// The 8 and 16 steps of the tree are routed through the guarded variant.
// (The original author notes that expressing this via inheritance instead
// of a forwarding call turned out slower, for reasons unknown.)
template < int offset >
struct bit_tree < 8, offset > {
    static void apply ( uint64_t bits, int row ) {
        bit_tree_with_guard < 8, offset > ::apply ( bits, row );
    }
};
// Step-16 specialisation: delegates to the guarded split so that halves of
// the range without any 1 bit are not descended into.
template < int offset >
struct bit_tree < 16, offset > {
    static void apply ( uint64_t bits, int row ) {
        bit_tree_with_guard < 16, offset > ::apply ( bits, row );
    }
};


// Leaf of the tree: at step 2 the range is a single nibble starting at bit
// `offset`, which is handed to process_nibble instead of being split again.
template < int offset >
struct bit_tree < 2, offset > {
    static void apply ( uint64_t bits, int row ) {
        process_nibble < offset > ( bits, row );
    }
};


// Benchmark entry point for the template/nibble approach: for each of the
// first `height` rows, inverts the row and feeds it to the 64-bit tree.
void template_nibbles(int height) {
    int y = 0;
    while (y < height) {
        const uint64_t inverted = ~board[y];
        bit_tree< 32, 0>::apply ( inverted, y );
        ++y;
    }
}

它跑起来不如 ffs 版本快，但比其他可移植版本都好，而且校验结果与它们一致：

$ bin\bit_twiddle_micro_opt.exe                                               
testing will_while()... 3375000 usecs (check 1539404233,1539597930)           
testing will_ffs()... 2890625 usecs (check 675191567,1001386403)              
testing alphaneo_unrolled_8()... 3296875 usecs (check 1539404233,1539597930)  
testing template_nibbles()... 3046875 usecs (check 1539404233,1539597930)

一路用树划分到底似乎没有任何好处；而不对半字节使用 switch 则会更慢。有人知道不必手写 16 个 case 的办法吗？

Answer 6

其他答案都很好。这是我的贡献：

你可以把这个字按位取反，然后用一个循环反复找出最低有效的 1 位：

int x = something;

// Isolate the lowest set bit of x: (x-1) & x is x with that bit cleared,
// and XOR-ing the result with x leaves only the cleared bit.
int lsb = x ^ ((x-1) & x);

i.e. if   x = 100100
a = (x - 1) = 100011 // these two steps turn off the lsb
b = (a & x) = 100000
c = (x ^ b) = 000100 // this step detects the lsb
lsb = c

然后，要判断是否已经处理完，执行 x ^= lsb 并测试结果是否为零。

如果你想将那个lsb（这是一个实际的位）变成一个位数，那么查找表或展开的二进制搜索就可以满足你的需要。

这就是你想要的吗？

Answer 7

我建议使用某种查找表（按字节或按 short 建表，取决于可用的资源），它会告诉你某个值中哪些位是清零的。

Answer 8

这是一个快速的微观基准;如果您能获得系统的统计数据，请运行它，并请添加您自己的算法！

命令行：

g++ -o bit_twiddle_micro_opt bit_twiddle_micro_opt.cpp -O9 -fomit-frame-pointer -DNDEBUG -march=native

代码：

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <stdint.h>

// Returns the current time reported by gettimeofday() as a microsecond count.
// Intended for computing elapsed time as the difference of two calls.
static unsigned long get_usecs() {
    struct timeval tv;
    gettimeofday(&tv,NULL);
    // Compute in unsigned long so the multiplication wraps instead of being
    // a signed overflow on targets where long is 32 bits wide.
    return static_cast<unsigned long>(tv.tv_sec)*1000000UL
         + static_cast<unsigned long>(tv.tv_usec);
}

// Shared benchmark state.
enum { MAX_HEIGHT = 64 };      // number of rows available in board[]
uint64_t board[MAX_HEIGHT];    // one 64-bit word per row
int xsum;                      // running sum of every x passed to evaluate()
int ysum;                      // running sum of every y passed to evaluate()

// Stand-in for the real per-cell work: folds the coordinates of a visited
// cell into the two running sums, which the harness prints as a checksum.
void evaluate(int x,int y) {
    ysum = ysum + y;
    xsum = xsum + x;
}

// Byte-at-a-time scan with the eight bit tests written out by hand.
// For each of the first `height` rows the row is inverted, then consumed
// eight bits per iteration; every 1 bit found is reported to evaluate()
// with its bit index and the row number.
void alphaneo_unrolled_8(int height) {
    for (int row = 0; row < height; ++row) {
        uint64_t pending = ~board[row];
        // `base` is the bit index of the lowest bit currently in `pending`.
        for (int base = 0; pending != 0; base += 8, pending >>= 8) {
            if (pending & (1 << 0)) { evaluate(base, row); }
            if (pending & (1 << 1)) { evaluate(base + 1, row); }
            if (pending & (1 << 2)) { evaluate(base + 2, row); }
            if (pending & (1 << 3)) { evaluate(base + 3, row); }
            if (pending & (1 << 4)) { evaluate(base + 4, row); }
            if (pending & (1 << 5)) { evaluate(base + 5, row); }
            if (pending & (1 << 6)) { evaluate(base + 6, row); }
            if (pending & (1 << 7)) { evaluate(base + 7, row); }
        }
    }
}

// Reference implementation: shifts the inverted row right one bit at a
// time and reports every position whose low bit is 1 to evaluate().
void will_while(int height) {
    for (int row = 0; row < height; ++row) {
        int column = 0;
        for (uint64_t remaining = ~board[row]; remaining != 0; remaining >>= 1) {
            if (remaining & 1) {
                evaluate(column, row);
            }
            ++column;
        }
    }
}

// Find-first-set scan: jumps directly from one 1 bit of the inverted row to
// the next and reports each to evaluate() with its bit index and the row.
void will_ffs(int height) {
    for(int y=0; y<height; y++) {
        uint64_t xbits = ~board[y];
        while(xbits) {
            // Use the `long long` builtin: __builtin_ffsl takes a `long`,
            // which is only 32 bits wide on some targets and would silently
            // ignore the upper half of the row there.
            // The result is the 1-based index of the lowest set bit.
            const int x = __builtin_ffsll(static_cast<long long>(xbits));
            evaluate(x-1,y);
            // Clear the bit just reported. Shifting right and then left by x
            // would be undefined when x == 64 (bit 63 is the lowest set bit).
            xbits &= xbits - 1;
        }
    }
}

// Fills the first `dim` rows of board[] with a random dim x dim pattern.
// In each row, bits at positions >= dim are forced to 1 and each of the low
// `dim` bits is set with the outcome of one random() call.
//   dim - board dimension; rows and columns 0 .. dim-1 are randomised
void rnd_board(int dim) {
    if(dim <= 0)
        return;   // no rows to fill
    // Mask with every bit at position >= dim set. A shift by 64 is undefined
    // for a 64-bit operand, so a full-width board is handled explicitly:
    // it has no bits outside the board at all.
    const uint64_t outside = (dim >= 64) ? 0 : ~(((uint64_t)1 << dim)-1);
    for(int y=0; y<dim; y++) {
        board[y] = outside;
        for(int x=0; x<dim; x++)
            if(random() & 1)
                board[y] |= (uint64_t)1 << x;
    }
}

// Times one scanning routine.
// Reseeds the generator with 0, resets both checksums, then 100000 times
// builds a random board of random dimension (1 .. MAX_HEIGHT) and runs
// `func` on it. Prints the elapsed microseconds and the two checksums.
//   name - label printed in front of the result
//   func - routine under test, called with the board dimension
void test(const char* name,void(*func)(int)) {
    srandom(0);
    printf("testing %s... ",name);
    xsum = 0;
    ysum = 0;
    const unsigned long started = get_usecs();
    int remaining = 100000;
    while (remaining-- > 0) {
        const int dim = 1 + (random() % MAX_HEIGHT);
        rnd_board(dim);
        func(dim);
    }
    const unsigned long elapsed = get_usecs() - started;
    printf("%lu usecs (check %d,%d)\n",elapsed,xsum,ysum);
}

// Runs the benchmark once for each registered routine, in table order.
int main() {
    struct Candidate {
        const char* label;
        void (*scan)(int);
    };
    static const Candidate candidates[] = {
        { "will_while()", will_while },
        { "will_ffs()", will_ffs },
        { "alphaneo_unrolled_8()", alphaneo_unrolled_8 },
    };
    const int count = sizeof(candidates) / sizeof(candidates[0]);
    for (int i = 0; i < count; ++i) {
        test(candidates[i].label, candidates[i].scan);
    }
    return 0;
}

Answer 9

你的性能分析是否表明时间主要花在内层循环上？还是说大部分时间都花在计算 ~board[y]、然后紧接着递增 y 上？

如果是后者，你最好再加一个二级位图，其中每一位代表 board 位图中的一整个 64 位字是否可以跳过——这样一次就能向前跳得更远，运气好的话还能避免加载位图的整条缓存行。

位图中设置的位数分布是什么？

Answer 10

如果未设置的位非常少，那就根本不要使用位域，而应使用稀疏表示。我的意思是：维护一个整数数组，其中保存每个未设置位的索引。这样，遍历未设置位就只是遍历该数组。设置和清除位会变得更复杂，但如果查找未设置位是开销最大的操作，那么稀疏表示很可能更划算。

Answer 11

如果您认为未设置的位不常见，那么可能是一个简单的

// Fast path: a word equal to all-ones is rejected with a single comparison,
// so the per-bit code below only runs for words that have some bit clear.
// NOTE(review): the comparison is against all-ones, so xbits here presumably
// holds the raw board word rather than the inverted value used in the
// question's loop - confirm before combining the two.
if (xbits != ((uint64_t)-1))
{
   // regular code goes here
}

将是一场胜利。这样，在常见情况下（单词中的所有位都已设置），您将一次跳过64个设置位。

Answer 12

查找表方案的一种变体：建立一张查找表，给出 8 位块中下一个未设置的位。取出 8 位块并与 0xFF 相与，比较结果是否仍为 0xFF。如果是，就跳过；否则查表。

访问位域中的所有空闲位置

12 个答案: