
时间:2010-04-06 23:42:59

标签: c bit-manipulation 32-bit



17 个答案:

答案 0 :(得分:39)

GCC的__builtin_clz转换为x86 / x64上的BSR,ARM上的CLZ等,如果硬件没有实现,则模拟指令。
Visual C ++ 2005及更高版本有_BitScanReverse

答案 1 :(得分:20)

TL;DR:对于32位,请使用 de Bruijn 乘法(de Bruijn multiplication)。


当输入为零时,de Bruijn 算法也会返回正确的结果。而当输入为零时,__builtin_clz 和 _BitScanReverse 指令会返回不正确的结果。

在Windows x86-64上, de Bruijn乘法的运行速度与等效(有缺陷的)Windows函数相当,性能差异仅为3%左右。


// Returns the zero-based index of the most significant set bit of v.
// Branch-free: the value is first smeared into a run of ones, then a
// de Bruijn multiplication maps that run to a table slot.
// An input of 0 yields 0 (the product is 0, which selects table slot 0).
u32 msbDeBruijn32( u32 v )
{
    static const int MultiplyDeBruijnBitPosition[32] =
    {
        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    v |= v >> 1; // first round down to one less than a power of 2
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;

    // The top five bits of the 32-bit product index the table.
    return MultiplyDeBruijnBitPosition[( u32 )( v * 0x07C4ACDDU ) >> 27];
}


这是一个简单的C ++ 11工具来测试所有这些实现。它在Visual Studio上编译干净,但应该适用于所有现代编译器。它允许您在性能模式(bVerifyResults = false)和检查模式(bVerifyResults = true)下运行基准。


Verification failed for msbNative64: input was 0; output was 818af060; expected 0
Verification failed for msbFfs: input was 22df; output was 0; expected d
Verification failed for msbPerformanceJunkie32: input was 0; output was ffffffff; expected 0
Verification failed for msbNative32: input was 0; output was 9ab07060; expected 0

当输入为零时,“性能迷”(performance junkie)实现和 Microsoft 本机实现的行为各不相同。msbPerformanceJunkie32 产生 -1,而 Microsoft 的 _BitScanReverse 产生一个随机数,这与底层硬件指令的行为一致。此外,msbPerformanceJunkie32 实现产生的结果与所有其他答案的结果相差 1(off by one)。


msbLoop64 took 2.56751 seconds               
msbNative64 took 0.222197 seconds            

msbLoop32 took 1.43456 seconds               
msbFfs took 0.525097 seconds                 
msbPerformanceJunkie32 took 1.07939 seconds  
msbDeBruijn32 took 0.224947 seconds          
msbNative32 took 0.218275 seconds            

de Bruijn 版本完胜其他实现,因为它是无分支的,因此对于会产生均匀分布输出的输入,它运行得很好。由于现代 CPU 上分支预测错误的惩罚,所有其他版本在处理任意输入时都较慢。msbFfs 函数产生不正确的结果,因此可以忽略它。



#include <cassert>
#include <chrono>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <vector>

#ifdef _MSC_VER
#include <intrin.h>
#endif // _MSC_VER

// Integer aliases shared by every benchmarked function.
using u32 = unsigned int;
using u64 = unsigned long long;

// Number of inputs generated for, and evaluated by, each benchmarked function.
const int iterations = 100000000;

// Read by test(): selects whether results are compared against the
// reference results.
bool bVerifyResults = false;

// Random source for the benchmark inputs; the engine is seeded once from
// the random device.
std::random_device rd;
std::default_random_engine re(rd());

// Minimal stopwatch built on std::chrono::high_resolution_clock.
// The clock starts at construction and can be restarted with reset().
class Timer
{
public:
    Timer() : beg_(clock_::now()) {}

    // Restarts the measurement from the current instant.
    void reset()
    {
        beg_ = clock_::now();
    }

    // Returns the seconds elapsed since construction or the last reset().
    double elapsed() const
    {
        return std::chrono::duration_cast<second_>
            (clock_::now() - beg_).count();
    }

private:
    typedef std::chrono::high_resolution_clock clock_;
    typedef std::chrono::duration<double, std::ratio<1> > second_;
    std::chrono::time_point<clock_> beg_;
};

// Binary-search style MSB finder: narrows x down to its top non-zero nibble,
// accumulating the shift in r, then finishes with a 16-entry lookup.
// With this table the result is one-based (x == 1 gives 1) and x == 0 gives 0.
unsigned int msbPerformanceJunkie32(u32 x)
{
    static const unsigned int bval[] =
    { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
    unsigned int r = 0;
    if (x & 0xFFFF0000) {
        r += 16 / 1;
        x >>= 16 / 1;
    }
    if (x & 0x0000FF00) {
        r += 16 / 2;
        x >>= 16 / 2;
    }
    if (x & 0x000000F0) {
        r += 16 / 4;
        x >>= 16 / 4;
    }
    // x is now below 16, so it indexes the nibble table directly.
    return r + bval[x];
}

// FFS(t) expands to a complete block that computes n by repeatedly testing
// the LOW 16/8/4/2/1 bits of t and then executes `return n;`, so it may only
// be used inside a function returning an integer.
// Because it tests the low bits first, n is the index of the lowest set bit
// (31 when t == 0), not the highest.
#define FFS(t)  \
{ \
    int n = 0; \
    if (!(0xffff & (t))) \
        n += 16; \
    if (!((0xff << n) & (t))) \
        n += 8; \
    if (!((0xf << n) & (t))) \
        n += 4; \
    if (!((0x3 << n) & (t))) \
        n += 2; \
    if (!((0x1 << n) & (t))) \
        n += 1; \
    return n; \
}

// Benchmark wrapper around FFS(); returns whatever FFS computes for x.
unsigned int msbFfs32(u32 x)
{
    FFS(x)
}

// Reference implementation: returns the zero-based index of the highest set
// bit of x by counting right shifts. Returns 0 for x == 0.
unsigned int msbLoop32(u32 x)
{
    int r = 0;
    if (x < 1) return 0;
    // Each shift that leaves a non-zero value moves the top bit one place down.
    while (x >>= 1) r++;
    return r;
}

// 64-bit reference implementation: returns the zero-based index of the
// highest set bit of x by counting right shifts. Returns 0 for x == 0.
unsigned int msbLoop64(u64 x)
{
    int r = 0;
    if (x < 1) return 0;
    // Each shift that leaves a non-zero value moves the top bit one place down.
    while (x >>= 1) r++;
    return r;
}

// Returns the zero-based index of the most significant set bit of v.
// Branch-free: the value is first smeared into a run of ones, then a
// de Bruijn multiplication maps that run to a table slot.
// An input of 0 yields 0 (the product is 0, which selects table slot 0).
u32 msbDeBruijn32(u32 v)
{
    static const int MultiplyDeBruijnBitPosition[32] =
    {
        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    v |= v >> 1; // first round down to one less than a power of 2
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;

    // The top five bits of the 32-bit product index the table.
    return MultiplyDeBruijnBitPosition[(u32)(v * 0x07C4ACDDU) >> 27];
}

// Returns the index of the highest set bit of val via the compiler's
// bit-scan intrinsic. The result is unspecified when val == 0: the MSVC
// intrinsic leaves `result` unwritten and __builtin_clz(0) is undefined.
u32 msbNative32(u32 val)
{
#if defined(_MSC_VER)
    unsigned long result;
    _BitScanReverse(&result, val);
    return result;
#else
    // GCC/Clang fallback: convert the leading-zero count into a bit index.
    return 31 - __builtin_clz(val);
#endif
}
// 64-bit counterpart of msbNative32: index of the highest set bit of val via
// the compiler's bit-scan intrinsic. The result is unspecified when val == 0.
u32 msbNative64(u64 val)
{
#if defined(_MSC_VER)
    unsigned long result;
    _BitScanReverse64(&result, val);
    return result;
#else
    // GCC/Clang fallback: convert the leading-zero count into a bit index.
    return 63 - __builtin_clzll(val);
#endif
}

template <typename InputType>
void test(unsigned int msbFunc(InputType),
    const std::string &name,
    const std::vector< InputType > &inputs,
    std::vector< unsigned int > &results,
    bool bIsReference = false
    if (bIsReference)
        int i = 0;
        for (int i = 0; i < iterations; i++)
            results[i] = msbFunc(inputs[i]);
    InputType result;
    if (bVerifyResults)
        bool bNotified = false;
        for (int i = 0; i < iterations; i++)
            result = msbFunc(inputs[i]);
            if ((result != results[i]) && !bNotified)
                std::cout << "Verification failed for " << name << ": "
                    << "input was " << std::hex << inputs[i]
                    << "; output was " << result
                    << "; expected " << results[i]
                    << std::endl;
                bNotified = true;
        Timer t;
        for (int i = 0; i < iterations; i++)
            result = msbFunc(inputs[i]);
        double elapsed = t.elapsed();
        if ( !bIsReference )
            std::cout << name << " took " << elapsed << " seconds" << std::endl;
        if (result == -1.0f)
            std::cout << "this comparison only exists to keep the compiler from " <<
            "optimizing out the benchmark; this branch will never be called";

void main()
    std::uniform_int_distribution <u64> dist64(0,
        std::numeric_limits< u64 >::max());
    std::uniform_int_distribution <u32> shift64(0, 63);
    std::vector< u64 > inputs64;
    for (int i = 0; i < iterations; i++)
        inputs64.push_back(dist64(re) >> shift64(re));
    std::vector< u32 > results64;

    test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, true);
    test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, false);
    test< u64 >(msbNative64, "msbNative64", inputs64, results64, false);
    std::cout << std::endl;

    std::uniform_int_distribution <u32> dist32(0,
        std::numeric_limits< u32 >::max());
    std::uniform_int_distribution <u32> shift32(0, 31);
    std::vector< u32 > inputs32;
    for (int i = 0; i < iterations; i++)
        inputs32.push_back(dist32(re) >> shift32(re));
    std::vector< u32 > results32;

    test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, true);

    test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, false);
    test< u32 >(msbFfs32, "msbFfs", inputs32, results32, false);
    test< u32 >(msbPerformanceJunkie32, "msbPerformanceJunkie32",
        inputs32, results32, false);
    test< u32 >(msbDeBruijn32, "msbDeBruijn32", inputs32, results32, false);
    test< u32 >(msbNative32, "msbNative32", inputs32, results32, false);

答案 2 :(得分:19)


// Binary-search style MSB finder: narrows x down to its top non-zero nibble,
// accumulating the shift in r, then finishes with a 16-entry lookup.
// With this table the result is one-based (x == 1 gives 1) and x == 0 gives 0.
unsigned int msb32(unsigned int x)
{
    // bval[v] is the one-based position of the highest set bit of nibble v.
    static const unsigned int bval[] =
    { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };

    unsigned int r = 0;
    if (x & 0xFFFF0000) { r += 16/1; x >>= 16/1; }
    if (x & 0x0000FF00) { r += 16/2; x >>= 16/2; }
    if (x & 0x000000F0) { r += 16/4; x >>= 16/4; }
    return r + bval[x];
}

答案 3 :(得分:12)


查看一些实现 here(在“整数的以 2 为底的对数”(integer log base 2)一节下)。如果您正在使用 GCC,请查看函数 __builtin_clz 和 __builtin_clzl(分别针对非零的 unsigned int 和 unsigned long 执行此操作)。“clz”代表“计数前导零”(count leading zeros),这是描述同一问题的另一种方式。


答案 4 :(得分:5)

查找BSR(位扫描反向)x86 asm指令,以便以最快的方式执行此操作。来自英特尔的文档: Searches the source operand (second operand) for the most significant set bit (1 bit). If a most significant 1 bit is found, its bit index is stored in the destination operand (first operand).

答案 5 :(得分:3)

答案 6 :(得分:2)


/* Returns the zero-based index of the most significant set bit of x.
 * An input of 0 is defined to return 0. */
int getmsb (unsigned long long x)
{
    int r = 0;
    if (x < 1) return 0;
    /* Count how many times x can be shifted right before it becomes zero. */
    while (x >>= 1) r++;
    return r;
}

通过将输入指定为 unsigned long long,它可以处理从 unsigned char 到 unsigned long long 的所有数值类型,并且按照标准的定义,它在 x86_64 和 x86 构建之间兼容。输入为 0 的情况定义为返回 0,但可以根据需要进行更改。一个简单的测试和输出是:

/* Demo driver: prints the value and getmsb() result for one sample of each
 * unsigned integer width, plus one signed int. */
int main (int argc, char *argv[]) {

    unsigned char c0 = 0;
    unsigned char c = 216;
    unsigned short s = 1021;
    unsigned int ui = 32768;
    unsigned long ul = 3297381253;
    unsigned long long ull = 323543844043;

    int i = 32767;

    /* The command-line arguments are not used by this demo. */
    (void) argc;
    (void) argv;

    printf ("  %16u  MSB : %d\n", c0, getmsb (c0));
    printf ("  %16u  MSB : %d\n", c, getmsb (c));
    printf ("  %16u  MSB : %d\n", s, getmsb (s));
    printf ("  %16u  MSB : %d\n", i, getmsb (i));
    printf ("  %16u  MSB : %d\n", ui, getmsb (ui));
    printf ("  %16lu  MSB : %d\n", ul, getmsb (ul));
    printf ("  %16llu  MSB : %d\n", ull, getmsb (ull));

    return 0;
}


             0  MSB : 0
           216  MSB : 7
          1021  MSB : 9
         32767  MSB : 14
         32768  MSB : 15
    3297381253  MSB : 31
  323543844043  MSB : 38


答案 7 :(得分:2)

如果您正在使用x86,那么您可以使用SSE2操作实际上击败任何逐字节或逐字解决方案,并结合查找第一位指令(在gcc世界中)最低位发音为“ffs”,最高位为“fls”。 请原谅我在答案中遇到麻烦(!@#$%^)格式化“C”代码;查看: http://mischasan.wordpress.com/2011/11/03/sse2-bit-trick-ffsfls-for-xmm-registers/

答案 8 :(得分:1)


////// go.c ////////
#include <stdio.h>

/* Index of the top bit of an `unsigned` (bit width minus one); the `<< 3`
 * assumes 8-bit bytes. Declared const because it is never meant to change. */
const unsigned NUM_BITS_U = ((sizeof(unsigned) << 3) - 1);

/* Zero-based position of the highest set bit of a. */
#define POS_OF_HIGHESTBITclz(a) (NUM_BITS_U - __builtin_clz(a)) /* only works for a != 0 */

/* Value with only the highest set bit of a kept; 0 when a is 0. */
#define NUM_OF_HIGHESTBITclz(a) ((a)                                \
                             ? (1U << POS_OF_HIGHESTBITclz(a))      \
                             : 0)

int main()
  unsigned ui;

  for (ui = 0U; ui < 18U; ++ui)
    printf("%i \t %i\n", ui, NUM_OF_HIGHESTBITclz(ui));

  return 0;

答案 9 :(得分:1)


但是不幸的是,没有可移植的内部函数能够在所有编译器上高效地公开这些指令。GNU C 提供了 `__builtin_clz`,但是当前的 GCC 和 ICC 无法将 `unsigned bitidx = 31 - __builtin_clz(x);` 优化回仅一条 BSR。(clang 可以做到,这证明了该表达式是等效的,因此这种优化是可行的。)

以下内容定义了 `BSR32()` 和 `BSR64()` 宏或函数,它们可以高效地编译为 x86 上的仅一条 `bsr` 指令。(如果输入为零,则产生垃圾结果。内在函数无法利用 asm 指令在输入为 0 时保持目标操作数不变的行为。)

要移植到非 x86 平台,还需要额外的 `#ifdef`,例如回退到 `31-__builtin_clz`。大多数非 x86 ISA 如果有前导零位扫描指令,都是计算前导零的个数,而不是给出位索引。这就是 GNU C 将 `__builtin_clz` 定义为可移植内置函数的原因。(如果目标系统上没有硬件支持,则内置函数将编译为软件仿真,通常调用 libgcc 辅助函数。)


#include <stdint.h>

// define BSR32() and BSR64()
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
    #ifdef __INTEL_COMPILER
        typedef unsigned int bsr_idx_t;
    #else
        #include <intrin.h>   // MSVC
        typedef unsigned long bsr_idx_t;
    #endif

    static inline
    unsigned BSR32(unsigned long x){
        bsr_idx_t idx;
        _BitScanReverse(&idx, x); // ignore bool retval
        return idx;
    }
    static inline
    unsigned BSR64(uint64_t x) {
        bsr_idx_t idx;
        _BitScanReverse64(&idx, x); // ignore bool retval
        return idx;
    }
#elif defined(__GNUC__)

  #ifdef __clang__
    static inline unsigned BSR64(uint64_t x) {
        return 63-__builtin_clzll(x);
      // gcc/ICC can't optimize this back to just BSR, but clang can and doesn't provide alternate intrinsics
    }
  #else
    #define BSR64 __builtin_ia32_bsrdi
  #endif

    #include <x86intrin.h>
    #define BSR32(x) _bit_scan_reverse(x)
#endif

对于 `bsf`,编译器可能不需要太多帮助,因为内建函数与 asm 指令返回 LSB 的位索引(即尾随零的计数)的行为相匹配。

测试调用者 `unsigned test32(unsigned x) { return BSR32(x); }` 在所有主要 x86 编译器上(见 the Godbolt compiler explorer)都被内联为 1 条指令。BSR64 以相同的方式内联为 64 位操作数大小的版本。有关用例,另请参见 Is there an x86/x86_64 instruction which zeros all bits below the Most Significant Bit?

// Test caller: forwards x to BSR32() and returns its result unchanged.
unsigned test32(unsigned x) {  return BSR32(x);  }
;; x64 MSVC 19.16 -O2
unsigned int test32(unsigned int) PROC                                    ; test32, COMDAT
        bsr     eax, ecx
        ret     0
unsigned int test32(unsigned int) ENDP                                    ; test32
# clang -O3 -march=haswell   is too "smart?" for its own good:
test32(unsigned int):
        lzcnt   eax, edi
        xor     eax, 31
# gcc8.2 -O3 -march=haswell
test32(unsigned int):
        bsr     eax, edi


# ICC19 -O3 -march=haswell
test32(unsigned int):
        bsr       eax, edi                                      #15.9
        ret                                                     #41.12

考虑如下可移植(非 MSVC)的写法:

#ifdef __GNUC__
unsigned badgcc(uint64_t x) {
    return 63 - __builtin_clzll(x);
}
#endif

如果没有 `-march=haswell`,我们只会从 clang 中获得 BSR,但是:

# gcc8.2 -O3
badgcc(unsigned long):
        bsr     rdi, rdi
        mov     eax, 63
        xor     rdi, 63
        sub     eax, edi

# ICC19.0.1 -O3
badgcc(unsigned long):
        mov       rax, -1                                       #46.17
        bsr       rdx, rdi                                      #46.17
        cmove     rdx, rax                                      #46.17
        neg       rdx                                           #46.17
        add       rdx, 63                                       #46.17
        neg       edx                                           #46.17
        add       edx, 63                                       #46.17
        mov       eax, edx                                      #46.17
        ret                                                     #46.17

那太糟糕了。(有趣的是,如果输入为零,ICC 会执行 CMOV 以产生 `-1`。BSR 根据其*输入*设置 ZF,这与大多数指令根据结果设置标志不同。)

使用 `-march=haswell`(或通过其他方式启用 BMI1 指令)时,情况没有那么糟,但仍然不如仅用 BSR 好。这里暂不考虑输出依赖(output dependency)问题:编译器通常会为 lzcnt 设法避免它,但奇怪的是不会为 BSR 避免。(对于 BSR,由于 input = 0 时的行为,输出依赖是*真*依赖。)另请参见 Why does breaking the "output dependency" of LZCNT matter?

答案 10 :(得分:1)


typedef unsigned long long u64;
typedef unsigned int       u32;
typedef unsigned char      u8;

// Returns the zero-based index of the most significant set bit of u64Val.
// Precondition (asserted): u64Val != 0.
// Binary search over the shift amounts 32, 16, 8, 4, 2, 1: whenever the
// upper part is non-zero, the search continues there and the shift is
// recorded in the result.
u8 findMostSignificantBit (u64 u64Val)
{
  u8 u8Shift;
  u8 u8Bit = 0;

  assert (u64Val != 0ULL);

  for (u8Shift = 32 ; u8Shift != 0 ; u8Shift >>= 1)
  {
    u64 u64Temp = u64Val >> u8Shift;
    if (u64Temp)
    {
      u8Bit |= u8Shift; // notice not using +=
      u64Val = u64Temp;
    }
  }

  return u8Bit;
}

当然,这是在64位数(无符号长long)上工作,而不是数组。此外,很多人都指出了我不知道的内置g ++函数。多么有趣。


我也使用| =而不是+ =因为它们总是2的幂,而OR(经典地)比加法更快。由于我只是将2的独特权力加在一起,所以我从来没有翻过来。



// Returns the zero-based index of the most significant set bit of u64Val
// using the GCC/Clang count-leading-zeros builtin.
// Precondition (asserted): u64Val != 0; the builtin is undefined for 0.
u8 findMostSignificantBit2 (u64 u64Val)
{
  assert (u64Val != 0ULL);

  // 63 minus the leading-zero count is the MSB index. __builtin_ctzll counts
  // trailing zeros and would give the LOWEST set bit instead.
  return (u8) (63 - __builtin_clzll(u64Val));
}

答案 11 :(得分:1)


//// C program
#include <math.h>

/* Zero-based position of the highest set bit of a, computed as the
 * truncated base-2 logarithm. */
#define POS_OF_HIGHESTBIT(a) /* 0th position is the Least-Signif-Bit */    \
((unsigned) log2(a))         /* thus: do not use if a <= 0 */  

/* Value with only the highest set bit of a kept; 0 when a is 0.
 * The shift uses 1U so that position 31 does not overflow a signed int. */
#define NUM_OF_HIGHESTBIT(a) ((!(a))          \
        ? 0 /* no msb set*/                   \
        : (1U << POS_OF_HIGHESTBIT(a) ))
// could be changed and optimized, if it is known that the following NEVER holds: a <= 0

/* Minimal usage example for NUM_OF_HIGHESTBIT. */
int main()
{
  unsigned a = 5; // 0b101
  unsigned b = NUM_OF_HIGHESTBIT(a); // 4 since 4 = 0b100
  (void) b; /* the value is computed for illustration only */
  return 0; 
}

答案 12 :(得分:1)



// Decision tree over the byte value b: returns how far the highest set bit
// sits below bit 7 (0 when bit 7 is set, ... 6 when only bit 1 is the
// highest, 7 otherwise). At most three comparisons are evaluated.
if (b>=0x10) {
  if (b>=0x40) {
    if (b>=0x80) return 0;
    else return 1;
  } else {
    if (b>=0x20) return 2;
    else return 3;
  }
} else {
  if (b>=0x4) {
    if (b>=0x8) return 4;
    else return 5;
  } else {
    if (b>=0x2) return 6;
    else return 7;
  }
}





答案 13 :(得分:0)



#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>

/* Scans test_value from bit 31 downwards and prints the offset i (counted
 * from the top bit) of the first set bit found. */
int main()
{
    uint32_t test_value = 0x00a1;
    int i;

    for (i=0; i<32; ++i)
    {
        if (test_value & (0x80000000 >> i))
        {
            printf("i = %d\n", i);
            break; /* only the most significant set bit is wanted */
        }
    }

    return 0;
}

答案 14 :(得分:0)


int msb( unsigned char x);  // prototype for function that returns 
                            //  most significant bit set

unsigned char* p;

for (p = arr + num_elements; p != arr;) {
    if (*p != 0) break;

// p is with pointing to the last byte that has a bit set, or
//  it's pointing to the first byte in the array

if (*p) {
    return ((p - arr) * 8) + msb( *p);

// what do you want to return if no bits are set?
return -1;

我会将其作为练习留给读者:提出合适的 msb() 函数,并将其优化为按 int 或 long long 大小的数据块进行处理。

答案 15 :(得分:0)


static public final int msb(int n) {
    n |= n >>> 1;  
    n |= n >>> 2; 
    n |= n >>> 4; 
    n |= n >>> 8; 
    n |= n >>> 16; 
    n >>>= 1;
    n += 1; 
    return n;


static public final int msb_index(int n) {

    final int[] multiply_de_bruijn_bit_position = {
        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    return multiply_de_bruijn_bit_position[(msb(n) * 0x077CB531) >>> 27];

答案 16 :(得分:-3)

#define FFS(t)  \
({ \
register int n = 0; \
if (!(0xffff & t)) \
    n += 16; \
if (!((0xff << n) & t)) \
    n += 8; \
if (!((0xf << n) & t)) \
    n += 4; \
if (!((0x3 << n) & t)) \
    n += 2; \
if (!((0x1 << n) & t)) \
    n += 1; \
n; \