考虑一个64位无符号整数,它恰好包含一个值为0000b的半字节(nibble),且该半字节位于可被4整除的位位置上。
是否存在次线性算法(即优于O(16)的算法)来提取这个零半字节的位置?SIMD解决方案也可以接受。
答案 0 :(得分:1)
一种方法是使用Alan Mycroft的空字节检测算法的变体。在该算法中,包含零的字节将变为0x80,其他字节将变为0x00。
只需调整掩码,就可以很容易地将其改为处理半字节而不是字节。然后,使用POSIX函数ffsll()找到第一个置位的位,并对位索引做必要的调整:因为ffsll()返回的是从1开始计数的位位置,而Mycroft算法标记的是零半字节的最高有效位,而不是最低有效位。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* Adapted from Alan Mycroft's null-byte detection algorithm
(newsgroup comp.lang.c, 1987/04/08,
https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ)
*/
/*
 * Locate the least significant all-zero nibble of `a`.
 *
 * Returns the bit offset (0, 4, ..., 60) of that nibble, or -1 when no
 * nibble of `a` is zero.
 */
int zero_nibble_position (uint64_t a)
{
    const uint64_t low_bits  = 0x1111111111111111ULL; /* bit 0 of each nibble */
    const uint64_t high_bits = 0x8888888888888888ULL; /* bit 3 of each nibble */

    /* Subtracting 1 from every nibble makes a zero nibble wrap around and set
       its bit 3; masking with ~a keeps only nibbles whose own bit 3 was
       clear. Flags above the lowest one can be spurious because of borrow
       propagation, so only the lowest flag is used below. */
    const uint64_t flags = (a - low_bits) & ~a & high_bits;

    if (flags == 0) {
        return -1;
    }

    /* ffsll() is one-based and the flag sits on bit 3 of the nibble, so
       subtracting 4 yields the zero-based offset of the nibble's bit 0. */
    return ffsll (flags) - 4;
}
/*
 * Reference implementation: scan the sixteen nibbles of `a` from least to
 * most significant and return the bit offset (0, 4, ..., 60) of the first
 * one that is zero, or -1 if every nibble is non-zero.
 */
int zero_nibble_position_ref (uint64_t a)
{
    for (int shift = 0; shift < 64; shift += 4) {
        if (((a >> shift) & 0xfULL) == 0) {
            return shift;
        }
    }
    return -1;
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...@gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
/* File-scope generator state. Every expansion of KISS64 (or of one of its
   component macros) reads and overwrites these variables. */
/* kiss64_x / kiss64_c: value and carry word updated by MWC64. */
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
/* kiss64_y: state updated by XSH64. */
static uint64_t kiss64_y = 362436362436362436ULL;
/* kiss64_z: state updated by CNG64. */
static uint64_t kiss64_z = 1066149217761810ULL;
/* kiss64_t: scratch value written by MWC64 before it is read. */
static uint64_t kiss64_t;
/* MWC64: t = (x << 58) + c; c = x >> 6; x += t; c is then incremented when
   that addition wrapped around (detected by x < t). The comma operators
   sequence these steps; the expression's value is the new x. */
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
/* XSH64: three xor-shift steps on y (left 13, right 17, left 43); the
   expression's value is the new y. */
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
/* CNG64: z = 6906969069 * z + 1234567 in wrapping uint64_t arithmetic; the
   expression's value is the new z. */
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
/* KISS64: sum of the three component outputs. The operands of '+' are
   unsequenced relative to each other, but each one modifies a disjoint set
   of the variables above. */
#define KISS64 (MWC64 + XSH64 + CNG64)
/*
 * Randomized self-test: compares zero_nibble_position() against
 * zero_nibble_position_ref() on 1,000,000,000 values drawn from KISS64.
 *
 * On the first mismatch, prints the input and both results and returns
 * EXIT_FAILURE; returns EXIT_SUCCESS when all values agree.
 */
int main (void)
{
    for (int i = 0; i < 1000000000; i++) {
        uint64_t a = KISS64;
        int res = zero_nibble_position (a);
        int ref = zero_nibble_position_ref (a);
        if (res != ref) {
            /* uint64_t is not necessarily unsigned long long (it is
               unsigned long on common LP64 platforms), so convert explicitly
               to the type that %llx requires. */
            printf ("a=%016llx res=%d ref=%d\n",
                    (unsigned long long)a, res, ref);
            return EXIT_FAILURE;
        }
    }
    return EXIT_SUCCESS;
}
如果您的平台不支持POSIX函数ffsll(),则可以改用编译器特定的内置函数,例如gcc的__builtin_ctzll()、MSVC的_BitScanForward64()或Intel编译器的_tzcnt_u64();也可以使用内联汇编来访问返回尾随零计数的机器指令;或者自行实现,例如:
/*
 * Count the leading zero bits of `a`.
 *
 * Returns a value in 0..64; the result is 64 when `a` is zero.
 */
int clzll (uint64_t a)
{
    int leading = 64;

    /* Binary search for the highest set bit: whenever the upper part of the
       remaining value is non-zero, drop the lower part and account for it. */
    for (int shift = 32; shift >= 2; shift >>= 1) {
        if ((a >> shift) != 0) {
            a >>= shift;
            leading -= shift;
        }
    }

    /* Here a is in 0..3; its bit length is 0, 1, 2, 2 respectively. */
    leading -= (a >= 2) ? 2 : (int)a;
    return leading;
}
/*
 * Stand-in for ffsll() built on clzll(): returns the one-based index of the
 * least significant set bit of `a`, or 0 when `a` is zero.
 */
int ffsll (uint64_t a)
{
    /* Two's-complement trick: a & -a keeps only the lowest set bit. */
    const uint64_t lowest = a & (~a + 1u);

    return 64 - clzll (lowest);
}
由于这里我们不需要完全通用的ffsll()实现,因此还可以构建一个更快的变体:它不依赖_tzcnt_u64()之类的指令,而是使用寄存器内查找表:
/*
 * Return the one-based position (4, 8, ..., 64) of the single set bit in
 * `a`, which must be bit 3 of one of the sixteen nibbles.
 *
 * For a == 0 the expression evaluates to 4, so callers must pass a value
 * with a bit set.
 */
int bit_pos (uint64_t a)
{
    /* In-register lookup table: hex digit k counted from the top is k. */
    const uint64_t nibble_index_lut = 0x0123456789abcdefULL;

    /* Move the flag to bit 0 of its nibble, i.e. 1 << (4 * k). Multiplying
       by it shifts the table left by 4 * k bits, so entry k lands in the
       top nibble. */
    const uint64_t one_hot = a >> 3;
    const uint64_t k = (one_hot * nibble_index_lut) >> 60;

    return (int)(4 * (k + 1));
}
/*
 * Restricted ffsll() replacement: only valid when every set bit of `a` is
 * bit 3 (the MSB) of a nibble. Returns the one-based position of the least
 * significant such bit via bit_pos().
 *
 * Define NEVER_MORE_THAN_ONE_ZERO_NIBBLE to a non-zero value when `a` is
 * known to have exactly one bit set; the isolation step is then skipped.
 */
int ffsll (uint64_t a)
{
#if !NEVER_MORE_THAN_ONE_ZERO_NIBBLE
    /* Several bits may be set: reduce to the least significant one. */
    a &= (~a + 1u);
#endif /* !NEVER_MORE_THAN_ONE_ZERO_NIBBLE */
    return bit_pos (a);
}