我的c代码中有几个uint8_t数组,我想比较一个与另一个的任意序列位。例如,我有bitarray_1和bitarray_2,我想比较bitarray_1的bit 13 - 47和bitarray_2的5-39位。最有效的方法是什么?
目前这是我程序中的一个巨大瓶颈,因为我只是有一个简单的实现,将这些位复制到一个新的临时数组的开头,然后对它们使用memcmp。
答案 0 :(得分:4)
三个字:shift,mask和xor。
转换为两个bitarray获得相同的内存对齐。如果不是,您必须在比较之前移动其中一个阵列。您的示例可能会产生误导,因为位13-47和5-39在8位地址上具有相同的内存对齐。如果您将位14-48与位5-39进行比较,则情况并非如此。
一旦所有内容都对齐并且超出了表边界的位,xor就足以一次执行所有位的比较。基本上你可以设法只为每个数组读取一个内存,这应该非常有效。
如果两个数组的内存对齐方式与示例memcmp相同,则上限和下限的特殊情况可能更快。
同样通过uint32_t(或64位架构上的uint64_t)访问数组也应该比通过uint8_t访问更有效。
原则很简单,但正如Andrejs所说,实施并非无痛......
这是怎么回事(与@caf提案的相似之处并非巧合):
/* compare_bit_sequence() */
int compare_bit_sequence(uint8_t s1[], unsigned s1_off, uint8_t s2[], unsigned s2_off,
unsigned length)
{
const uint8_t mask_lo_bits[] =
{ 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
const uint8_t clear_lo_bits[] =
{ 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 };
uint8_t v1;
uint8_t * max_s1;
unsigned end;
uint8_t lsl;
uint8_t v1_mask;
int delta;
/* Makes sure the offsets are less than 8 bits */
s1 += s1_off >> 3;
s1_off &= 7;
s2 += s2_off >> 3;
s2_off &= 7;
/* Make sure s2 is the sequence with the shorter offset */
if (s2_off > s1_off){
uint8_t * tmp_s;
unsigned tmp_off;
tmp_s = s2; s2 = s1; s1 = tmp_s;
tmp_off = s2_off; s2_off = s1_off; s1_off = tmp_off;
}
delta = s1_off;
/* handle the beginning, s2 incomplete */
if (s2_off > 0){
delta = s1_off - s2_off;
v1 = delta
? (s1[0] >> delta | s1[1] << (8 - delta)) & clear_lo_bits[delta]
: s1[0];
if (length <= 8 - s2_off){
if ((v1 ^ *s2)
& clear_lo_bits[s2_off]
& mask_lo_bits[s2_off + length]){
return NOT_EQUAL;
}
else {
return EQUAL;
}
}
else{
if ((v1 ^ *s2) & clear_lo_bits[s2_off]){
return NOT_EQUAL;
}
length -= 8 - s2_off;
}
s1++;
s2++;
}
/* main loop, we test one group of 8 bits of v2 at each loop */
max_s1 = s1 + (length >> 3);
lsl = 8 - delta;
v1_mask = clear_lo_bits[delta];
while (s1 < max_s1)
{
if ((*s1 >> delta | (*++s1 << lsl & v1_mask)) ^ *s2++)
{
return NOT_EQUAL;
}
}
/* last group of bits v2 incomplete */
end = length & 7;
if (end && ((*s2 ^ *s1 >> delta) & mask_lo_bits[end]))
{
return NOT_EQUAL;
}
return EQUAL;
}
尚未使用所有可能的优化措施。一个有希望的是使用更大的数据块(一次64位或32位而不是8位),您还可以检测两个阵列的偏移同步的情况,在这种情况下使用memcmp而不是主循环,替换逻辑运算符&amp; modulos%8 7,用'&gt;'替换'/ 8'&gt; 3'等等,必须分支代码而不是交换s1和s2等,但主要目的是实现:每个数组项只有一个内存读取而不是内存写入,因此大部分工作可以在处理器寄存器内部进行
答案 1 :(得分:1)
bitarray_1
的第13 - 47位与bitarray_1 + 1
的第5 - 39位相同
将前3位(5 - 7)与掩码进行比较,将其他位(8 - 39)与memcmp()
进行比较。
不是移位和复制这些位,而是以不同方式表示它们更快。你必须衡量。
/* code skeleton */
static char bitarray_1_bis[BIT_ARRAY_SIZE*8+1];
static char bitarray_2_bis[BIT_ARRAY_SIZE*8+1];
static const char *lookup_table[] = {
"00000000", "00000001", "00000010" /* ... */
/* 256 strings */
/* ... */ "11111111"
};
/* copy every bit of bitarray_1 to an element of bitarray_1_bis */
for (k = 0; k < BIT_ARRAY_SIZE; k++) {
strcpy(bitarray_1_bis + 8*k, lookup_table[bitarray_1[k]]);
strcpy(bitarray_2_bis + 8*k, lookup_table[bitarray_2[k]]);
}
memcmp(bitarray_1_bis + 13, bitarray_2_bis + 5, 47 - 13 + 1);
您可以(并且应该)将副本限制在尽可能小的范围内。
我不知道它是否更快,但如果是的话我也不会感到惊讶。同样,你必须衡量。
答案 2 :(得分:1)
最简单的方法是将更复杂的案例转换为更简单的案例,然后解决更简单的案例。
在下面的代码中,do_compare()
解决了更简单的情况(序列永远不会偏移超过7位,s1
总是偏移s2
,并且序列的长度不为零)。然后compare_bit_sequence()
函数负责将更难的案例转换为更容易的案例,并调用do_compare()
来完成工作。
这只是通过位序列进行单次传递,所以希望这是对copy-and-memcmp实现的改进。
#define NOT_EQUAL 0
#define EQUAL 1
/* do_compare()
*
* Does the actual comparison, but has some preconditions on parameters to
* simplify things:
*
* length > 0
* 8 > s1_off >= s2_off
*/
int do_compare(const uint8_t s1[], const unsigned s1_off, const uint8_t s2[],
const unsigned s2_off, const unsigned length)
{
const uint8_t mask_lo_bits[] =
{ 0xff, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
const uint8_t mask_hi_bits[] =
{ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
const unsigned msb = (length + s1_off - 1) / 8;
const unsigned s2_shl = s1_off - s2_off;
const unsigned s2_shr = 8 - s2_shl;
unsigned n;
uint8_t s1s2_diff, lo_bits = 0;
for (n = 0; n <= msb; n++)
{
/* Shift s2 so it is aligned with s1, pulling in low bits from
* the high bits of the previous byte, and store in s1s2_diff */
s1s2_diff = lo_bits | (s2[n] << s2_shl);
/* Save the bits needed to fill in the low-order bits of the next
* byte. HERE BE DRAGONS - since s2_shr can be 8, this below line
* only works because uint8_t is promoted to int, and we know that
* the width of int is guaranteed to be >= 16. If you change this
* routine to work with a wider type than uint8_t, you will need
* to special-case this line so that if s2_shr is the width of the
* type, you get lo_bits = 0. Don't say you weren't warned. */
lo_bits = s2[n] >> s2_shr;
/* XOR with s1[n] to determine bits that differ between s1 and s2 */
s1s2_diff ^= s1[n];
/* Look only at differences in the high bits in the first byte */
if (n == 0)
s1s2_diff &= mask_hi_bits[8 - s1_off];
/* Look only at differences in the low bits of the last byte */
if (n == msb)
s1s2_diff &= mask_lo_bits[(length + s1_off) % 8];
if (s1s2_diff)
return NOT_EQUAL;
}
return EQUAL;
}
/* compare_bit_sequence()
*
* Adjusts the parameters to match the preconditions for do_compare(), then
* calls it to do the work.
*/
int compare_bit_sequence(const uint8_t s1[], unsigned s1_off,
const uint8_t s2[], unsigned s2_off, unsigned length)
{
/* Handle length zero */
if (length == 0)
return EQUAL;
/* Makes sure the offsets are less than 8 bits */
s1 += s1_off / 8;
s1_off %= 8;
s2 += s2_off / 8;
s2_off %= 8;
/* Make sure s2 is the sequence with the shorter offset */
if (s1_off >= s2_off)
return do_compare(s1, s1_off, s2, s2_off, length);
else
return do_compare(s2, s2_off, s1, s1_off, length);
}
要在您的示例中进行比较,请致电:
compare_bit_sequence(bitarray_1, 13, bitarray_2, 5, 35)
(注意,我将这些位编号为零,假设位阵列是小端的,所以这将从bitarray2 [0]中的第六个最低位开始比较,第六个 - bitarray1 [1]中的最不重要的位。
答案 3 :(得分:0)
如何编写将计算两个数组的偏移量的函数,应用掩码,移位位并将结果存储到int中,以便比较它们。如果位数(在您的示例中为34)超过了int - recurse或loop的长度。
很抱歉,这个例子很痛苦。
答案 4 :(得分:0)
这是我未经优化的位序列比较函数:
#include <stdio.h>
#include <stdint.h>
// 01234567 01234567
uint8_t bitsA[] = { 0b01000000, 0b00010000 };
uint8_t bitsB[] = { 0b10000000, 0b00100000 };
int bit( uint8_t *bits, size_t bitpoz, size_t len ){
return (bitpoz<len)? !!(bits[bitpoz/8]&(1<<(7-bitpoz%8))): 0;
}
int bitcmp( uint8_t *bitsA, size_t firstA, size_t lenA,
uint8_t *bitsB, size_t firstB, size_t lenB ){
int cmp;
for( size_t i=0; i<lenA || i<lenB; i++ ){
if( (cmp = bit(bitsA,firstA+i,firstA+lenA) -
bit(bitsB,firstB+i,firstB+lenB)) ) return cmp;
}
return 0;
}
int main(){
printf( "cmp: %i\n", bitcmp( bitsA,1,11, bitsB,0,11 ) );
}
编辑:这是我的(未经测试的)bitstring相等测试函数:
#include <stdlib.h>
#include <stdint.h>
#define load_64bit(bits,first) (*(uint64_t*)bits<<first | *(bits+8)>>(8-first))
#define load_32bit(bits,first) (*(uint32_t*)bits<<first | *(bits+4)>>(8-first))
#define load_16bit(bits,first) (*(uint16_t*)bits<<first | *(bits+2)>>(8-first))
#define load_8bit( bits,first) ( *bits<<first | *(bits+1)>>(8-first))
static inline uint8_t last_bits( uint8_t *bits, size_t first, size_t size ){
return (first+size>8?load_8bit(bits,first):*bits<<first)>>(8-size);
}
int biteq( uint8_t *bitsA, size_t firstA,
uint8_t *bitsB, size_t firstB, size_t size ){
if( !size ) return 1;
bitsA+=firstA/8; firstA%=8;
bitsB+=firstB/8; firstB%=8;
for(; size>64;size-=64,bitsA+=8,bitsB+=8)
if(load_64bit(bitsA,firstA)!=load_64bit(bitsB,firstB)) return 0;
for(; size>32;size-=32,bitsA+=4,bitsB+=4)
if(load_32bit(bitsA,firstA)!=load_32bit(bitsB,firstB)) return 0;
for(; size>16;size-=16,bitsA+=2,bitsB+=2)
if(load_16bit(bitsA,firstA)!=load_16bit(bitsB,firstB)) return 0;
for(; size> 8;size-= 8,bitsA++, bitsB++ )
if(load_8bit( bitsA,firstA)!=load_8bit( bitsB,firstB)) return 0;
return !size ||
last_bits(bitsA,firstA,size)==last_bits(bitsB,firstB,size);
}
我制作了一个简单的测量工具,看它有多快:
#include <unistd.h>
#include <stdio.h>
#include <signal.h>
#define SIZE 1000000
uint8_t bitsC[SIZE];
volatile int end_loop;
void sigalrm_hnd( int sig ){ (void)sig; end_loop=1; }
int main(){
uint64_t loop_count; int cmp;
signal(SIGALRM,sigalrm_hnd);
loop_count=0; end_loop=0; alarm(10);
while( !end_loop ){
for( int i=1; i<7; i++ ){
loop_count++;
cmp = biteq( bitsC,i, bitsC,7-i,(SIZE-1)*8 );
if( !cmp ){ printf( "cmp: %i (==0)\n", cmp ); return -1; }
}
}
printf( "biteq: %.2f round/sec\n", loop_count/10.0 );
}
结果:
bitcmp: 8.40 round/sec
biteq: 363.60 round/sec
EDIT2:last_bits()已更改。