我想把一个8位值的每一位都重复4次,从而扩展成一个32位值,想问问能否用单行算法来实现这种位复制。
例如:
1100 1011 -> 1111 1111 0000 0000 1111 0000 1111 1111
如果可能,我想了解其背后的逻辑是什么。
答案 0 :(得分:6)
总共只有256个8位值,因此一个简单的查找表只占用1KB,查找也极其简单。很难相信有哪种位操作技巧(bithack)的性能会比它更好。
答案 1 :(得分:5)
很简单-解决最简单的情况,然后再处理更复杂的情况。
您只需要在相邻的位之间插入3个零位,把各个位分散开。完成此操作后,最后一步是:
x = (x << 0) | (x << 1) | (x << 2) | (x << 3);
或者以一种不太明显但更快的方式:
x = (x << 4) - x;
情况1:将2位复制为8位值(最简单)。
+---+---------+---------+
| 0 | _ _ _ _ | _ _ A B |
+---+---------+---------+
| 1 | _ _ _ A | _ _ _ B |
+---+---------+---------+
| 2 | A A A A | B B B B |
+---+---------+---------+
情况2:将4位复制为16位值。怎么做?只需将高2位移到高半部分,问题就变成了两个情况1!分而治之!
+---+---------+---------+---------+---------+
| 0 | _ _ _ _ | _ _ _ _ | _ _ _ _ | A B C D |
+---+---------+---------+---------+---------+
| 1 | _ _ _ _ | _ _ A B | _ _ _ _ | _ _ C D |
+---+---------+---------+---------+---------+
| 2 | _ _ _ A | _ _ _ B | _ _ _ C | _ _ _ D |
+---+---------+---------+---------+---------+
| 3 | A A A A | B B B B | C C C C | D D D D |
+---+---------+---------+---------+---------+
情况3:将8位复制为32位值(即原始问题)。
+---+---------+---------+---------+---------+---------+---------+---------+---------+
| 0 | _ _ _ _ | _ _ _ _ | _ _ _ _ | _ _ _ _ | _ _ _ _ | _ _ _ _ | A B C D | E F G H |
+---+---------+---------+---------+---------+---------+---------+---------+---------+
| 1 | _ _ _ _ | _ _ _ _ | _ _ _ _ | A B C D | _ _ _ _ | _ _ _ _ | _ _ _ _ | E F G H |
+---+---------+---------+---------+---------+---------+---------+---------+---------+
| 2 | _ _ _ _ | _ _ A B | _ _ _ _ | _ _ C D | _ _ _ _ | _ _ E F | _ _ _ _ | _ _ G H |
+---+---------+---------+---------+---------+---------+---------+---------+---------+
| 3 | _ _ _ A | _ _ _ B | _ _ _ C | _ _ _ D | _ _ _ E | _ _ _ F | _ _ _ G | _ _ _ H |
+---+---------+---------+---------+---------+---------+---------+---------+---------+
| 4 | A A A A | B B B B | C C C C | D D D D | E E E E | F F F F | G G G G | H H H H |
+---+---------+---------+---------+---------+---------+---------+---------+---------+
可以通过以下代码实现:
/*
 * Expand every bit of an 8-bit value into a full nibble of the 32-bit
 * result: input bit i ends up filling output bits 4*i .. 4*i+3.
 */
uint32_t interleave(uint8_t value)
{
    uint32_t spread = value;

    /* Copy the byte 12 places up so the high nibble also sits at bits 16..19.
       No 0x000F000F mask is applied here: the mask in the next step already
       throws away the stray copies. */
    spread |= spread << 12;

    /* Break each nibble into two bit pairs, one pair at the bottom of each byte. */
    spread |= spread << 6;
    spread &= 0x03030303;

    /* Break each pair apart so one source bit sits at the bottom of each nibble. */
    spread |= spread << 3;
    spread &= 0x11111111;

    /* 16*s - s == 15*s: turns every nibble holding 1 into 0xF. */
    return (spread << 4) - spread;
}
一些测试用例,以检查其是否有效:
// Checks interleave() against hand-written expected values: each set bit i
// of the 8-bit input must show up as the hex digit F at nibble i of the
// 32-bit result, and each clear bit as the hex digit 0.
TEST_F(test, interleave)
{
// Inputs whose high and low nibbles are equal (0x00, 0x11, ..., 0xFF).
EXPECT_EQ(interleave(0x00), 0x00000000);
EXPECT_EQ(interleave(0x11), 0x000F000F);
EXPECT_EQ(interleave(0x22), 0x00F000F0);
EXPECT_EQ(interleave(0x33), 0x00FF00FF);
EXPECT_EQ(interleave(0x44), 0x0F000F00);
EXPECT_EQ(interleave(0x55), 0x0F0F0F0F);
EXPECT_EQ(interleave(0x66), 0x0FF00FF0);
EXPECT_EQ(interleave(0x77), 0x0FFF0FFF);
EXPECT_EQ(interleave(0x88), 0xF000F000);
EXPECT_EQ(interleave(0x99), 0xF00FF00F);
EXPECT_EQ(interleave(0xAA), 0xF0F0F0F0);
EXPECT_EQ(interleave(0xBB), 0xF0FFF0FF);
EXPECT_EQ(interleave(0xCC), 0xFF00FF00);
EXPECT_EQ(interleave(0xDD), 0xFF0FFF0F);
EXPECT_EQ(interleave(0xEE), 0xFFF0FFF0);
EXPECT_EQ(interleave(0xFF), 0xFFFFFFFF);
// Inputs whose high and low nibbles differ.
EXPECT_EQ(interleave(0x01), 0x0000000F);
EXPECT_EQ(interleave(0x23), 0x00F000FF);
EXPECT_EQ(interleave(0x45), 0x0F000F0F);
EXPECT_EQ(interleave(0x67), 0x0FF00FFF);
EXPECT_EQ(interleave(0x89), 0xF000F00F);
EXPECT_EQ(interleave(0xAB), 0xF0F0F0FF);
EXPECT_EQ(interleave(0xCD), 0xFF00FF0F);
EXPECT_EQ(interleave(0xEF), 0xFFF0FFFF);
}
答案 2 :(得分:3)
这将起作用:
/*
 * Expand the low 8 bits of `a` to 32 bits by repeating each bit four times:
 * bit 7 becomes the top nibble of the result, bit 0 the bottom nibble.
 */
unsigned int eToTW (unsigned char a) {
    unsigned int expanded = 0;
    /* Nibble that corresponds to the bit currently selected by `mask`;
       built from an unsigned constant so the shift to bit 28 is well defined. */
    unsigned int nibble = 0xfu << 28;

    /* Walk from the most significant source bit down to the least. */
    for (unsigned int mask = 0x80; mask != 0; mask >>= 1) {
        if (a & mask) {
            expanded |= nibble;
        }
        nibble >>= 4;
    }
    return expanded;
}
或者这个:
/*
 * Expand the low 8 bits of `a` to 32 bits: source bit `pos` is repeated
 * across result bits 4*pos .. 4*pos+3.
 */
unsigned int eToTW (unsigned char a) {
    unsigned int result = 0;

    for (int pos = 0; pos < 8; ++pos) {
        if ((a >> pos) & 1) {
            /* unsigned constant: the shift by 28 must not touch a sign bit */
            result |= 0xfu << (4 * pos);
        }
    }
    return result;
}
另一个解决方案:
/*
 * Expand the low 8 bits of `a` to 32 bits, each source bit becoming one
 * all-ones or all-zeros nibble. Works one source nibble at a time through
 * a small table that maps 4 bits to their 16-bit expansion.
 */
unsigned int eToTW (unsigned char a) {
    static const unsigned int quad[16] = {
        0x0000u, 0x000Fu, 0x00F0u, 0x00FFu,
        0x0F00u, 0x0F0Fu, 0x0FF0u, 0x0FFFu,
        0xF000u, 0xF00Fu, 0xF0F0u, 0xF0FFu,
        0xFF00u, 0xFF0Fu, 0xFFF0u, 0xFFFFu
    };
    /* Only bits 0..7 of `a` take part, hence the explicit 0xF masks. */
    const unsigned int upper = quad[(a >> 4) & 0xFu];
    const unsigned int lower = quad[a & 0xFu];

    return (upper << 16) | lower;
}
答案 3 :(得分:3)
rici的回答中建议的查找表在大多数平台上性能最高。如果您更喜欢使用位运算技巧(bit-twiddling)的方法,那么最佳方案取决于处理器的硬件能力,例如:移位有多快?是否具有三输入逻辑运算(比如我的GPU就有)?可以并行执行多少条整数指令?一种方案是先把每个位移到其目标半字节的lsb,然后在第二步中用该lsb的值填满整个半字节(感谢chqrlie建议使用lsb而不是msb):
#include <stdint.h>
/*
 * Expand each bit of `x` into a nibble of the 32-bit result
 * (bit i of x -> bits 4*i .. 4*i+3).
 */
uint32_t expand_bits_to_nibbles (uint8_t x)
{
    const uint32_t src = x;
    uint32_t lsbs = 0;

    /* Place source bit i on the lowest bit of nibble i: shifting left by
       3*i moves bit i to position 4*i, and the mask keeps only that bit. */
    for (unsigned int i = 0; i < 8; i++) {
        lsbs |= (src << (3u * i)) & ((uint32_t)1 << (4u * i));
    }

    /* 16*v - v == 15*v: every nibble holding 1 becomes 0xF. */
    return (lsbs << 4) - lsbs;
}
使用Compiler Explorer进行的一些快速实验表明,这种写法在某些平台(例如PowerPC64)上能生成特别高效的代码。
如果处理器具有快速的整数乘法器,我们可以利用它一次移动多个位。这里我们以三个源位为一组进行处理,以避免各组的结果相互冲突:
#include <stdint.h>
/*
 * Expand each bit of `x` into a nibble of the 32-bit result, using integer
 * multiplication to move several bits at once.
 */
uint32_t expand_bits_to_nibbles_mul (uint8_t x)
{
    /* Multiplier with bits 0, 3 and 6 set: it produces three copies of a
       source group, each shifted 3 places further than the previous one. */
    const uint32_t triple = 0x49u;
    /* Lowest bit of every nibble. */
    const uint32_t nibble_lsbs = 0x11111111u;

    /* Source bits are handled in groups of 3, 3 and 2; the products occupy
       disjoint bit ranges, so adding them cannot produce carries. */
    const uint32_t low  = (uint32_t)(x & 0x07u) * triple;
    const uint32_t mid  = (uint32_t)(x & 0x38u) * (triple << 9);
    const uint32_t high = (uint32_t)(x & 0xC0u) * (triple << 18);

    /* Keep only the copy of each bit that landed on a nibble's lowest bit. */
    const uint32_t lsbs = (low + mid + high) & nibble_lsbs;

    /* 16*v - v == 15*v: every nibble holding 1 becomes 0xF. */
    return (lsbs << 4) - lsbs;
}
另一个使用整数乘法的变体在某些平台上可能更快,它借用了另一个回答(this answer)中的想法。我们用乘法一次展开四个位,使它们落入各自的目标半字节内。但是,必须先把半字节内对应的那一位移到该半字节的lsb,然后才能用lsb填满整个半字节。这样可能省下一次乘法运算,代价是一些额外的辅助操作。
#include <stdint.h>
/*
 * Expand each bit of `x` into a nibble of the 32-bit result, using two
 * multiplications that each replicate one source nibble four times.
 */
uint32_t expand_bits_to_nibbles_mul2 (uint8_t x)
{
    /* Multiplier that copies a 4-bit group into four consecutive nibbles. */
    const uint32_t replicate4 = 0x1111u;
    /* Within each 16-bit half, selects bit k of nibble k (bits 0, 5, 10, 15). */
    const uint32_t diagonal = 0x84218421u;
    /* Lowest and highest bit of every nibble. */
    const uint32_t nibble_lsbs = 0x11111111u;
    const uint32_t nibble_msbs = nibble_lsbs << 3;

    /* Low source nibble fills result nibbles 0..3, high one nibbles 4..7. */
    const uint32_t from_low  = (uint32_t)(x & 0x0Fu) * replicate4;
    const uint32_t from_high = (uint32_t)(x & 0xF0u) * (replicate4 << 12);
    uint32_t r = from_low + from_high;

    /* Keep, in nibble k, only the source bit that belongs to that nibble. */
    r &= diagonal;
    /* Adding (8 - 2^k) to nibble k sets its top bit exactly when the kept
       bit was set; no nibble exceeds 8, so nothing carries over. */
    r += nibble_msbs - diagonal;
    /* Bring the top bit of each nibble down to the nibble's lowest bit. */
    r = (r >> 3) & nibble_lsbs;

    /* 16*v - v == 15*v: every nibble holding 1 becomes 0xF. */
    return (r << 4) - r;
}