好的,我正在尝试用C编写一个函数来反转一个long(64位,即8字节)的字节顺序,但我的位移运算得到了一些奇怪的结果。
/*
 * Attempts to reverse the byte order of x by swapping, in four passes, the
 * byte i positions from the low end with the byte i positions from the high
 * end.
 *
 * NOTE(review): this is the code under discussion and it is NOT correct as
 * written; the individual problems are flagged on the lines below.
 */
long reverse_long(long x) {
int i;
for(i=0; i<4; i++) {
/* The constant 0xFF fits in int, so this shift is performed in int, not long.
 * With a 32-bit int, i == 3 shifts a 1 into the sign bit (undefined
 * behaviour); the negative int is then sign-extended when stored in a. */
long a = 0x00000000000000FF<<(8*i);
/* This constant does not fit in a signed 64-bit long, so it has an unsigned
 * type and the right shift fills with zeros. */
long b = 0xFF00000000000000>>(8*i);
/* Isolate the low-side and high-side bytes of x. */
a = x&a;
b = x&b;
/* Move each isolated byte to the opposite position. b is a signed long and is
 * negative when the top bit of x is set; right-shifting a negative value is
 * implementation-defined. */
a=a<<8*(7-2*i);
b=b>>8*(7-2*i);
/* Clear both byte positions in x (same int-typed shift issue as above). */
x=x&(~(0x00000000000000FF<<(8*i)));
x=x&(~(0xFF00000000000000>>(8*i)));
/* Insert the swapped bytes. */
x=x|a;
x=x|b;
}
/* NOTE(review): no return statement -- the function is declared to return
 * long, so using its result is undefined behaviour. */
}
在第4行(long a = 0x00000000000000FF<<(8*i)
)上,对于循环的每次迭代,我把一个全为1的字节再向左移8位。这在第一次、第二次和第三次迭代中都正常,但是在第四次迭代时我得到了类似0xFFFFFFFF000000
的值,而我本应得到0x00000000FF000000
。
第5行(long b = 0xFF00000000000000>>(8*i)
)工作得很好,并且给了我0x000000FF00000000
的值。
谁能告诉我这里发生了什么?
答案 0 :(得分:1)
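a)关于你的错误:
你在那里做的是:
long a = 0x00000000000000FF<<(8*i);
常量0x00000000000000FF的类型是int(前导零不会改变它的类型),因此移位是按int进行的。当i == 3时,0xFF<<24超出了32位有符号int的范围(未定义的行为);实际得到的负值在赋给long时被符号扩展,所以高位全为1。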
b)关于你的代码: 编写函数有更简单的方法,例如:
/*
 * Reverse the byte order of an unsigned long.
 *
 * Each pass shifts the result left by one byte and appends the current
 * lowest byte of x, so the lowest input byte ends up as the highest output
 * byte. The number of passes is taken from the size of the type rather than
 * hard-coded to 8, so the function also works where unsigned long is 4 bytes
 * (a fixed count of 8 would shift every byte out and return 0 there).
 *
 * Assumes 8-bit bytes.
 *
 * x: value whose bytes are to be reversed.
 * Returns x with its bytes in the opposite order.
 */
unsigned long reverse_long(unsigned long x) {
    unsigned long rc = 0;
    int i = (int)sizeof x;   /* one pass per byte of the type */
    do {
        rc = (rc << 8) | (unsigned char)x;
        x >>= 8;
    } while (--i);
    return rc;
}
答案 1 :(得分:1)
要了解代码中的潜在问题,您需要了解以下事项:
~
的规则有很多细节需要记住。为了避免处理各种奇怪的问题(例如,long a = 0x00000000000000FF<<(8*i);
在i == 3
时会导致未定义的行为),我建议如下:
x
)此外,您的代码假设long
是64位。这并非总是如此。最好做以下两件事之一:
- 编写对unsigned long
都适用的代码,无论unsigned long
的大小是多少;
- 或者使用uint64_t
代替long。
简而言之,如果我们只修复与上面列出的点相关的错误(并且不更改算法),那么这就是您的代码应该如何看待:
/*
 * Reverse the byte order of a 64-bit value.
 *
 * Four passes; pass k exchanges the byte k positions from the low end with
 * the byte k positions from the high end. All arithmetic is done on unsigned
 * 64-bit operands, so no shift can overflow or sign-extend.
 */
uint64_t reverse_long(uint64_t x)
{
    int k = 0;

    while (k < 4)
    {
        const int edge = 8 * k;            /* bit offset of the pair from either end */
        const int span = 8 * (7 - 2 * k);  /* bit distance between the two bytes */
        const uint64_t lo_mask = 0xFFull << edge;
        const uint64_t hi_mask = 0xFF00000000000000ull >> edge;

        /* Pick both bytes out and move each to the other's position. */
        const uint64_t lo_moved = (x & lo_mask) << span;
        const uint64_t hi_moved = (x & hi_mask) >> span;

        /* Blank both positions, then drop the exchanged bytes in. */
        x &= ~lo_mask;
        x &= ~hi_mask;
        x |= lo_moved;
        x |= hi_moved;

        ++k;
    }

    return x;
}
注意:我使用ull
后缀来创建64位文字。实际上这只保证至少64位,但由于这里的所有内容都是无符号的,所以没有区别,多余的位将被截断。要非常精确,请写(uint64_t)0xFF
而不是0xFFull
等。
答案 2 :(得分:1)
您已经收到了关于代码出错的好建议,但我认为您可能希望看到一种可能更简单的替代方法。
/*
 * Reverse the byte order of a 64-bit value by swapping the bytes of the
 * by-value copy in place: one pointer starts at the first byte, the other at
 * the last, and they walk towards each other exchanging bytes.
 *
 * n: value whose bytes are to be reversed.
 * Returns n with its bytes in the opposite order.
 */
uint64_t reverse_long(uint64_t n) {
    uint8_t *a = (uint8_t *)&n;
    uint8_t *b = a + 7;
    while (a < b) {
        uint8_t t = *b;
        *b-- = *a;
        *a++ = t;
    }
    /* The original fell off the end of the function without returning the
     * reversed value. */
    return n;
}
答案 3 :(得分:1)
对有符号long进行右移是有问题的,因为它的值可能为负。下面是对您的代码稍作修改的版本,它仅在64位机器(sizeof(long) == 8
)上是安全的;它确保常量的类型为long
,中间变量a
和b
的类型为unsigned long,以避免这些问题。该代码包含大量诊断输出。
输出:
#include <stdio.h>
long reverse_long(long x);

/*
 * Reverse the byte order of a 64-bit long, printing every intermediate value.
 *
 * Only valid where long is 64 bits wide. The work is done on an unsigned
 * copy of x so that every shift is well defined and every value handed to
 * the %lX conversions really is an unsigned long (the original passed the
 * signed x to %lX).
 *
 * Pass i exchanges the byte i positions from the low end with the byte
 * i positions from the high end. For each pass the trace shows:
 *   x0     - value at the start of the pass
 *   a0/b0  - the low-side and high-side bytes, still in place
 *   a1/b1  - the same bytes moved to their new positions
 *   x1     - value with both byte positions cleared
 *   x2     - value with the exchanged bytes inserted
 *
 * Returns the byte-reversed value converted back to long.
 */
long reverse_long(long x)
{
    unsigned long v = (unsigned long)x;

    for (int i = 0; i < 4; i++)
    {
        const int edge = 8 * i;            /* bit offset of the pair from either end */
        const int span = 8 * (7 - 2 * i);  /* bit distance between the two bytes */
        const unsigned long lo_mask = 0x00000000000000FFUL << edge;
        const unsigned long hi_mask = 0xFF00000000000000UL >> edge;

        printf("x0 0x%.16lX\n", v);
        unsigned long a = v & lo_mask;
        unsigned long b = v & hi_mask;
        printf("a0 0x%.16lX; b0 0x%.16lX\n", a, b);
        a <<= span;
        b >>= span;
        printf("a1 0x%.16lX; b1 0x%.16lX\n", a, b);
        v &= ~lo_mask;
        v &= ~hi_mask;
        printf("x1 0x%.16lX\n", v);
        v |= a | b;
        printf("x2 0x%.16lX\n", v);
    }
    return (long)v;
}
/*
 * Demonstration driver: reverses one sample value and prints the input and
 * the result side by side.
 */
int main(void)
{
    /* With a 64-bit long the sample value is larger than LONG_MAX, so it is
     * held as unsigned long and converted explicitly at the call; this also
     * makes both printf arguments match the %lX conversions. */
    unsigned long x = 0xFEDCBA9876543210UL;
    printf("0x%.16lX <=> 0x%.16lX\n", x, (unsigned long)reverse_long((long)x));
    return 0;
}
这是上述计划的变体,x0 0xFEDCBA9876543210
a0 0x0000000000000010; b0 0xFE00000000000000
a1 0x1000000000000000; b1 0x00000000000000FE
x1 0x00DCBA9876543200
x2 0x10DCBA98765432FE
x0 0x10DCBA98765432FE
a0 0x0000000000003200; b0 0x00DC000000000000
a1 0x0032000000000000; b1 0x000000000000DC00
x1 0x1000BA98765400FE
x2 0x1032BA987654DCFE
x0 0x1032BA987654DCFE
a0 0x0000000000540000; b0 0x0000BA0000000000
a1 0x0000540000000000; b1 0x0000000000BA0000
x1 0x103200987600DCFE
x2 0x1032549876BADCFE
x0 0x1032549876BADCFE
a0 0x0000000076000000; b0 0x0000009800000000
a1 0x0000007600000000; b1 0x0000000098000000
x1 0x1032540000BADCFE
x2 0x1032547698BADCFE
0xFEDCBA9876543210 <=> 0x1032547698BADCFE
在这个变体中,reverse_long()
已更改为reverse_uint64_v1()
,并使用uint64_t
代替long
和unsigned long
。打印改用PRIX64
格式,但由于该函数用于性能测试,这些打印语句也被注释掉了。reverse_uint64_v2()
函数每次循环执行的操作更少,但循环次数更多(8次而不是4次):它先将当前输出值左移8位,再把输入值的低位字节复制到输出值的低位字节。reverse_uint64_v3()
函数是reverse_uint64_v2()
的循环展开版本,并通过省去一次对b
的赋值和最后一次多余的移位进行了微优化。
示例输出:
#include <inttypes.h>
#include <stdio.h>
#include "timer.h"
uint64_t reverse_uint64_v1(uint64_t x);
uint64_t reverse_uint64_v2(uint64_t x);
uint64_t reverse_uint64_v3(uint64_t x);

/*
 * Byte-reverse a 64-bit value by exchanging byte pairs from the outside in.
 *
 * Four passes; pass i exchanges the byte i positions from the low end with
 * the byte i positions from the high end. The commented-out printf calls are
 * the step-by-step diagnostics, disabled because this function is timed.
 */
uint64_t reverse_uint64_v1(uint64_t x)
{
    int i = 0;

    while (i < 4)
    {
        const int edge = 8 * i;            /* bit offset of the pair from either end */
        const int span = 8 * (7 - 2 * i);  /* bit distance between the two bytes */
        const uint64_t lo_mask = UINT64_C(0x00000000000000FF) << edge;
        const uint64_t hi_mask = UINT64_C(0xFF00000000000000) >> edge;

        //printf("x0 0x%.16" PRIX64 "\n", x);
        uint64_t a = x & lo_mask;
        uint64_t b = x & hi_mask;
        //printf("a0 0x%.16" PRIX64 "; b0 0x%.16" PRIX64 "\n", a, b);
        a <<= span;
        b >>= span;
        //printf("a1 0x%.16" PRIX64 "; b1 0x%.16" PRIX64 "\n", a, b);
        x &= ~lo_mask;
        x &= ~hi_mask;
        //printf("x1 0x%.16" PRIX64 "\n", x);
        x |= a | b;
        //printf("x2 0x%.16" PRIX64 "\n", x);

        i++;
    }

    return x;
}
/*
 * Byte-reverse a 64-bit value one byte at a time.
 *
 * One pass per byte of uint64_t: make room in the result by shifting it left
 * one byte, append the current lowest byte of x, then discard that byte
 * from x.
 */
uint64_t reverse_uint64_v2(uint64_t x)
{
    uint64_t r = 0;
    size_t remaining = sizeof(uint64_t);

    while (remaining-- > 0)
    {
        r <<= 8;
        r |= x & 0xFF;
        x >>= 8;
    }

    return r;
}
/*
 * Byte-reverse a 64-bit value: the byte-at-a-time loop written out in full.
 *
 * The first step stores the lowest byte of x directly into r (no separate
 * assignment to b, no shift of an empty r), and the shift of x after the
 * last byte is omitted because x is not read again.
 */
uint64_t reverse_uint64_v3(uint64_t x)
{
    uint64_t b;
    uint64_t r = x & 0xFF;      // Optimization 1: first byte goes straight into r

    b = (x >>= 8) & 0xFF;
    r <<= 8;
    r |= b;

    b = (x >>= 8) & 0xFF;
    r <<= 8;
    r |= b;

    b = (x >>= 8) & 0xFF;
    r <<= 8;
    r |= b;

    b = (x >>= 8) & 0xFF;
    r <<= 8;
    r |= b;

    b = (x >>= 8) & 0xFF;
    r <<= 8;
    r |= b;

    b = (x >>= 8) & 0xFF;
    r <<= 8;
    r |= b;

    b = (x >>= 8) & 0xFF;
    r <<= 8;
    r |= b;
    // Optimization 2: no trailing x >>= 8

    return r;
}
/*
 * Time one byte-reversal implementation.
 *
 * Calls reverse on every value from lb up to (but excluding) ub in steps of
 * inc, accumulating the sum of the results and the number of calls, then
 * prints the sum, the count and the elapsed time. The sum and count are
 * printed so that different implementations can be compared against each
 * other.
 *
 * reverse: the function to be timed.
 *
 * NOTE(review): Clock and the clk_* functions come from "timer.h", which is
 * not shown here; clk_elapsed_us presumably formats the elapsed time into
 * buffer and returns a string -- confirm against that header.
 */
static void timing_test(uint64_t (*reverse)(uint64_t))
{
    Clock clk;
    clk_init(&clk);
    uint64_t ur = 0;
    uint64_t lb = UINT64_C(0x0123456789ABCDEF);
    uint64_t ub = UINT64_C(0xFEDCBA9876543210);
    uint64_t inc = UINT64_C(0x287654321);
    uint64_t cnt = 0;
    clk_start(&clk);
    for (uint64_t u = lb; u < ub; u += inc)
    {
        ur += (*reverse)(u);
        cnt++;
    }
    clk_stop(&clk);
    char buffer[32];
    /* cnt is unsigned, so it is printed with PRIu64 (the original used the
     * signed PRId64 conversion for it). */
    printf("Sum = 0x%.16" PRIX64 " Count = %" PRIu64 " Time = %s\n", ur, cnt,
           clk_elapsed_us(&clk, buffer, sizeof(buffer)));
}
int main(void)
{
uint64_t u = UINT64_C(0xFEDCBA9876543210);
printf("0x%.16" PRIX64 " <=> 0x%.16" PRIX64 "\n", u, reverse_uint64_v1(u));
printf("0x%.16" PRIX64 " <=> 0x%.16" PRIX64 "\n", u, reverse_uint64_v2(u));
printf("0x%.16" PRIX64 " <=> 0x%.16" PRIX64 "\n", u, reverse_uint64_v3(u));
timing_test(reverse_uint64_v1);
timing_test(reverse_uint64_v2);
timing_test(reverse_uint64_v3);
timing_test(reverse_uint64_v1);
timing_test(reverse_uint64_v2);
timing_test(reverse_uint64_v3);
return 0;
}
总和和计数有两个目的。首先,它们交叉检查三个函数的结果是否相同。其次,它们确保编译器不会做任何事情,例如优化整个业务循环。
正如您所看到的,0xFEDCBA9876543210 <=> 0x1032547698BADCFE
0xFEDCBA9876543210 <=> 0x1032547698BADCFE
0xFEDCBA9876543210 <=> 0x1032547698BADCFE
Sum = 0x0BC6E4692C2EC35A Count = 1683264863 Time = 8.543540
Sum = 0x0BC6E4692C2EC35A Count = 1683264863 Time = 6.822616
Sum = 0x0BC6E4692C2EC35A Count = 1683264863 Time = 7.303825
Sum = 0x0BC6E4692C2EC35A Count = 1683264863 Time = 8.943668
Sum = 0x0BC6E4692C2EC35A Count = 1683264863 Time = 7.314660
Sum = 0x0BC6E4692C2EC35A Count = 1683264863 Time = 7.295862
v2
和v3
的时间之间没有太大区别,但v1
代码比v2
或v3
代码都慢不少。为清楚起见,我会使用v2代码。
为了比较,我还添加了一个“什么都不做”的函数:
v2
显然,这个的总和是不同的,但计数是相同的,所以它测量循环控制和计数的开销。我两次跑的次数是:
/*
 * "Do nothing" baseline: hands its argument back unchanged, so that timing
 * it measures only the cost of the surrounding loop and the call itself.
 */
uint64_t reverse_uint64_v4(uint64_t x)
{
    const uint64_t unchanged = x;

    return unchanged;
}
显然,测试函数中大约一半的时间是在循环和函数调用开销中。