根据《如何在英特尔® IA-32 和 IA-64 指令集架构上对代码执行时间进行基准测试》,我使用下面的代码:
/*
 * Read the time-stamp counter at the START of a measured region.
 *
 * CPUID (a serializing instruction) is issued first so that earlier
 * instructions cannot drift into the measured region; RDTSCP then returns
 * the counter in EDX:EAX.
 *
 * Returns the 64-bit counter: EDX in the high half, EAX in the low half.
 */
static inline uint64_t bench_start(void)
{
    unsigned cycles_low, cycles_high;

    /*
     * __asm__/__volatile__ instead of asm/volatile so the function also
     * builds in strict ISO modes (e.g. -std=c11), where `asm` is not a
     * keyword.
     */
    __asm__ __volatile__(
        /* Pin the CPUID leaf to 0; otherwise the leaf (and its latency)
         * depends on whatever the compiler happened to leave in EAX. */
        "xor %%eax, %%eax\n\t"
        "cpuid\n\t"
        "rdtscp\n\t"
        : "=a" (cycles_low), "=d" (cycles_high)
        :
        /* CPUID writes EBX/ECX and RDTSCP writes ECX. The 32-bit register
         * names are accepted on both IA-32 and x86-64 targets. "memory"
         * keeps the compiler from moving loads/stores of the measured code
         * across this point. */
        : "ebx", "ecx", "memory");

    return ((uint64_t)cycles_high << 32) | cycles_low;
}
/*
 * Read the time-stamp counter at the END of a measured region.
 *
 * RDTSCP reads the counter into EDX:EAX; the two halves are copied out
 * before CPUID (a serializing instruction) overwrites EAX..EDX. The
 * trailing CPUID keeps later instructions from starting before the counter
 * has been read.
 *
 * Returns the 64-bit counter: EDX in the high half, EAX in the low half.
 */
static inline uint64_t bench_end(void)
{
    unsigned cycles_low, cycles_high;

    /*
     * __asm__/__volatile__ instead of asm/volatile so the function also
     * builds in strict ISO modes (e.g. -std=c11), where `asm` is not a
     * keyword.
     */
    __asm__ __volatile__(
        "rdtscp\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        /* Pin the CPUID leaf to 0; without this the leaf would be chosen
         * by the low 32 bits of the timestamp still sitting in EAX. */
        "xor %%eax, %%eax\n\t"
        "cpuid\n\t"
        : "=r" (cycles_high), "=r" (cycles_low)
        :
        /* The 32-bit register names are accepted on both IA-32 and x86-64
         * targets. "memory" keeps the compiler from moving loads/stores of
         * the measured code across this point. */
        : "eax", "ebx", "ecx", "edx", "memory");

    return ((uint64_t)cycles_high << 32) | cycles_low;
}
但事实上,我也看到有人使用下面的代码:
/*
 * Lightweight timestamp read: a single RDTSCP with no CPUID around it.
 *
 * Returns the 64-bit counter: EDX in the high half, EAX in the low half.
 */
static inline uint64_t bench_start(void)
{
    unsigned cycles_low, cycles_high;

    /*
     * This must be an asm statement (`asm_volatile(...)` is not valid C).
     * The __asm__/__volatile__ spellings also work in strict ISO modes
     * such as -std=c11.
     */
    __asm__ __volatile__(
        "rdtscp\n\t"
        : "=d" (cycles_high), "=a" (cycles_low)
        :
        /* RDTSCP also loads IA32_TSC_AUX into ECX, so ECX must be declared
         * clobbered or the compiler may keep a live value in it. */
        : "ecx");

    return ((uint64_t)cycles_high << 32) | cycles_low;
}
/*
 * Lightweight timestamp read: a single RDTSCP with no CPUID around it.
 *
 * NOTE(review): this is the second definition named bench_start in this
 * listing; it was presumably meant to be bench_end -- confirm before use.
 *
 * Returns the 64-bit counter: EDX in the high half, EAX in the low half.
 */
static inline uint64_t bench_start(void)
{
    unsigned cycles_low, cycles_high;

    /*
     * This must be an asm statement (`asm_volatile(...)` is not valid C).
     * The __asm__/__volatile__ spellings also work in strict ISO modes
     * such as -std=c11.
     */
    __asm__ __volatile__(
        "rdtscp\n\t"
        : "=d" (cycles_high), "=a" (cycles_low)
        :
        /* RDTSCP also loads IA32_TSC_AUX into ECX, so ECX must be declared
         * clobbered or the compiler may keep a live value in it. */
        : "ecx");

    return ((uint64_t)cycles_high << 32) | cycles_low;
}
如您所知,RDTSCP 是伪序列化(pseudo-serializing)指令,那么为什么有人使用第二种代码?我猜有两个原因,如下:
也许在大多数情况下,RDTSCP 已足以保证完整的“按顺序执行”?
也许只是想避免使用CPUID来提高效率?