关于 MSR IA32_TIME_STAMP_COUNTER(10h):读取它时遵循哪种序列化规则?是 rdtsc 的规则、rdtscp 的规则,还是其他规则?
如果它不具备序列化语义,我是否应该在进行任何计算之前加上一个 cpuid "屏障"?
- 编辑 -
到目前为止,我已经实现了两种屏障:cpuid 和 fence。
使用 cpuid:
/*
 * RDCOUNTER(_val, _cnt) - read the MSR with index _cnt into _val, with a
 * CPUID instruction executed before and after the RDMSR as a "barrier".
 *
 * _val: lvalue in memory ("=m" constraint) receiving the 64-bit result.
 * _cnt: MSR index; must be a compile-time constant ("i" constraint).
 *
 * Sequence: CPUID with EAX=0, load RCX with the MSR index, RDMSR, push
 * the RAX/RDX halves, CPUID with EAX=0 again, pop the halves back, then
 * store (RDX << 32) | RAX into _val.  The halves are parked on the stack
 * because the second CPUID overwrites RAX and RDX.
 *
 * Uses 64-bit register names, so this form is x86-64 only.
 *
 * NOTE(review): the push/pop pair moves the stack pointer without the
 * compiler knowing; presumably this is only safe when the code is built
 * without a red zone (as kernel code normally is) - confirm.
 */
#define RDCOUNTER(_val, _cnt) \
asm volatile \
( \
"xorq %%rax, %%rax \n\t" \
"cpuid \n\t" \
"movq %1, %%rcx \n\t" \
"rdmsr \n\t" \
"push %%rax \n\t" \
"push %%rdx \n\t" \
"xorq %%rax, %%rax \n\t" \
"cpuid \n\t" \
"pop %%rdx \n\t" \
"pop %%rax \n\t" \
"shlq $32, %%rdx \n\t" \
"orq %%rdx, %%rax \n\t" \
"movq %%rax, %0" \
: "=m" (_val) \
: "i" (_cnt) \
: "%rax", "%rbx", "%rcx", "%rdx", "memory" \
)
使用 fence:
/*
 * RDCOUNTER(_val, _cnt) - read the MSR with index _cnt into _val, with an
 * MFENCE instruction executed before and after the RDMSR.
 *
 * _val: lvalue in memory ("=m" constraint) receiving the 64-bit result.
 * _cnt: MSR index; must be a compile-time constant ("i" constraint).
 *
 * Sequence: load RCX with the MSR index, MFENCE, RDMSR, MFENCE, then
 * store (RDX << 32) | RAX into _val.
 *
 * Uses 64-bit register names, so this form is x86-64 only.
 *
 * NOTE(review): MFENCE is a memory-ordering fence; whether it also orders
 * the execution of RDMSR should be checked against the Intel SDM (LFENCE
 * is the instruction usually used to order instruction execution).
 * NOTE(review): %rbx is listed as clobbered although no instruction in
 * this sequence writes it; presumably a leftover from the CPUID variant.
 */
#define RDCOUNTER(_val, _cnt) \
asm volatile \
( \
"movq %1, %%rcx \n\t" \
"mfence \n\t" \
"rdmsr \n\t" \
"mfence \n\t" \
"shlq $32, %%rdx \n\t" \
"orq %%rdx, %%rax \n\t" \
"movq %%rax, %0" \
: "=m" (_val) \
: "i" (_cnt) \
: "%rax", "%rbx", "%rcx", "%rdx", "memory" \
)
我项目的一部分是尝试估算处理器的外部时钟频率(FSB 或 BCLK)。
我期望在多次运行后得到恒定的频率。
不幸的是,无论是否使用屏障指令,各次运行的结果仍然存在差异。
各次结果相当接近,至少在小数点后 3 位以内一致,但始终无法完全恒定。
(以上是在 Core 2 和 Core i7 上测试的)
/* Completion that Compute_Clock() signals, via complete_and_exit(), when it finishes. */
DECLARE_COMPLETION(bclk_job_complete);
/*
 * One timing sample taken by Compute_Clock().
 *
 * V[0] and V[1] hold the counter value read before and after the delay;
 * D holds their difference minus the measured read overhead.
 */
typedef struct {
	unsigned long long V[2];
	unsigned long long D;
} TSC_STRUCT;
/* Number of timed samples Compute_Clock() takes; also sizes its sample buffer. */
#define OCCURENCES 32
signed int Compute_Clock(void *arg)
{
CLOCK *clock=(CLOCK *) arg;
unsigned int ratio=clock->Q;
unsigned long long overhead=0;
struct kmem_cache *hardwareCache=kmem_cache_create(
"IntelClockCache",
OCCURENCES * sizeof(TSC_STRUCT), 0,
SLAB_HWCACHE_ALIGN, NULL);
TSC_STRUCT *TSC=kmem_cache_alloc(hardwareCache, GFP_KERNEL);
unsigned int loop=0, best=0, top=0;
// No preemption, no interrupt.
unsigned long flags;
preempt_disable();
raw_local_irq_save(flags);
// Warm-up
RDCOUNTER(TSC[loop].V[0], MSR_IA32_TSC);
RDCOUNTER(TSC[loop].V[1], MSR_IA32_TSC);
// Overhead
RDCOUNTER(TSC[loop].V[0], MSR_IA32_TSC);
RDCOUNTER(TSC[loop].V[1], MSR_IA32_TSC);
overhead=TSC[loop].V[1] - TSC[loop].V[0];
// Pick-up
for(loop=0; loop < OCCURENCES; loop++)
{
RDCOUNTER(TSC[loop].V[0], MSR_IA32_TSC);
udelay(100);
RDCOUNTER(TSC[loop].V[1], MSR_IA32_TSC);
}
// Restore interrupt and preemption.
raw_local_irq_restore(flags);
preempt_enable();
for(loop=0; loop < OCCURENCES; loop++)
TSC[loop].D=TSC[loop].V[1] - TSC[loop].V[0] - overhead;
for(loop=0; loop < OCCURENCES; loop++) {
unsigned int inner=0, count=0;
for(inner=loop; inner < OCCURENCES; inner++)
if(TSC[loop].D == TSC[inner].D)
count++;
if((count > top)
||((count == top) && (TSC[loop].D < TSC[best].D))) {
top=count;
best=loop;
}
/* printk("%3u x D[%02u]=%llu\t%llu - %llu\n",
count, loop, TSC[loop].D, TSC[loop].V[1], TSC[loop].V[0]); */
}
printk("Overhead=%llu\tBest=%llu\n", overhead, TSC[best].D);
clock->Q=TSC[best].D / (ratio * PRECISION);
clock->R=TSC[best].D % (ratio * PRECISION);
kmem_cache_free(hardwareCache, TSC);
kmem_cache_destroy(hardwareCache);
complete_and_exit(&bclk_job_complete, 0);
}