Question

我正在编写一个代码，通过将代码作为模块加载到内核中来测量内核中代码序列的时间消耗。我使用常见的rdtsc例程来计算时间。有趣的是在用户模式下类似的例程运行导致正常值，而在内核模式下运行时结果始终为0，无论我在time_count函数中添加了多少行代码。我在这里使用的计算是一个共同的矩阵乘积函数，并且运行周期应该通过矩阵维数的增加而迅速增加。任何人都可以指出我的代码中的错误，为什么我无法测量内核中的循环次数？

#include <linux/init.h>
#include <linux/module.h>

int matrix_product(){
  int array1[500][500], array2[500][500], array3[500][500];
  int i, j, k, sum;

  for(i = 0; i < 50000; i++){
    for(j = 0; j < 50000; j++){
      array1[i][j] = 5*i + j;
      array2[i][j] = 5*i + j;
    }
  }

  for(i = 0; i < 50000; i++){
    for(j = 0; j < 50000; j++){
      for(k = 0; k < 50000; k++)
    sum += array1[i][k]*array2[k][j];
      array3[i][j] = sum;
      sum = 0;
    }
  }
  return 0;
}

static __inline__ unsigned long long rdtsc(void)
{
 unsigned long hi, lo;
 __asm__ __volatile__ ("xorl %%eax,%%eax\ncpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
 __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
 return ((unsigned long long)lo) | (((unsigned long long)hi)<<32) ;
}

static int my_init(void)
{
  unsigned long str, end, curr, best, tsc, best_curr;
  long i, t;

#define time_count(codes) for(i=0; i<120000; i++){str=rdtsc(); codes; end=rdtsc(); curr=end-str; if(curr<best)best=curr;}

 best = ~0;
 time_count();
 tsc = best;

 best = ~0;
 time_count(matrix_product());
 best_curr = best;
 printk("<0>matrix product: %lu ticks\n", best_curr-tsc);

 return 0;
}

static void my_exit(void){
  return;
}

module_init(my_init);
module_exit(my_exit);`

任何帮助表示赞赏！感谢。

Answer 1

rdtsc不保证在每个CPU上都可用，或者以恒定速率运行，或者在不同核心之间保持一致。

除非您对时间戳有特殊要求，否则您应使用getrawmonotonic等可靠且可移植的功能。

如果你真的想直接使用循环，那么内核已经为此定义了get_cycles和cpuid函数。

使用内核中的rdtsc测量执行时间

1 个答案: