Question

我正在尝试编写一个程序来测量上下文切换的开销。关于 rdtsc + rdtscp 指令，我已经读过这份 Intel 手册（Intel's manual）。

现在，我想在上下文切换中使用这些时间戳指令。我的一般骨架如下：

// init two pipes P1, P2
fork();

set_affinity();              // same core

// parent's code:
    cpuid + rdtsc            // start timer
    write(timer to P1);

    read(timer from P2);     // blocks parent if timer value not written
    rdtscp + cpuid           // stop timer, get difference

// child's code:
    read(timer from P1);     // blocks child if timer value not written
    rdtscp + cpuid           // stop timer, get difference

    cpuid + rdtsc            // start timer
    write(timer to P2);

我看到这个代码有一些问题。假设计时器操作正确，

如果操作系统选择切换到某个完全不同的进程（既不是子进程也不是父进程），则测量结果将无效。

此代码还将包括read（）和write（）系统调用所花费的时间。

忽略这些问题，这样使用 rdtsc + rdtscp 指令是否有效？

我知道编写内核模块并禁用抢占/中断是更好的方法。

Answer 1

我以前做过这个，它似乎是测量上下文切换时间的有效方法。每当做这种细粒度的计时时，调度的不可预测性总会产生影响；通常的处理办法是测量数千次，然后取最小值、中位数或平均时间间隔等统计量。让两个进程以实时 SCHED_FIFO 优先级运行，可以减少调度带来的干扰。如果您想知道实际的切换时间（在单个 CPU 核心上），则需要通过亲和性（affinity）设置把两个进程绑定到同一个 CPU 上。如果你只是想知道一个进程响应另一个进程的延迟，那么让它们在不同的 CPU 上运行就可以了。

要记住的另一个问题是：自愿的和非自愿的上下文切换，以及从用户空间到内核空间的切换，成本各不相同。你的情况很可能是自愿切换。测量非自愿的上下文切换比较困难，需要在忙循环中轮询共享内存或采用类似的手段。

Answer 2

我使用了类似的计时代码，不同之处在于父进程循环了 1000000 次，并且父进程和子进程都对整个循环计时。代码附后。然后我把它改成像你的伪代码那样对单次上下文切换计时，把 1000000 个单独的时间相加，结果与我原来的代码非常吻合。因此，考虑到前面提到的注意事项，这两种方式似乎都有效。

我觉得有趣的是，当用 sched_setaffinity() 把父进程和子进程设置为在不同的 CPU 上运行时，上下文切换时间会增加一倍以上。为什么会这样影响时间？在同一个 CPU 上运行的进程之间，管道是否更快？

rdtscp.h：

/*
 * Sample the time-stamp counter at the START of a timed region.
 *
 * CPUID is issued first and RDTSC second; CPUID is used here as a
 * serializing fence, following the Intel benchmarking white paper this
 * header is based on.
 *
 * Returns the counter value assembled from EDX (high half) and EAX (low half).
 */
static inline unsigned long rdtscp_start(void) {
  unsigned int hi, lo;

  /*
   * - EAX is zeroed before CPUID so the same leaf (0) is queried on every
   *   call instead of whatever value was left in the register.
   * - CPUID overwrites EBX/ECX, hence the clobbers; EAX/EDX are outputs.
   * - "memory" makes the statement a compiler barrier so loads/stores of the
   *   timed code cannot be moved across the timestamp; "cc" covers the XOR.
   */
  __asm volatile ("xor %%eax, %%eax\n\t"
          "cpuid\n\t"
          "rdtsc\n\t" : "=a" (lo), "=d" (hi)
          :: "%rbx", "%rcx", "memory", "cc");

  /* Shift in a type that is at least 64 bits wide, then narrow to the
   * declared return type. */
  return (unsigned long)(((unsigned long long)hi << 32) | lo);
}

/*
 * Sample the time-stamp counter at the END of a timed region.
 *
 * RDTSCP is issued first and CPUID second; the trailing CPUID is used as a
 * serializing fence so later instructions are not started before the counter
 * has been read, following the Intel benchmarking white paper this header is
 * based on.
 *
 * Returns the counter value assembled from EDX (high half) and EAX (low half).
 */
static inline unsigned long rdtscp_end(void) {
  unsigned int hi, lo;

  /*
   * - The counter halves are copied out of EDX/EAX before CPUID runs, because
   *   CPUID overwrites EAX, EBX, ECX and EDX (all listed as clobbers, so the
   *   compiler cannot place lo/hi in any of them).
   * - EAX is zeroed before CPUID so a fixed leaf (0) is queried rather than
   *   the low counter bits RDTSCP just left in EAX.
   * - "memory" makes the statement a compiler barrier so loads/stores of the
   *   timed code cannot be moved across the timestamp; "cc" covers the XOR.
   */
  __asm volatile ("rdtscp\n\t"
          "mov %%edx, %1\n\t"
          "mov %%eax, %0\n\t"
          "xor %%eax, %%eax\n\t"
          "cpuid\n\t"  : "=r" (lo), "=r" (hi)
          :: "%rax", "%rbx", "%rcx", "%rdx", "memory", "cc");

  /* Shift in a type that is at least 64 bits wide, then narrow to the
   * declared return type. */
  return (unsigned long)(((unsigned long long)hi << 32) | lo);
}

/*see https://www.intel.com/content/www/us/en/embedded/training/ia-32-ia-64-benchmark-code-execution-paper.html
 */

cntxtSwtchr.c：

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include "rdtscp.h"

/*
 * Pipe ping-pong benchmark.
 *
 * The parent writes an int to pipe1 and blocks reading pipe2; the child
 * blocks reading pipe1 and echoes each int back on pipe2. Both processes
 * request affinity to the same CPU, time their whole loop with
 * rdtscp_start()/rdtscp_end() and with clock(), and print the results.
 *
 * Exit status: 0 when the parent completes every round trip, 1 on setup
 * failure or when the loop stops early.
 */
int main(void) {
  enum { ROUND_TRIPS = 1000000, TARGET_CPU = 1 };

  int pipe1[2], pipe2[2];
  if (pipe(pipe1) != 0 || pipe(pipe2) != 0) {
    perror("pipe");
    exit(1);
  }

  /* Parent and child use the same single-CPU mask. */
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(TARGET_CPU, &set);

  clock_t tick, tock;

  int fork_rtn;
  if ((fork_rtn = fork()) < 0) {
    perror("fork");
    exit(1);
  }

  if (fork_rtn == 0) {  // Child
    close(pipe1[1]);
    close(pipe2[0]);

    /* Not fatal: the benchmark still runs, but the numbers are unpinned. */
    if (sched_setaffinity(0, sizeof(set), &set) != 0)
      perror("child sched_setaffinity");

    tick = clock();
    unsigned long tsc_start = rdtscp_start();
    int i;
    /* Stop on EOF (0), error (-1) or a short transfer; a bare truth test
     * would treat -1 as "keep going". */
    while (read(pipe1[0], &i, sizeof i) == (ssize_t)sizeof i) {
      if (write(pipe2[1], &i, sizeof i) != (ssize_t)sizeof i)
        break;
    }
    /* Take both end stamps before printing so printf is not timed. */
    unsigned long tsc_ticks = rdtscp_end() - tsc_start;
    tock = clock();

    printf("child tsc_ticks: %lu\n", tsc_ticks);
    double dt = (double)(tock - tick) / CLOCKS_PER_SEC;
    printf("Elapsed child cpu time: %gs.\n", dt);

    close(pipe1[0]);
    close(pipe2[1]);
    exit(0);

  } else {              // Parent
    close(pipe1[0]);
    close(pipe2[1]);

    /* Not fatal: the benchmark still runs, but the numbers are unpinned. */
    if (sched_setaffinity(0, sizeof(set), &set) != 0)
      perror("parent sched_setaffinity");

    int idx;
    int i_rtnd;
    tick = clock();
    unsigned long tsc_start = rdtscp_start();
    for (idx = 0; idx < ROUND_TRIPS; ++idx) {
      if (write(pipe1[1], &idx, sizeof idx) != (ssize_t)sizeof idx)
        break;
      /* Checked so i_rtnd is never compared while uninitialised. */
      if (read(pipe2[0], &i_rtnd, sizeof i_rtnd) != (ssize_t)sizeof i_rtnd)
        break;
      if (i_rtnd != idx)
        break;
    }
    /* Take both end stamps before printing so printf is not timed. */
    unsigned long tsc_ticks = rdtscp_end() - tsc_start;
    tock = clock();

    printf("parent tsc_ticks: %lu\n", tsc_ticks);
    double dt = (double)(tock - tick) / CLOCKS_PER_SEC;
    /* Average over the round trips that actually completed. Note that one
     * round trip hands control to the child and back again. */
    double per_trip = idx > 0 ? dt / idx : 0.0;
    printf("Elapsed parent cpu time: %gs, %gs/switch.\n", dt, per_trip);
    if (idx == ROUND_TRIPS)
      printf("Parent reached end of processing loop.\n");
    else
      printf("Parent failed to reach end of processing loop.\n");

    /* Closing the write end gives the child EOF so it can finish. */
    close(pipe1[1]);
    close(pipe2[0]);

    /* Reap the child so its report is printed before the parent exits. */
    if (waitpid(fork_rtn, NULL, 0) < 0)
      perror("waitpid");

    exit(idx == ROUND_TRIPS ? 0 : 1);
  }

}

在上下文切换

2 个答案: