Question

我编写了一个读取256KB数组的程序，目标是让一次运行耗时约1ms。该程序非常简单，代码附在下面。但是，当我在Xen上的虚拟机（VM）中运行它时，我发现延迟并不稳定，呈现出如下规律（时间单位为ms）：

    #totalCycle CyclePerLine  totalms
    22583885 5513 6.452539
    3474342 848 0.992669
    3208486 783 0.916710
    25848572 6310 7.385306
    3225768 787 0.921648
    3210487 783 0.917282
    25974700 6341 7.421343
    3244891 792 0.927112
    3276027 799 0.936008
    25641513 6260 7.326147
    3531084 862 1.008881
    3233687 789 0.923911
    22397733 5468 6.399352
    3523403 860 1.006687
    3586178 875 1.024622
    26094384 6370 7.455538
    3540329 864 1.011523
    3812086 930 1.089167
    25907966 6325 7.402276

我猜是某个进程在做某些事情，而且它看起来像是事件驱动的。有没有人遇到过这种情况？或者有人能指出可能导致这种现象的进程/服务吗？

以下是我的程序。我运行了1000次，每次运行输出上面结果中的一行。

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <ctime>

using namespace std;

#if defined(__i386__)
/*
 * Read the CPU time-stamp counter (32-bit x86 build).
 *
 * Returns the 64-bit value produced by the RDTSC instruction.
 */
static __inline__ unsigned long long rdtsc(void)
{
    unsigned long long int x;
    /* 0x0f 0x31 is the RDTSC opcode, emitted as raw bytes. On 32-bit x86
       the "=A" constraint binds the 64-bit output to the EDX:EAX register
       pair, which is where RDTSC leaves its result. */
    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
    return x;
}
#elif defined(__x86_64__)
// Read the CPU time-stamp counter (x86-64 build).
//
// RDTSC leaves the low 32 bits of the counter in EAX and the high 32 bits
// in EDX; the two halves are captured separately and merged into one
// 64-bit value.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned int eax_lo, edx_hi;
    __asm__ __volatile__ ("rdtsc" : "=a"(eax_lo), "=d"(edx_hi));

    unsigned long long tsc = edx_hi;
    tsc <<= 32;
    tsc |= eax_lo;
    return tsc;
}
#endif

/* Cache line size in bytes assumed by the benchmark. */
#define CACHE_LINE_SIZE 64

/* Working-set size in KB. NOTE(review): 24567 KB is just under 24 MiB;
   an exact 24 MiB would be 24576 -- confirm which value was intended. */
#define WSS 24567
/* Number of long elements in a WSS-sized buffer. Parenthesized so the
   macro behaves as a single value inside larger expressions. */
#define NUM_VARS (WSS * 1024 / sizeof(long))

/* Divisor used to turn a TSC delta into milliseconds: ticks per ms,
   i.e. the TSC frequency in kHz. Assumes a 3.5 GHz TSC -- adjust for the
   machine under test. */
#define KHZ 3500000

// Usage: ./a.out <wcet>
//
//   <wcet>  number of outer repetitions; each repetition walks the buffer
//           21 times. The buffer size is fixed at 256 KB and is NOT taken
//           from the command line.
//
// Builds a randomly linked chain through the first element of every cache
// line of the buffer, walks it once to warm the cache, then times
// wcet * 21 walks with rdtsc() and prints one line:
//     <total cycles> <total cycles / number of lines> <milliseconds>
// Returns 0 on success, 1 when the argument is missing or not a number.
int main(int argc, char** argv)
{
    // Refuse to run without the repetition count instead of dereferencing
    // a missing argv[1].
    if (argc < 2) {
        fprintf(stderr, "usage: %s <wcet>\n", argc > 0 ? argv[0] : "a.out");
        return 1;
    }
    char *parse_end = NULL;
    const unsigned long wcet = strtoul(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0') {
        fprintf(stderr, "invalid wcet: %s\n", argv[1]);
        return 1;
    }

    const unsigned long mem_size_KB = 256;                 // mem size in KB
    const unsigned long mem_size_B  = mem_size_KB * 1024;  // mem size in bytes
    const unsigned long count       = mem_size_B / sizeof(long);       // elements
    const unsigned long row         = mem_size_B / CACHE_LINE_SIZE;    // cache lines
    const unsigned long col         = CACHE_LINE_SIZE / sizeof(long);  // elements per line

    unsigned long long start, finish, dur1;
    unsigned long temp;

    // Written after each walk so the compiler cannot discard the loads as
    // dead code when optimizing.
    volatile unsigned long sink;

    long *buffer = new long[count];

    // Every element initially holds its own index.
    for (unsigned long i = 0; i < count; ++i)
        buffer[i] = i;

    // Sattolo shuffle over the first element of each cache line: swapping
    // slot i with a random slot strictly below i links all line heads into
    // a single cycle, so following buffer[temp] visits every line.
    for (unsigned long i = row-1; i > 0; --i) {
        temp = rand() % i;
        swap(buffer[i*col], buffer[temp*col]);
    }

    // Warm the cache: follow the chain once before timing.
    temp = buffer[0];
    for (unsigned long i = 0; i < row-1; ++i) {
        temp = buffer[temp];
    }
    sink = temp;

    // Timed section: the buffer should now be cache-resident.
    temp = buffer[0];
    start = rdtsc();
    long sum = 0;
    for (unsigned long wcet_i = 0; wcet_i < wcet; wcet_i++)
    {
        for (int j = 0; j < 21; j++)
        {
            for (unsigned long i = 0; i < row-1; ++i) {
                if (i%2 == 0) sum += buffer[temp];
                else sum -= buffer[temp];
                temp = buffer[temp];
            }
        }
    }
    finish = rdtsc();
    dur1 = finish-start;
    sink = temp + (unsigned long)sum;
    (void)sink;

    // Res
    // NOTE: the second column divides by the number of cache lines only,
    // not by the number of accesses (wcet * 21 * (row-1)), so it is not a
    // per-access latency. Kept as-is so existing output stays comparable.
    printf("%llu %llu %.6f\n", dur1, dur1/row, dur1*1.0/KHZ);
    delete[] buffer;
    return 0;
}

Answer 1

在虚拟机中使用RDTSC指令比较复杂。虚拟机管理程序（Xen）很可能是通过捕获（trap）该指令来模拟RDTSC的。你最快的几次运行显示大约800个周期/缓存行，这非常非常慢……唯一的解释是RDTSC触发了由虚拟机管理程序处理的陷阱，而这部分开销成了性能瓶颈。我不确定你周期性看到的更长耗时是怎么回事，但既然RDTSC被捕获了，任何计时结果都不再可靠。

您可以在此处详细了解

http://xenbits.xen.org/docs/4.2-testing/misc/tscmode.txt

rdtsc系列指令是非特权指令，但特权软件可以设置一个cpuid位，使所有rdtsc系列指令都触发陷阱。Xen可以检测到该陷阱，然后透明地“模拟”rdtsc指令的结果，并将控制权返回给rdtsc指令之后的代码。

顺便说一下，那篇文档在这一点上有误：虚拟机管理程序并不是通过设置cpuid位来让RDTSC触发陷阱的，真正起作用的是控制寄存器4中的第2位（CR4.TSD）：

http://en.wikipedia.org/wiki/Control_register#CR4

VM上奇怪的程序延迟行为

1 个答案: