Question

我编写了两个程序来检查clflush是否从缓存中驱逐我的数据。在我编写的两个程序中，只有一个程序给出了正确的结果（根据我的预期，在clflush之后，访问时间必须高于刷新之前）。

这是我的Program1，我得到了预期的结果。

#include <stdio.h>
#include <stdint.h>

/*
 * Execute the x86 CLFLUSH instruction on the address p, evicting the cache
 * line that contains it.
 *
 * p: any address inside the cache line to evict; the pointed-to data is
 *    not modified.
 *
 * Declared `static inline` so this translation unit always carries its own
 * definition: a plain C99 `inline` definition provides no external symbol,
 * so a call the compiler chooses not to inline (e.g. at -O0) would fail at
 * link time.  `__asm__` is used because `asm` is not a keyword in strict
 * ISO C modes.
 */
static inline void clflush(volatile void *p)
{
    /* "memory" clobber: stop the compiler from moving loads or stores of
     * the flushed data across the flush. */
    __asm__ __volatile__ ("clflush (%0)" : : "r" (p) : "memory");
}

/*
 * Read the CPU time-stamp counter.
 *
 * CPUID is issued immediately before RDTSC; this is the customary way to
 * serialise the instruction stream so earlier instructions have completed
 * before the counter is sampled.
 *
 * Returns the 64-bit counter assembled from EDX (high half) and EAX (low
 * half).
 *
 * `static inline` guarantees a definition is available in this translation
 * unit (a plain C99 `inline` definition emits no external symbol and can
 * fail to link when the call is not inlined).
 */
static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;

    /*
     * "a"(0): give CPUID a defined leaf instead of whatever happened to be
     *         in EAX.
     * "memory": keep the compiler from moving the memory access being
     *         timed to the other side of this timestamp.
     * CPUID overwrites EBX and ECX, hence the clobbers.
     */
    __asm__ __volatile__ ("cpuid\n\t"
                          "rdtsc"
                          : "=a" (lo), "=d" (hi)
                          : "a" (0)
                          : "ebx", "ecx", "memory");

    return ((uint64_t)hi << 32) | lo;
}


/* The value whose load is timed by test() and whose cache line main()
 * flushes; a writable (non-const) file-scope int in this variant. */
static int i = 10;

/*
 * Time one load of the file-scope variable `i` and print the number of
 * time-stamp-counter ticks that elapsed between the two rdtsc() calls.
 *
 * `static inline` guarantees a definition is available in this translation
 * unit (a plain C99 `inline` definition emits no external symbol).
 */
static inline void test(void)
{
    uint64_t start, end;
    int j;

    start = rdtsc();
    /* The loaded value is never used, so a plain `j = i` is a dead store
     * the compiler may delete; the volatile access forces a real load to
     * be issued between the two timestamps. */
    j = *(volatile int *)&i;
    end = rdtsc();
    (void)j;

    /* uint64_t is not `unsigned long` on every ABI, so "%lu" was a format
     * mismatch; print through unsigned long long instead. */
    printf("took %llu ticks\n", (unsigned long long)(end - start));
}

/*
 * Experiment driver.
 *
 * Takes two timings of the load of `i`, runs clflush() on the address of
 * `i`, then takes two more timings.  The "flush: " prefix is written without
 * a newline so that it labels the first post-flush timing line.
 *
 * ac, av: command-line arguments; not used.
 * Returns 0.
 */
int main(int ac, char **av)
{
    (void)ac;
    (void)av;

    /* Measurements before the flush. */
    test();
    test();

    fputs("flush: ", stdout);
    clflush(&i);

    /* Measurements after the flush. */
    test();
    test();

    return 0;
}

这是我的输出（如预期的那样）

took 314 ticks
took 282 ticks
flush: took 442 ticks
took 272 ticks

这是另一个程序，其中我没有得到预期的结果。

 #include <stdio.h>
 #include <stdint.h>

/*
 * Evict the cache line containing address p by executing the x86 CLFLUSH
 * instruction on it.
 *
 * p: any address inside the cache line to evict; the data it points to is
 *    left unchanged.
 *
 * The function is `static inline`: with a plain C99 `inline` definition no
 * external symbol is emitted, and a call that is not inlined (e.g. at -O0)
 * fails at link time.  `__asm__` replaces `asm`, which is not a keyword in
 * strict ISO C modes.
 */
static inline void clflush(volatile void *p)
{
    /* The "memory" clobber keeps the compiler from reordering accesses to
     * the flushed data around the instruction. */
    __asm__ __volatile__ ("clflush (%0)" : : "r" (p) : "memory");
}

/*
 * Sample the CPU time-stamp counter.
 *
 * RDTSC is preceded by CPUID, the customary way to serialise the
 * instruction stream so that earlier instructions have finished before the
 * counter is read.
 *
 * Returns the 64-bit counter value: EDX supplies the high 32 bits and EAX
 * the low 32 bits.
 *
 * Declared `static inline` so the definition is always available in this
 * translation unit; a plain C99 `inline` definition emits no external
 * symbol and can fail to link when the call is not inlined.
 */
static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;

    /*
     * Input "a"(0) selects a defined CPUID leaf rather than leaving EAX
     * undefined.  EBX/ECX are overwritten by CPUID.  The "memory" clobber
     * prevents the compiler from hoisting or sinking the timed memory
     * access past this timestamp.
     */
    __asm__ __volatile__ ("cpuid\n\t"
                          "rdtsc"
                          : "=a" (lo), "=d" (hi)
                          : "a" (0)
                          : "ebx", "ecx", "memory");

    return ((uint64_t)hi << 32) | lo;
}


/* The value whose load is timed by test() and whose cache line main()
 * flushes; const-qualified in this variant of the experiment. */
static const int i = 10;

/*
 * Time one load of the const file-scope variable `i` and print the number
 * of time-stamp-counter ticks between the two rdtsc() calls.
 *
 * `static inline` guarantees a definition is available in this translation
 * unit (a plain C99 `inline` definition emits no external symbol).
 */
static inline void test(void)
{
    uint64_t start, end;
    int j;

    start = rdtsc();
    /* `i` is const with a known initializer and the result is unused, so
     * the compiler is free to substitute the constant or drop a plain
     * `j = i` entirely, leaving no memory access to time.  Reading through
     * a volatile lvalue forces an actual load from i's address. */
    j = *(const volatile int *)&i;
    end = rdtsc();
    (void)j;

    /* uint64_t is not `unsigned long` on every ABI, so "%lu" was a format
     * mismatch; print through unsigned long long instead. */
    printf("took %llu ticks\n", (unsigned long long)(end - start));
}

/*
 * Experiment driver for the const variant.
 *
 * Takes two timings of the load of `i`, runs clflush() on the address of
 * `i`, then takes two more timings.  The "flush: " prefix is written without
 * a newline so that it labels the first post-flush timing line.
 *
 * ac, av: command-line arguments; not used.
 * Returns 0.
 */
int main(int ac, char **av)
{
    (void)ac;
    (void)av;

    /* Measurements before the flush. */
    test();
    test();

    fputs("flush: ", stdout);
    /* The cast drops the const qualifier only to match clflush()'s
     * parameter type; nothing is written through the pointer here. */
    clflush((volatile void *)&i);

    /* Measurements after the flush. */
    test();
    test();

    return 0;
}

这是我的输出（不符合预期）

took 314 ticks
took 282 ticks
flush: took 282 ticks // same as previous
took 272 ticks


--------
took 314 ticks
took 282 ticks
flush: took 272 ticks // lower than previous
took 272 ticks

如果我把 static int i = 10; 改成 static const int i = 10;，结果就不符合我的预期了：在 clflush 之后，我得到的访问时间反而更短，或者与刷新之前相同。

有人能解释为什么会这样吗？我该如何（用 C 或 C++）让它符合我的预期，即像 Program1 那样在 clflush 之后得到更长的访问时间？

我在Fedora19 linux下使用GCC。任何帮助将受到高度赞赏。

Answer 1

我很确定这里的问题在于：与夹在两次计时之间的那条指令相比，CPUID + RDTSC 本身的耗时太长了。

我得到的结果差异很大，大概取决于“运气”：代码最终在哪个 CPU 上运行、其他 CPU 正在做什么，等等。

这是第二个程序连续三次运行：

took 92 ticks
took 75 ticks
flush: took 75 ticks
took 474 ticks

took 221 ticks
took 243 ticks
flush: took 221 ticks
took 242 ticks

took 221 ticks
took 221 ticks
flush: took 221 ticks
took 230 ticks

但是，我认为我们不能从中得出“clflush不起作用”的结论。只是处理器有足够多的时钟周期和足够的乱序执行能力，足以掩盖缓存刷新后重新加载数据的开销。

如果您有大量数据，比如几千字节，您可能会得到更明显的效果。我会做一些实验，但现在我需要一些食物......

#include <stdio.h>
#include <stdint.h>

/*
 * Run the x86 CLFLUSH instruction on address p so that the cache line
 * containing it is evicted.
 *
 * p: any address inside the cache line to evict; the data is not modified.
 *
 * `static inline` is used because a plain C99 `inline` definition emits no
 * external symbol, so a call that is not inlined (e.g. at -O0) would fail
 * at link time.
 */
static inline void clflush(volatile void *p)
{
    /* "memory" clobber: the compiler must not move accesses to the flushed
     * data across this instruction. */
    __asm__ __volatile__ ("clflush (%0)" : : "r" (p) : "memory");
}

/*
 * Read the CPU time-stamp counter with a bare RDTSC (no CPUID in front, so
 * the read is cheap but not serialised against surrounding instructions).
 *
 * Returns the 64-bit counter assembled from EDX (high half) and EAX (low
 * half).
 *
 * `static inline` guarantees a definition in this translation unit; a plain
 * C99 `inline` definition emits no external symbol and can fail to link
 * when the call is not inlined.
 */
static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;

    /* RDTSC only writes EAX and EDX, so the EBX/ECX clobbers that were
     * needed for CPUID are dropped.  The "memory" clobber keeps the
     * compiler from moving the timed memory accesses past the timestamp. */
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi) : : "memory");

    return ((uint64_t)hi << 32) | lo;
}


/* Data summed by test() and flushed by main(): 1024 ints, file-local. */
static int v[1024];

/* Measurement log, one slot per recorded event. */
uint64_t t[5];   /* elapsed ticks per run; main() stores 0 as a flush marker */
int r[5];        /* sum computed by the corresponding run */
int ti = 0;      /* index of the next free slot in t[] and r[] */

/*
 * Time one pass that sums all 1024 elements of v[].
 *
 * The sum is stored in r[ti] and the elapsed tick count in t[ti]; ti is
 * then advanced.  Nothing is printed here, which keeps printf out of the
 * measured path; main() reports the recorded values afterwards.
 */
static inline void test(void)
{
    uint64_t start, end;
    int j = 0;   /* must start at zero: it was previously read uninitialised */

    start = rdtsc();
    for (int i = 0; i < 1024; i++)
    {
        j += v[i];
    }
    end = rdtsc();

    /* Storing the sum to a global keeps the loop from being optimised out. */
    r[ti] = j;
    t[ti++] = end - start;
}

/*
 * Experiment driver for the array version.
 *
 * Fills v[] with 0..1023, records two timed passes, records a flush marker,
 * runs clflush() across v[], records two more timed passes, and finally
 * prints everything that was recorded.
 *
 * ac, av: command-line arguments; not used.
 * Returns 0.
 */
int main(int ac, char **av)
{
    (void)ac;
    (void)av;

    for (int i = 0; i < 1024; i++)
    {
        v[i] = i;
    }

    test();
    test();

    /* A tick count of 0 is used as the "flush happened here" marker. */
    t[ti++] = 0;

    /* Steps by 4 ints at a time; presumably finer than the cache-line size
     * so every line of v[] is covered - confirm for the target CPU. */
    for (int i = 0; i < 1024; i += 4)
    {
        clflush(&v[i]);
    }

    test();
    test();

    for (int i = 0; i < ti; i++)
    {
        if (t[i] == 0)
        {
            printf("flush\n");
        }
        else
        {
            /* uint64_t is not `unsigned long` on every ABI, so "%lu" was a
             * format mismatch; print through unsigned long long instead. */
            printf("Test %llu [res=%d]\n", (unsigned long long)t[i], r[i]);
        }
    }
    printf("\n");

    return 0;
}

我将printf移出测试路径，以减少在那里花费的时间，并使刷新的区域更大。这提供了更长的运行时间，这肯定有助于测量。

Test 2538 [res=523776]
Test 2593 [res=523776]
flush
Test 4845 [res=523776]
Test 2592 [res=523776]

Test 2550 [res=523776]
Test 2771 [res=523776]
flush
Test 4782 [res=523776]
Test 2513 [res=523776]

Test 2550 [res=523776]
Test 2708 [res=523776]
flush
Test 4356 [res=523776]
Test 2593 [res=523776]

如您所见，与第一次访问相比，刷新之后读取数据所花的时间大约是原来的两倍。

编辑：

使用const，就像这样

/*
 * Const-qualified replacement for the test array.  Each initializer row
 * holds the values 1..16; only the first and last rows are shown, and the
 * snip comment marks 62 identical rows omitted between them.
 */
static const int v[1024] = 
{
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    /* snip 62 lines equal to this */
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
};

给出了这个结果：

Test 14139 [res=8704]
Test 2639 [res=8704]
flush
Test 5287 [res=8704]
Test 2597 [res=8704]

Test 12983 [res=8704]
Test 2652 [res=8704]
flush
Test 4859 [res=8704]
Test 2550 [res=8704]

Test 12911 [res=8704]
Test 2581 [res=8704]
flush
Test 4705 [res=8704]
Test 2649 [res=8704]

如您所见，第三次访问明显慢于第二次和第四次访问。第一次访问速度较慢，因为在第一次访问时（包括页表等），缓存中根本没有任何内容。

clflush在i7中没有给出const数据类型的正确答案

1 个答案: