Question

我有一个简单的程序如下。当我在没有任何优化的情况下编译代码时，需要5.986s（用户3.677s，系统1.716s）才能在具有2.4G i5处理器和16GB DDR3-1600 9 CAS内存的mac上运行。我试图找出在这个程序中发生了多少次L1缓存未命中。有什么建议？谢谢！

// Test program from the question: allocates a 1024^3-element int array,
// writes every element once in ascending order, then frees the array.
// It uses new[]/delete[], so it is C++ rather than C.
// NOTE(review): `void main()` is not a standard signature; conforming code
// declares `int main()`.
void main()
{
    // 1024^3 elements; 4 GiB in total assuming a 4-byte int (not shown here).
    int size = 1024 * 1024 * 1024;
    int * a = new int[size];

    int i;
    // Single sequential pass: each element receives its own index.
    for (i = 0; i < size; i++) a[i] = i;
    delete[] a;
}

Answer 1

您可以使用valgrind的cachegrind工具来测量缓存未命中次数。这个页面提供了非常详细的摘要。

注意：如果你使用的是C，那么应该使用malloc。别忘了调用free：照目前的写法，你的程序会泄漏内存。如果你使用的是C++（那么此问题的标签有误），则应使用new和delete。

Answer 2

如果要对缓存未命中进行非常细粒度的测量，则应使用英特尔的架构性能计数器，它们可以通过rdpmc指令从用户空间访问。我在另一个回答（this answer）中编写的内核模块源代码可以为较旧的CPU在用户空间中启用rdpmc。

这是另一个内核模块，用于配置计数器，以测量最后一级缓存未命中和最后一级缓存引用。请注意，我把内核数硬编码为8，因为我所用的配置就是8个内核。

#include <linux/module.h>   /* Needed by all modules */
#include <linux/kernel.h>   /* Needed for KERN_INFO */

/*
 * MSR addresses and event-select control bits used by this module.
 * The code below programs two counters: the first uses each base address
 * as-is, the second uses base + 1.
 */
#define PERFEVTSELx_MSR_BASE   0x00000186
#define PMCx_MSR_BASE          0x000000c1      /* NB: write when evt disabled*/

/* Bits OR-ed into the value written to an event-select register. */
#define PERFEVTSELx_USR        (1U << 16)      /* count in rings 1, 2, or 3 */
#define PERFEVTSELx_OS         (1U << 17)      /* count in ring 0 */
#define PERFEVTSELx_EN         (1U << 22)      /* enable counter */

/*
 * Write the 64-bit value `val` to the model-specific register `msr`.
 *
 * The register index is passed in ECX and the value is split into a low
 * half (EAX) and a high half (EDX), as the operand constraints show.
 */
static void
write_msr(uint32_t msr, uint64_t val)
{
    const uint32_t low_half = (uint32_t)(val & 0xffffffff);
    const uint32_t high_half = (uint32_t)(val >> 32);

    __asm __volatile("wrmsr"
                     : /* no outputs */
                     : "c" (msr), "a" (low_half), "d" (high_half));
}

/*
 * Read the model-specific register `msr` and return its 64-bit contents.
 *
 * The register index is passed in ECX; the result comes back as two 32-bit
 * halves (EDX = high, EAX = low) that are recombined here.
 */
static uint64_t
read_msr(uint32_t msr)
{
    uint32_t eax_val, edx_val;

    __asm __volatile("rdmsr"
                     : "=d" (edx_val), "=a" (eax_val)
                     : "c" (msr));

    return (((uint64_t) edx_val) << 32) | (uint64_t) eax_val;
}

/*
 * Snapshots of the two event-select MSRs, taken in wrapper() before they are
 * overwritten and written back by restore_wrapper(). Indexed by
 * smp_processor_id(); sized for the 8-CPU machine this was written for.
 */
static uint64_t old_value_perfsel0[8];
static uint64_t old_value_perfsel1[8];

/*
 * Lock taken around the MSR programming sequence in wrapper().
 * DEFINE_SPINLOCK replaces the old `= SPIN_LOCK_UNLOCKED` initializer, which
 * is no longer provided by the kernel.
 */
static DEFINE_SPINLOCK(mr_lock);
/* Interrupt state saved by spin_lock_irqsave() in wrapper(). */
static unsigned long flags;

static void wrapper(void* ptr) {
    int id;
    uint64_t value;

    spin_lock_irqsave(&mr_lock, flags);

    id = smp_processor_id();

    // Save the old values before we do something stupid.
    old_value_perfsel0[id] = read_msr(PERFEVTSELx_MSR_BASE);
    old_value_perfsel1[id] = read_msr(PERFEVTSELx_MSR_BASE+1);


    // Clear out the existing counters
    write_msr(PERFEVTSELx_MSR_BASE, 0);
    write_msr(PERFEVTSELx_MSR_BASE + 1, 0);
    write_msr(PMCx_MSR_BASE, 0);
    write_msr(PMCx_MSR_BASE + 1, 0);
    if (clear){
        spin_unlock_irqrestore(&mr_lock, flags);
        return;
    }

    // Table 19-1 in the most recent Intel Manual - Architectural 
    // Last Level Cache References  Event select 2EH, Umask 4FH
    value = 0x2E | (0x4F << 8) |PERFEVTSELx_EN |PERFEVTSELx_OS|PERFEVTSELx_USR;
    write_msr(PERFEVTSELx_MSR_BASE, value);

    // Table 19-1 in the most recent Intel Manual - Architectural 
    // Last Level Cache Misses Event select 2EH, Umask 41H
    value = 0x2E | (0x41 << 8) |PERFEVTSELx_EN |PERFEVTSELx_OS|PERFEVTSELx_USR;
    write_msr(PERFEVTSELx_MSR_BASE + 1, value);


    spin_unlock_irqrestore(&mr_lock, flags);
}

/*
 * Per-CPU teardown routine, run on every CPU via on_each_cpu().
 *
 * Writes the event-select values saved for this CPU back into the two
 * event-select MSRs. Does nothing when `clear` is set (`clear` is defined
 * outside this snippet).
 *
 * `ptr` is the opaque argument supplied by on_each_cpu(); it is unused.
 */
static void restore_wrapper(void* ptr) {
    int id = smp_processor_id();

    if (clear) return;

    /*
     * The save arrays have a fixed size; a CPU id beyond them has no saved
     * state, and indexing with it would read past the end of the arrays.
     */
    if ((unsigned int)id >= ARRAY_SIZE(old_value_perfsel0)) return;

    write_msr(PERFEVTSELx_MSR_BASE, old_value_perfsel0[id]);
    write_msr(PERFEVTSELx_MSR_BASE+1, old_value_perfsel1[id]);
}

/*
 * Module entry point: runs wrapper() on every CPU to program the counters.
 *
 * Returns 0 (success) unconditionally.
 */
int init_module(void)
{
    printk(KERN_INFO "Entering write-msr!\n");
    /*
     * wait = 1: do not return until wrapper() has finished on every CPU, so
     * the counters are fully programmed by the time the module load completes.
     */
    on_each_cpu(wrapper, NULL, 1);
    /* 
     * A non 0 return means init_module failed; module can't be loaded. 
     */
    return 0;
}

/*
 * Module exit point: runs restore_wrapper() on every CPU to put the saved
 * event-select values back.
 */
void cleanup_module(void)
{
    /*
     * wait = 1 is required here: once this function returns the module's
     * code may be freed, so every CPU must have finished executing
     * restore_wrapper() before we return.
     */
    on_each_cpu(restore_wrapper, NULL, 1);
    printk(KERN_INFO "Exiting write-msr!\n");
}

这是围绕rdpmc的用户空间包装器。

/*
 * User-space wrapper around the rdpmc instruction.
 *
 * `ecx` is the counter index passed to the instruction in ECX. The result is
 * delivered as two 32-bit halves (EAX = low, EDX = high), which are combined
 * into one 64-bit value.
 */
uint64_t
read_pmc(int ecx)
{
    unsigned int lo_bits, hi_bits;

    __asm __volatile("rdpmc"
                     : "=a"(lo_bits), "=d"(hi_bits)
                     : "c"(ecx));

    return (((uint64_t)hi_bits) << 32) | (uint64_t)lo_bits;
}

Answer 3

您必须在64位系统上运行。您将4 GB数据设置为零。高速缓存未命中数为4 x 1024 x 1024 x 1024，除以高速缓存行大小。但是，由于所有内存访问都是顺序的，因此您不会有很多TLB未命中等，并且处理器最有可能优化对顺序缓存行的访问。

Answer 4

这里的性能（或者说性能的低下）完全由分页主导：每个新页面（在你的情况下可能是4k）都会导致一次缺页异常，因为它是新分配的、从未被使用过，并会触发开销很大的操作系统处理流程。Cachegrind和性能监视器应该会向您显示相同的行为，因此如果您预期的只是简单的数据访问，可能会感到困惑。

避免这种情况的一种方法是：先分配，然后对整个数组存储一次（甚至每页只存储一次）以预热页表，再在应用程序内部对主循环计时（使用rdtsc或任何您喜欢的C API）。
或者，如果您想使用外部时间测量，只需多次循环（> 1000）并摊销，因此初始惩罚将变得不那么重要。

一旦你完成所有这些，你测量到的缓存未命中数应该反映对每个新的64字节缓存行的访问次数（即约16M），再加上页表遍历（假设页面为4k，共256k个页面，再乘以页表级数，因为每一步都必须在每一级查找内存）。在虚拟化平台下，分页开销将变为平方级（例如，9次访问而不是3次访问），因为客户机页表的每一级也需要在宿主机上进行页表遍历。

这个简单的程序有多少缓存未命中？

4 个答案: