Question

是否有一种方法可以使CUDA的nvprof在其统计分析器中包含诸如malloc之类的函数调用？

我一直在尝试提高应用程序的性能。自然，我一直在使用nvprof作为工具。

最近，为了减少应用程序的GPU内存占用量，我编写了一些新代码，结果使运行时间延长到了原来的两倍。但是，导致变慢的新代码在分析器中只占很小的比例（指令采样表明，大约只有10％的时间花在新代码中，而按朴素的估计，应该有50％的时间花在新代码中）。也许是新代码导致了更多的缓存抖动（cache thrashing），也许是把实现放在头文件中、从而被内联的做法让分析器产生了混淆，等等。不过，在没有充分依据的情况下，我怀疑问题出在新代码对malloc的调用上。

实际上，在减少malloc调用次数之后，我的性能有所提高，几乎恢复到合并新代码之前的水平。

这使我想到了一个相关的问题：为什么对malloc的调用没有显示在统计分析器中？malloc调用是否属于某种无法被观察到的GPU系统调用？

下面，我将提供一个示例程序及其输出，以展示这个具体问题。

示例程序如下（其运行结果的省略版本在代码之后给出）：

#include <chrono>
#include <iostream>
#include <numeric>
#include <thread>
#include <stdlib.h>
#include <stdio.h>

// Forward declaration of the error-reporting helper defined later in this file.
// Arguments, in order: source file, line number, stringified statement, status code.
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
// Evaluates `value` (an expression yielding cudaError_t) and forwards it, together with
// __FILE__, __LINE__ and the expression's source text (#value), to CheckCudaErrorAux.
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

/**
 * Benchmark kernel containing only an arithmetic loop (no allocation).
 *
 * There is no thread-index guard, so every launched thread performs the same
 * work: it adds up the integers 0 .. 2^23 - 1 and prints the total through
 * device-side printf.
 */
__global__ void countup()
{
    const long limit = 1 << 23;   // exclusive upper bound of the summation
    long total = 0;
    long n = 0;
    while (n < limit) {
        total += n;
        ++n;
    }
    printf("sum is %li\n", total);
}

/**
 * Benchmark kernel dominated by in-kernel heap traffic.
 *
 * Every launched thread performs 3 * 2^17 malloc/free pairs of a single int.
 * The returned pointer values are accumulated and printed; the total has no
 * meaning of its own (presumably it only keeps the allocations observably
 * used so the loop is not discarded by the compiler).
 */
__global__ void malloc_a_lot() {
    const int rounds = (1 << 17) * 3;   // number of malloc/free pairs
    long addr_total = 0;
    for (int r = 0; r != rounds; ++r) {
        int *cell = static_cast<int *>(malloc(sizeof(int)));
        addr_total += (long) cell;
        free(cell);
    }
    printf("sum is %li\n", addr_total);
}

/**
 * Benchmark kernel that runs the arithmetic loop and the malloc/free loop
 * back to back inside one kernel.
 *
 * Phase 1 sums the integers 0 .. 2^23 - 1 and prints the total.
 * Phase 2 performs 3 * 2^17 single-int malloc/free pairs, accumulates the
 * returned pointer values, and prints that total.
 * There is no thread-index guard, so every launched thread runs both phases.
 */
__global__ void both() {
    // Phase 1: pure arithmetic.
    const long limit = 1 << 23;
    long total = 0;
    long n = 0;
    while (n < limit) {
        total += n;
        ++n;
    }
    printf("sum is %li\n", total);

    // Phase 2: device-heap allocation and release; the accumulator is reused.
    const int rounds = (1 << 17) * 3;
    total = 0;
    for (int r = 0; r != rounds; ++r) {
        int *cell = static_cast<int *>(malloc(sizeof(int)));
        total += (long) cell;
        free(cell);
    }
    printf("sum is %li\n", total);
}


/**
 * Launches the three benchmark kernels one after another (8 blocks of 1 thread
 * each) and prints the wall-clock time each one took, in seconds.
 *
 * Timing uses std::chrono::steady_clock: it is monotonic, so the measured
 * intervals cannot be distorted by system-time adjustments the way
 * system_clock intervals can.
 *
 * Every launch is followed by cudaGetLastError() so that launch-time failures
 * are reported at the launch site, and by cudaDeviceSynchronize() so that the
 * timestamp is taken only after the kernel has finished (and so that
 * execution-time failures are reported).
 *
 * Returns 0 on success; any CUDA error terminates the process through
 * CUDA_CHECK_RETURN.
 */
int main(void)
{
    typedef std::chrono::steady_clock Clock;

    // Synchronize once before the first timestamp; presumably this keeps one-time
    // runtime initialization out of the first measured interval.
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    const Clock::time_point t1 = Clock::now();
    countup<<<8,1>>>();
    CUDA_CHECK_RETURN(cudaGetLastError());
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    const Clock::time_point t2 = Clock::now();
    malloc_a_lot<<<8,1>>>();
    CUDA_CHECK_RETURN(cudaGetLastError());
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    const Clock::time_point t3 = Clock::now();
    both<<<8,1>>>();
    CUDA_CHECK_RETURN(cudaGetLastError());
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    const Clock::time_point t4 = Clock::now();

    // duration<double> with the default period expresses the intervals in seconds.
    const std::chrono::duration<double> duration_1_to_2 = t2 - t1;
    const std::chrono::duration<double> duration_2_to_3 = t3 - t2;
    const std::chrono::duration<double> duration_3_to_4 = t4 - t3;
    printf("timer for countup() took %.3lf\n", duration_1_to_2.count());
    printf("timer for malloc_a_lot() took %.3lf\n", duration_2_to_3.count());
    printf("timer for both() took %.3lf\n", duration_3_to_4.count());

    return 0;
}

/**
 * Reports a failed CUDA call and terminates the process.
 *
 * @param file      source file of the failing call
 * @param line      line number of the failing call
 * @param statement source text of the expression that was evaluated
 * @param err       status returned by that expression
 *
 * Does nothing when err is cudaSuccess. Otherwise writes the statement, the
 * error string, the numeric code and the location to std::cerr, then calls
 * exit(1).
 */
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
    if (err != cudaSuccess) {
        std::cerr << statement << " returned " << cudaGetErrorString(err)
                  << "(" << err << ") at " << file << ":" << line << std::endl;
        exit (1);
    }
}

分析结果如下图所示。将鼠标悬停在浅蓝色条上时显示的数字与条的大小一致。具体而言，第41行具有与之关联的16,515,077个样本，但第47行仅具有633,996个样本。

顺便说一句，上面的程序是带调试信息编译的，并且大概没有启用优化——即Nsight Eclipse中默认的“Debug”模式。如果我以“Release”模式进行编译，则会启用优化，此时countup()调用的持续时间非常接近0秒。

上述（Debug模式）程序运行结果的省略版本是：

sum is 35184367894528...
sum is -319453208467532096...
sum is 35184367894528...
sum is -319453208467332416...
timer for countup() took 4.034
timer for malloc_a_lot() took 4.306
timer for both() took 8.343

Answer 1

当前的NVIDIA GPU PC采样器仅收集当前warp的程序计数器（而不是调用堆栈）。PC采样器能够正确收集malloc内部的样本；但是，该工具不会显示内部系统调用（syscall）的SASS或高级源代码。

（1）该工具没有用于显示syscall模块中样本总数的UI。
（2）该工具不知道malloc、free或其他syscall的PC范围，因而无法将样本正确地归因于用户所调用的syscall。

如果（1）或（2）得到修复，则这些数据将显示在单独的一行中，并简单地标记为“syscall”或“malloc”。硬件不收集调用堆栈，因此无法将样本归因于L48。

如何使malloc出现在nvprof的统计分析器中？

1 个答案: