我正在尝试使用nvprof工具分析CUDA程序。
这是我的代码:
#include <iostream>
#include <math.h>
#include <cuda_profiler_api.h>
// Kernel: element-wise in-place addition, y[i] = x[i] + y[i] for i in [0, n).
//
// Parameters:
//   n - number of elements to process; nothing is written when n <= 0.
//   x - input array, read only by this kernel.
//   y - input/output array, updated in place.
//
// Only the x-dimension of the grid and block is used for indexing. Each
// thread starts at its global x index and then advances by the total number
// of threads in the grid, so any 1D launch configuration covers all n
// elements.
__global__
void add(int n, float *x, float *y)
{
    // Do the index math in 64 bits: the 32-bit product blockIdx.x * blockDim.x
    // and the running `i += stride` could otherwise wrap for n close to
    // INT_MAX, yielding a negative index that still passes the `i < n` test.
    const long long count = n;
    const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < count; i += stride)
        y[i] = x[i] + y[i];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names
// the call being checked and is only used in the diagnostic message.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two N-element arrays on the GPU and prints the largest deviation from
// the expected value 3.0f. Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
    const int N = 1 << 10;  // 1024 elements
    const size_t bytes = N * sizeof(float);
    float *x = 0;
    float *y = 0;

    // Allocate Unified Memory – accessible from CPU or GPU
    bool ok = cudaOk(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)")
           && cudaOk(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)");

    if (ok) {
        // initialize x and y arrays on the host
        for (int i = 0; i < N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        // Run kernel on all N elements on the GPU; round the block count up
        // so a partial final block is still launched.
        const int blockSize = 256;
        const int numBlocks = (N + blockSize - 1) / blockSize;
        add<<<numBlocks, blockSize>>>(N, x, y);

        // A launch does not return a status itself: query the launch error
        // first, then wait for the GPU so execution errors surface before the
        // host reads the results.
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok) {
        // Check for errors (all values should be 3.0f)
        float maxError = 0.0f;
        for (int i = 0; i < N; i++)
            maxError = fmaxf(maxError, fabsf(y[i] - 3.0f));
        std::cout << "Max error: " << maxError << std::endl;
    }

    // Free memory. Each call is evaluated before `&& ok`, so cleanup always
    // runs even when an earlier step failed.
    ok = cudaOk(cudaFree(x), "cudaFree(x)") && ok;
    ok = cudaOk(cudaFree(y), "cudaFree(y)") && ok;

    // Stop the profiler before exit so that profile data is flushed, then
    // reset the device.
    ok = cudaOk(cudaProfilerStop(), "cudaProfilerStop") && ok;
    ok = cudaOk(cudaDeviceReset(), "cudaDeviceReset") && ok;

    return ok ? 0 : 1;
}
我使用命令nvcc add.cu -o add_cuda对其进行了编译。
然后我(以root身份)使用 nvprof ./add_cuda --unified-memory-profiling off 运行它,或者只用 nvprof(不带该选项)运行它,都得到以下输出:
==15318== NVPROF is profiling process 15318, command: ./add_cuda
Max error: 0
==15318== Profiling application: ./add_cuda
==15318== Profiling result:
No kernels were profiled.
No API activities were profiled.
==15318== Warning: Some profiling data are not recorded. Make sure cudaProfilerStop() or cuProfilerStop() is called before application exit to flush profile data.
======== Error: Application received signal 139
我在网上搜索过解决方案,例如“nvprof not picking up any API calls or kernels”以及 https://devtalk.nvidia.com/default/topic/1010691/visual-profiler/nvprof-error-code-139-but-memcheck-ok/ ,但实际上都没有帮助。
我如何使nvprof工作?
谢谢!
Fedora 29 64位
nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2019 NVIDIA Corporation
Release version 10.1.168 (21)
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Apr_24_19:10:27_PDT_2019
Cuda compilation tools, release 10.1, V10.1.168
nvidia-smi
Mon Jul 1 13:24:54 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26 Driver Version: 430.26 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 108... Off | 00000000:03:00.0 On | N/A |
| 0% 37C P8 20W / 250W | 253MiB / 11175MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 1741 G /usr/libexec/Xorg 154MiB |
| 0 2161 G cinnamon 96MiB |
+-----------------------------------------------------------------------------+
答案 0 :(得分:1)
使用 10.1 版本的 nvprof 时,我得到了相同的结果,但使用 10.0 版本的 nvprof 则可以正常工作。
可以尝试下载例如 CUDA 10.0,并改用该版本自带的 nvprof。即使程序是用 10.1 版本的 nvcc 编译的,这种方法对我也有效。