I want to create a basic CUDA application to demonstrate memory transfer / kernel execution overlap to my students. But looking at the nvvp timeline, there seems to be no concurrent execution at all. Can you help me figure out what is wrong?
Full source code (Visual Studio 2015, CUDA 8.0, compiled for sm_35 / compute_35, Titan X card):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <malloc.h>
#include <stdio.h>
#define MEMSIZE 8000000
#define STREAM_N 8
__global__ void TestKernel(char *img)
{
int pos = blockIdx.x * blockDim.x + threadIdx.x;
for (int k = 0; k < 100; k++)
img[pos] = img[pos] / 2 + 128;
}
int main()
{
// allocate memory and streams
char *img[STREAM_N];
char *d_img[STREAM_N];
cudaStream_t streams[STREAM_N];
for (int pi = 0; pi < STREAM_N; pi++)
{
cudaMalloc((void**)&d_img[pi], MEMSIZE / STREAM_N);
cudaMallocHost((void**)&img[pi], MEMSIZE / STREAM_N);
cudaStreamCreate(&streams[pi]);
}
// process packages one way
cudaError_t stat;
for (int pi = 0; pi < STREAM_N; pi++)
cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
for (int pi = 0; pi < STREAM_N; pi++)
TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
for (int pi = 0; pi < STREAM_N; pi++)
cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);
// process packages another way
for (int pi = 0; pi < STREAM_N; pi++)
{
cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);
}
cudaDeviceSynchronize();
// destroy streams and free memory
for (int pi = 0; pi < STREAM_N; pi++)
{
cudaStreamDestroy(streams[pi]);
cudaFreeHost(img[pi]);
cudaFree(d_img[pi]);
}
}
Visual Profiler output:
Answer (score 2):
WDDM command batching was causing the problem. The best solution is to switch the card's operating mode from WDDM to TCC, which can be done with the nvidia-smi command:
nvidia-smi -i <gpu_id> -dm 1
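As a quick sanity check (a minimal sketch of my own, not part of the original answer), you can also query the driver model and the number of copy engines from inside a CUDA program, using the tccDriver and asyncEngineCount fields of cudaDeviceProp:

#include "cuda_runtime.h"
#include <stdio.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // device 0; adjust if you have several GPUs

    // tccDriver == 1 means the card runs the TCC driver (no WDDM command batching);
    // asyncEngineCount reports how many copy engines are available for overlap.
    printf("Device: %s\n", prop.name);
    printf("TCC driver: %d\n", prop.tccDriver);
    printf("Async (copy) engines: %d\n", prop.asyncEngineCount);
    return 0;
}

Any asyncEngineCount of at least 1 means transfers can overlap with kernel execution; a value of 2 additionally allows host-to-device and device-to-host copies to overlap with each other.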
This solved my problem. The pattern I wanted to see: timeline
Another solution is to flush the command queue manually with cudaStreamQuery (source), like this:
for (int pi = 0; pi < STREAM_N; pi++)
{
    cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
    TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
    cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);
    cudaStreamQuery(streams[pi]); // FLUSH COMMAND QUEUE
}
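For the classroom demo it can also help to back up the nvvp timeline with a number. The following is a sketch of my own (not from the original answer) that wraps the depth-first loop above in CUDA event timing; if overlap works, this multi-stream pass should be clearly faster than issuing the same work in a single stream. The variable name elapsedMs is just illustrative, and the snippet is meant to be dropped into the existing main() where streams, img and d_img are already set up:

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start); // recorded in the default stream
    for (int pi = 0; pi < STREAM_N; pi++)
    {
        cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
        TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
        cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);
        cudaStreamQuery(streams[pi]); // flush the WDDM command queue for this stream
    }
    cudaEventRecord(stop);          // legacy default stream waits for all other streams
    cudaEventSynchronize(stop);

    float elapsedMs = 0.0f;
    cudaEventElapsedTime(&elapsedMs, start, stop);
    printf("Multi-stream pass: %.2f ms\n", elapsedMs);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

Note that cudaStreamQuery only polls the stream (returning cudaSuccess or cudaErrorNotReady); the return value can be ignored here, since the call is made purely for its side effect of submitting the batched WDDM commands.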