CUDA concurrent execution problem

Date: 2016-09-05 14:17:45

Tags: concurrency, cuda

I want to build a basic CUDA application that demonstrates memory transfer/kernel execution overlap to my students. But looking at the nvvp timeline, there seems to be no concurrent execution at all. Can you help me figure out what is wrong?

Full source code (Visual Studio 2015, CUDA 8.0, sm_35/compute_35, Titan X card):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <malloc.h>
#include <stdio.h>

#define MEMSIZE 8000000
#define STREAM_N 8

// Artificial workload: rescale each byte repeatedly so the kernel runs long
// enough for its execution to overlap with the asynchronous copies.
__global__ void TestKernel(char *img)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    for (int k = 0; k < 100; k++)
        img[pos] = img[pos] / 2 + 128;
}

int main()
{
    // allocate memory and streams
    char *img[STREAM_N];
    char *d_img[STREAM_N];
    cudaStream_t streams[STREAM_N];

    for (int pi = 0; pi < STREAM_N; pi++)
    {
        cudaMalloc((void**)&d_img[pi], MEMSIZE / STREAM_N);
        cudaMallocHost((void**)&img[pi], MEMSIZE / STREAM_N); // pinned host memory, required for overlapping async copies
        cudaStreamCreate(&streams[pi]);
    }

    // process packages one way: issue all H2D copies, then all kernels, then all D2H copies
    for (int pi = 0; pi < STREAM_N; pi++)
        cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
    for (int pi = 0; pi < STREAM_N; pi++)
        TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
    for (int pi = 0; pi < STREAM_N; pi++)
        cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);

    // process packages another way: interleave copy/kernel/copy per stream
    for (int pi = 0; pi < STREAM_N; pi++) 
    {
        cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
        TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
        cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);
    }
    cudaDeviceSynchronize();

    // destroy streams and free memory
    for (int pi = 0; pi < STREAM_N; pi++)
    {
        cudaStreamDestroy(streams[pi]);
        cudaFreeHost(img[pi]);
        cudaFree(d_img[pi]);
    }
}
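
Side note (my addition, not part of the original question): none of the CUDA calls above check their return values. A minimal error-checking sketch that could wrap each call:

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n", \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
    } while (0)

// usage: CUDA_CHECK(cudaMalloc((void**)&d_img[pi], MEMSIZE / STREAM_N));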

Visual Profiler output:

[screenshot: performance analysis timeline, showing no overlap between transfers and kernels]

1 Answer:

Answer 0 (score: 2)

WDDM command batching caused the problem. Under the WDDM driver model, the CUDA driver batches GPU commands to reduce submission overhead, and a batch may not be flushed to the hardware until the queue fills up or a blocking call is made; this can serialize work that was issued to different streams. The best solution is to switch the card's operating mode from WDDM to TCC, which can be done with the nvidia-smi command:

nvidia-smi -i <gpu_id> -dm 1
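
Note that switching the driver model requires administrator privileges and takes effect only after a reboot, and TCC is not supported on every card. To check which mode is currently active, something like the following should work (my addition, not from the original answer; the exact field name may vary by driver version):

nvidia-smi -q -i <gpu_id> | findstr /C:"Driver Model"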

This solved my problem. The pattern I expected to see: [screenshot: timeline with overlapping transfers and kernel executions]

Another solution is to flush the command queue manually with cudaStreamQuery (source). The query itself returns immediately, but as a side effect it forces the WDDM driver to submit the commands batched so far:

for (int pi = 0; pi < STREAM_N; pi++)
{
    cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
    TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
    cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);
    cudaStreamQuery(streams[pi]); // FLUSH COMMAND QUEUE
}
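
As a side note, the overlap can also be demonstrated without the profiler by timing the loop with CUDA events. A minimal sketch (my addition, reusing the buffers and streams from the question):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);

for (int pi = 0; pi < STREAM_N; pi++)
{
    cudaMemcpyAsync(d_img[pi], img[pi], MEMSIZE / STREAM_N, cudaMemcpyHostToDevice, streams[pi]);
    TestKernel <<< MEMSIZE / STREAM_N / 400, 400, 0, streams[pi] >>>(d_img[pi]);
    cudaMemcpyAsync(img[pi], d_img[pi], MEMSIZE / STREAM_N, cudaMemcpyDeviceToHost, streams[pi]);
    cudaStreamQuery(streams[pi]); // flush the WDDM command queue
}

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("streamed round trip: %.2f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);

Issuing the same work into a single stream gives a serialized baseline to compare the elapsed time against.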