cuda内核不能并发执行

时间:2013-01-02 15:34:47

标签: cuda

我正在尝试探索具有2.0功能的Nvidia Quadro 4000的并发内核执行属性。

我使用2个不同的流,运行方式如下:

  1. 将两个不同的固定内存块以H2D方式复制到设备
  2. 运行内核
  3. 将两个不同的块以D2H方式复制回固定内存
  4. 两个流的内核完全相同,每个内核的执行时间为190毫秒。

    在Visual Profiler(版本5.0)中,我希望两个内核同时开始执行,但它们只重叠了20毫秒。这是代码示例:

    enter code here
    
// NOTE(review): fragment of a larger function -- the function signature and the
// declarations of the device/host pointers (def_img*, ref_img*, out*_img*, host*),
// the inputs in1/in2, the outputs outX/outY, the size variables, and the
// CHK_ERR/TIME_KERNEL macros all live outside this excerpt.
//initiate the streams
            cudaStream_t stream0,stream1;
            CHK_ERR(cudaStreamCreate(&stream0));
            CHK_ERR(cudaStreamCreate(&stream1));
            //allocate the memory on the GPU for stream0
            CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
            //allocate the memory on the GPU for stream1
            CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
            CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));
    
            //allocate page-locked memory for stream0
            // Pinned (page-locked) host buffers are required for cudaMemcpyAsync to
            // overlap with other work instead of degrading to a synchronous copy.
            CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
    
            //allocate page-locked memory for stream1
            CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
            CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
    
    
            // Stage identical input images into both streams' pinned buffers so the
            // two streams perform the same amount of work.
            memcpy(host01,in1,width*height*sizeof(char));
            memcpy(host02,in2,width*height*sizeof(char));
    
            memcpy(host11,in1,width*height*sizeof(char));
            memcpy(host12,in2,width*height*sizeof(char));
    
    
    
            // Timing events bracket the whole two-stream sequence.
            // NOTE(review): both events are recorded on stream 0 of the legacy default
            // stream semantics (the literal 0 below); the legacy default stream
            // serializes with other blocking streams -- confirm this is intended.
            cudaEvent_t start, stop;
            float time;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
    
            dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
            // Each block covers a (blockDim*4)^2 output tile; grid sizes use
            // ceil-division over the interior region inside SEARCH_RADIUS.
            dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);
    
            cudaEventRecord(start,0);
            // --------------------
            // Copy images to device
            // --------------------
            // H2D copies are issued breadth-first (alternating streams), which is the
            // recommended issue order for overlap on devices with one copy engine.
            //enqueue copies of def stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
            //enqueue copies of ref stream0 and stream1
            CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
            CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
    
            // NOTE(review): these two synchronizations are not needed for correctness
            // -- within a stream the kernel already waits for its preceding copies --
            // and they block the host, preventing copy/kernel overlap across streams.
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));
    
            //CALLING KERNEL
            //enqueue kernel in stream0 and stream1
            // NOTE(review): if TIME_KERNEL synchronizes the device after each launch
            // (typical for timing macros), the two kernels are serialized here and can
            // never overlap -- TODO confirm the macro definition. On CC 2.x hardware a
            // single work queue further limits concurrent kernel execution.
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
            TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");
    
    
            //Copy result back
            CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));
    
            CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
            CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));
    
    
            // Wait for both streams' D2H copies before reading the host buffers.
            CHK_ERR(cudaStreamSynchronize(stream0));
            CHK_ERR(cudaStreamSynchronize(stream1));
    
            cudaEventRecord( stop, 0 );
            cudaEventSynchronize( stop );
            cudaEventElapsedTime( &time, start, stop );
            printf("Elapsed time=%f ms\n",time);
    
            // NOTE(review): only stream0's results (host03/host04) are propagated to
            // outX/outY; host13/host14 are never consumed -- presumably intentional
            // since both streams compute identical output, but verify.
            memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
            memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));
    
    
            cudaEventDestroy( start ); 
            cudaEventDestroy( stop );
            CHK_ERR(cudaStreamDestroy(stream0));
            CHK_ERR(cudaStreamDestroy(stream1));
    
            // No individual cudaFree/cudaFreeHost calls; cudaDeviceReset releases all
            // device-side allocations (acceptable here only because the process is done
            // with the device).
            CHK_ERR(cudaDeviceReset());
    
    
        } 
    

1 个答案:

答案 0 :(得分:3)

计算能力2.x-3.0

计算能力2.x-3.0的设备只有一个硬件工作队列。CUDA驱动程序将命令推入该工作队列。GPU的主机接口(host interface)读取命令,并把工作分派给复制引擎或CUDA工作分配器(CWD)。CUDA驱动程序会向硬件工作队列中插入同步命令,以保证同一流上的工作不会并发运行。当主机接口遇到同步命令时,它会停顿,直到所依赖的工作完成为止。

当网格太小而无法填充整个GPU或网格具有尾部效应(线程块的子集执行时间比其他线程块长得多)时,并发内核执行可提高GPU利用率。

案例1:在一个流上背靠背内核

如果应用程序在同一个流上背靠背启动两个内核,那么由于CUDA驱动程序插入的同步命令,第二个内核要等到第一个内核完成后才会被调度到CWD。

案例2:在两个流上启动背靠背内核

如果应用程序在不同的流上启动两个内核,主机将读取命令并将命令分派给CWD。 CWD将栅格化第一个网格(顺序依赖于体系结构)并将线程块分派给SM。只有当调度了第一个网格中的所有线程块时,CWD才会从第二个网格中调度线程块。

计算能力3.5

计算能力3.5引入了几项新功能以提高GPU利用率,其中包括:

  - HyperQ支持多个独立的硬件工作队列。
  - 动态并行允许设备代码启动新的工作。
  - CWD容量增加到32个网格。

**资源**