Location of cudaEventRecord and overlapping ops, when the second kernel finishes first

Time: 2012-08-20 22:22:48

Tags: cuda gpu nvidia

I have two tasks. Each of them performs a copy-to-device (D) op and a kernel-run (R) op. The tasks have different kernel runtimes: R1 takes about 5x as long as R2 to complete (R1 = ~17 ms, R2 = ~3.5 ms). The kernels of the tasks perform a wait operation, and I allow these kernels to run concurrently. Each copy op takes 7 ms.

I have a GeForce GT 555M, CUDA 4.1, and Fedora 16.

I record the start and stop times of the D and R ops of each task using cudaEventRecord. I use two streams per task, and cudaStreamWaitEvent to synchronize these two streams of a task: one stream is for the task's D op, the other for its R op. My goal is to overlap D2 with R1. I measure the overall timing of task1 and task2 to figure out whether this overlap is achieved or not.
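To make the pattern concrete, here is a minimal sketch for one task (copyStream, kernelStream, and evCopyDone are illustrative names; my full listing below uses csStream1..csStream4 and the ceEv* events):

cudaEvent_t evCopyDone;
cudaEventCreate(&evCopyDone);

cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, copyStream); // D op
cudaEventRecord(evCopyDone, copyStream);                             // marks end of D

cudaStreamWaitEvent(kernelStream, evCopyDone, 0); // R waits only on this task's D
wait_k<<<1, 1, 0, kernelStream>>>(time_clocks);   // R op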

I have two scenarios. In Scenario1, "start R1" is placed before the kernels and "start R2" is placed in between the kernels. In Scenario2, "start R1" and "start R2" are both placed before the kernels.

For the pseudocode given below, Scenario1 and Scenario2 do not behave the same: while Scenario2 fails to overlap D2 with R1, Scenario1 succeeds in doing so! So my question is: when R2 is shorter than R1, why do we have to place "start R2" in between the kernels (instead of before them) in order to overlap D2 with R1? (Note that I have also tested the case where R1 is shorter than R2. In that case, placing "start R2" before or in between the kernels makes no difference; in both cases we can overlap D2 with R1. Once D2 is done, we can also run R1 and R2 concurrently.)

Here is the pseudocode for Scenario1 and 2 (I use stream1 and stream3 for task1, and stream2 and stream4 for task2):

Scenario1 (SUCCEEDS):

start overall

start D1 on stream1
D1 on stream1
stop D1 on stream1

start D2 on stream2
D2 on stream2
stop D2 on stream2

start R1 on stream3

R1 on stream3 //longer

start R2 on stream4 // start R2 is in between kernels

R2 on stream4 //shorter

stop R2 on stream4
stop R1 on stream3

stop overall

Scenario2 (FAILS):

start overall

start D1 on stream1
D1 on stream1
stop D1 on stream1

start D2 on stream2
D2 on stream2
stop D2 on stream2

start R1 on stream3

start R2 on stream4 // start R2 is before kernels

R1 on stream3 //longer

R2 on stream4 //shorter

stop R2 on stream4
stop R1 on stream3

stop overall 

The overall times for the scenarios are as follows:

Scenario1 = 24.109312 ms

Scenario2 = 31.194496 ms

The expected overall runtime for these scenarios is D1 + R1 = 7 + 17 = 24 ms (we can overlap D2 with R1, and also run R1 and R2 concurrently). Although Scenario1 achieves this runtime, Scenario2 fails to do so, because Scenario2 cannot overlap D2 with R1. (D2 takes 7 ms, which is why Scenario2's runtime is 24 + 7 = 31 ms.)

I have also attached the CUDA code below:

#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/time.h>

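// wait_k busy-waits until roughly time_clocks device clock cycles have
// elapsed, emulating a kernel whose runtime can be controlled from the host.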
__global__ void wait_k(long time_clocks)
{ 
    long start_clock = clock();

    long clock_offset = 0;

    while( clock_offset < time_clocks) {
        clock_offset = clock() - start_clock;
    }
}


void shorterR2_D2_R1_Overlap()
{
    float *h_A;
    float *d_A, *d_C;
    float *h_A2;
    float *d_A2, *d_C2;

    int N = 10000000;
    size_t size = N * sizeof(float);

    cudaMallocHost((void**) &h_A, size);
    cudaMallocHost((void**) &h_A2, size);

    // Allocate vectors in device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_C, size);
    cudaMalloc((void**)&d_A2, size);
    cudaMalloc((void**)&d_C2, size);

    for (int i = 0; i < N; ++i)
    {
        h_A[i] = 1;
        h_A2[i] = 5;
    }

    cudaStream_t csStream1, csStream2, csStream3, csStream4;

    cudaStreamCreate(&csStream1);
    cudaStreamCreate(&csStream2);
    cudaStreamCreate(&csStream3);
    cudaStreamCreate(&csStream4);

    //allocate vars for dummy copy
    float* h_pfDummy;
    float* d_pfDummy;
    size_t iMemSz = 10 * sizeof(float);
    cudaMallocHost((void**) &h_pfDummy, iMemSz);
    cudaMalloc((void**)&d_pfDummy, iMemSz);

    cudaMemcpyAsync(d_pfDummy, h_pfDummy, iMemSz, cudaMemcpyHostToDevice, csStream1);
    cudaMemcpyAsync(d_pfDummy, h_pfDummy, iMemSz, cudaMemcpyHostToDevice, csStream2);

    //delete vars of dummy copy
    cudaFree(d_pfDummy);
    cudaFreeHost(h_pfDummy);

    long time_clocks = 20000000;
    long div = 5;

    cudaEvent_t ceEvStart, ceEvStop;
    cudaEventCreate( &ceEvStart );
    cudaEventCreate( &ceEvStop );

    //per-stream timing events
    cudaEvent_t ceEvStartCpyDev1, ceEvStopCpyDev1, ceEvStartKer1, ceEvStopKer1;
    cudaEventCreate( &ceEvStartCpyDev1 );
    cudaEventCreate( &ceEvStopCpyDev1 );
    cudaEventCreate( &ceEvStartKer1 );
    cudaEventCreate( &ceEvStopKer1 );
    cudaEvent_t ceEvStartCpyDev2, ceEvStopCpyDev2, ceEvStartKer2, ceEvStopKer2;
    cudaEventCreate( &ceEvStartCpyDev2 );
    cudaEventCreate( &ceEvStopCpyDev2 );
    cudaEventCreate( &ceEvStartKer2 );
    cudaEventCreate( &ceEvStopKer2 );

    //Scenario1: put start R1 before kernels and start R2 between kernels
    cudaDeviceSynchronize();

    cudaEventRecord(ceEvStart, 0);

    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);

    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);

    //insert runker1 start event before concurrent kernels
    cudaStreamWaitEvent(csStream3, ceEvStopCpyDev1, 0);
    cudaEventRecord(ceEvStartKer1, csStream3);

    wait_k<<<1,1,0,csStream3>>>(time_clocks);

    //insert runker2 start event between concurrent kernels
    cudaStreamWaitEvent(csStream4, ceEvStopCpyDev2, 0);
    cudaEventRecord(ceEvStartKer2, csStream4);

    wait_k<<<1,1,0,csStream4>>>(time_clocks/div);

    cudaEventRecord(ceEvStopKer2, csStream4);
    cudaEventRecord(ceEvStopKer1, csStream3);

    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();

    float fTim1;
    cudaEventElapsedTime( &fTim1, ceEvStart, ceEvStop);
    printf("Scenario1 overall runtime = %10f\n", fTim1);

    //Scenario2: put start R1 and start R2 before kernels
    cudaDeviceSynchronize();

    cudaEventRecord(ceEvStart, 0);

    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);

    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);

    //insert runker1 start event before concurrent kernels
    cudaStreamWaitEvent(csStream3, ceEvStopCpyDev1, 0);
    cudaEventRecord(ceEvStartKer1, csStream3);

    //insert runker2 start event before concurrent kernels
    cudaStreamWaitEvent(csStream4, ceEvStopCpyDev2, 0);
    cudaEventRecord(ceEvStartKer2, csStream4);

    wait_k<<<1,1,0,csStream3>>>(time_clocks);

    wait_k<<<1,1,0,csStream4>>>(time_clocks/div);

    cudaEventRecord(ceEvStopKer2, csStream4);
    cudaEventRecord(ceEvStopKer1, csStream3);

    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();

    float fTim2;
    cudaEventElapsedTime( &fTim2, ceEvStart, ceEvStop);
    printf("Scenario2 overall runtime = %10f\n", fTim2);
}

int main()
{
    shorterR2_D2_R1_Overlap();
}

Thanks a lot for your help!

1 answer:

Answer 0 (score: 0)

Compute capability 1.0 - 3.0 devices have a single push buffer for submitting work to the GPU. Work is submitted in the order of the CUDA API calls. In Scenario 2, the push buffer cannot execute commands past cudaStreamWaitEvent(csStream4, ceEvStopCpyDev2, 0); until ceEvStopCpyDev2 has completed.
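Concretely, here is a sketch of the two issue orders using the stream, event, and kernel names from your listing (illustrative, not a drop-in patch):

// Scenario2 issue order: every command queued behind this wait stalls in the
// push buffer until ceEvStopCpyDev2 (end of D2) completes -- including the
// R1 launch, so R1 cannot overlap with D2.
cudaStreamWaitEvent(csStream4, ceEvStopCpyDev2, 0);
wait_k<<<1,1,0,csStream3>>>(time_clocks);            // R1, submitted only after D2

// Scenario1 issue order: the R1 launch is submitted before the wait, so R1
// starts (once D1 is done) and runs while D2 is still copying; only stream4
// then waits for D2.
wait_k<<<1,1,0,csStream3>>>(time_clocks);            // R1
cudaStreamWaitEvent(csStream4, ceEvStopCpyDev2, 0);
wait_k<<<1,1,0,csStream4>>>(time_clocks/div);        // R2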

The presentation CUDA C/C++ Streams and Concurrency (pdf | video) contains more information on this topic. The Stream Scheduling slides have additional details on the issue you are observing.