I have two tasks. Each of them performs a copy-to-device (D), a kernel run (R), and a copy-to-host (H) operation. I am overlapping the kernel run of task1 (R1) with the copy to device of task2 (D2). In addition, I am overlapping the kernel run of task2 (R2) with the copy to host of task1 (H1).
I also record the start and stop times of the D, R, and H operations of each task using cudaEventRecord.
I have a GeForce GT 555M, CUDA 4.1, and Fedora 16.
I have three scenarios:
Scenario1: I use one stream per task. I place the start/stop events immediately before/after each operation.
Scenario2: I use one stream per task. I place the start event of the second operation of each overlapping pair before the first operation begins (i.e. I place start R1 before starting D2, and start H1 before starting R2).
Scenario3: I use two streams per task. I use cudaStreamWaitEvent to synchronize the two streams. One stream is used for the D and H (copy) operations, the other for the R operation. I place the start/stop events immediately before/after each operation.
Scenario1 fails to overlap the operations (neither D2-R1 nor R2-H1 overlap), whereas Scenario2 and Scenario3 succeed. My question is: why does Scenario1 fail while the others succeed?
For each scenario I measure the overall time needed to execute Task1 and Task2. R1 and R2 each take 5 ms to run. Since Scenario1 fails to overlap the operations, its overall time is 10 ms longer than that of Scenario2 and Scenario3.
Here is pseudocode for the scenarios:
Scenario1 (FAILS): use stream1 for task1 and stream2 for task2
start overall
start D1 on stream1
D1 on stream1
stop D1 on stream1
start D2 on stream2
D2 on stream2
stop D2 on stream2
start R1 on stream1
R1 on stream1
stop R1 on stream1
start R2 on stream2
R2 on stream2
stop R2 on stream2
start H1 on stream1
H1 on stream1
stop H1 on stream1
start H2 on stream2
H2 on stream2
stop H2 on stream2
stop overall
Scenario2 (SUCCEEDS): use stream1 for task1 and stream2 for task2; move the start event of the second operation of each overlapping pair up.
start overall
start D1 on stream1
D1 on stream1
stop D1 on stream1
start R1 on stream1 //moved-up
start D2 on stream2
D2 on stream2
stop D2 on stream2
R1 on stream1
stop R1 on stream1
start H1 on stream1 //moved-up
start R2 on stream2
R2 on stream2
stop R2 on stream2
H1 on stream1
stop H1 on stream1
start H2 on stream2
H2 on stream2
stop H2 on stream2
stop overall
Scenario3 (SUCCEEDS): use streams 1 and 3 for task1, and streams 2 and 4 for task2
start overall
start D1 on stream1
D1 on stream1
stop D1 on stream1
start D2 on stream2
D2 on stream2
stop D2 on stream2
start R1 on stream3
R1 on stream3
stop R1 on stream3
start R2 on stream4
R2 on stream4
stop R2 on stream4
start H1 on stream1
H1 on stream1
stop H1 on stream1
start H2 on stream2
H2 on stream2
stop H2 on stream2
stop overall
Here are the overall times (in ms) for all scenarios: Scenario1 = 39.390240, Scenario2 = 29.190241, Scenario3 = 29.298208
I am also attaching the CUDA code below:
#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/time.h>
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
    {
        C[i] = A[i] + B[N-i];
        C[i] = A[i] + B[i] * 2;
        C[i] = A[i] + B[i] * 3;
        C[i] = A[i] + B[i] * 4;
        C[i] = A[i] + B[i];
    }
}
void overlap()
{
    float* h_A;
    float *d_A, *d_C;
    float* h_A2;
    float *d_A2, *d_C2;
    int N = 10000000;
    size_t size = N * sizeof(float);
    cudaMallocHost((void**) &h_A, size);
    cudaMallocHost((void**) &h_A2, size);
    // Allocate vectors in device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_C, size);
    cudaMalloc((void**)&d_A2, size);
    cudaMalloc((void**)&d_C2, size);
    float fTimCpyDev1, fTimKer1, fTimCpyHst1, fTimCpyDev2, fTimKer2, fTimCpyHst2;
    float fTimOverall3, fTimOverall1, fTimOverall2;
    for (int i = 0; i < N; ++i)
    {
        h_A[i] = 1;
        h_A2[i] = 5;
    }
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    cudaStream_t csStream1, csStream2, csStream3, csStream4;
    cudaStreamCreate(&csStream1);
    cudaStreamCreate(&csStream2);
    cudaStreamCreate(&csStream3);
    cudaStreamCreate(&csStream4);
    cudaEvent_t ceEvStart, ceEvStop;
    cudaEventCreate( &ceEvStart );
    cudaEventCreate( &ceEvStop );
    cudaEvent_t ceEvStartCpyDev1, ceEvStopCpyDev1, ceEvStartKer1, ceEvStopKer1, ceEvStartCpyHst1, ceEvStopCpyHst1;
    cudaEventCreate( &ceEvStartCpyDev1 );
    cudaEventCreate( &ceEvStopCpyDev1 );
    cudaEventCreate( &ceEvStartKer1 );
    cudaEventCreate( &ceEvStopKer1 );
    cudaEventCreate( &ceEvStartCpyHst1 );
    cudaEventCreate( &ceEvStopCpyHst1 );
    cudaEvent_t ceEvStartCpyDev2, ceEvStopCpyDev2, ceEvStartKer2, ceEvStopKer2, ceEvStartCpyHst2, ceEvStopCpyHst2;
    cudaEventCreate( &ceEvStartCpyDev2 );
    cudaEventCreate( &ceEvStopCpyDev2 );
    cudaEventCreate( &ceEvStartKer2 );
    cudaEventCreate( &ceEvStopKer2 );
    cudaEventCreate( &ceEvStartCpyHst2 );
    cudaEventCreate( &ceEvStopCpyHst2 );
    //Scenario1
    cudaDeviceSynchronize();
    cudaEventRecord(ceEvStart, 0);
    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);
    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);
    cudaEventRecord(ceEvStartKer1, csStream1);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream1>>>(d_A, d_A, d_C, N);
    cudaEventRecord(ceEvStopKer1, csStream1);
    cudaEventRecord(ceEvStartKer2, csStream2);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream2>>>(d_A2, d_A2, d_C2, N);
    cudaEventRecord(ceEvStopKer2, csStream2);
    cudaEventRecord(ceEvStartCpyHst1, csStream1);
    cudaMemcpyAsync(h_A, d_C, size, cudaMemcpyDeviceToHost, csStream1);
    cudaEventRecord(ceEvStopCpyHst1, csStream1);
    cudaEventRecord(ceEvStartCpyHst2, csStream2);
    cudaMemcpyAsync(h_A2, d_C2, size, cudaMemcpyDeviceToHost, csStream2);
    cudaEventRecord(ceEvStopCpyHst2, csStream2);
    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();
    cudaEventElapsedTime( &fTimOverall1, ceEvStart, ceEvStop);
    printf("Scenario1 overall time= %10f\n", fTimOverall1);
    //Scenario2
    cudaDeviceSynchronize();
    cudaEventRecord(ceEvStart, 0);
    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);
    cudaEventRecord(ceEvStartKer1, csStream1); //moved up
    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream1>>>(d_A, d_A, d_C, N);
    cudaEventRecord(ceEvStopKer1, csStream1);
    cudaEventRecord(ceEvStartCpyHst1, csStream1); //moved up
    cudaEventRecord(ceEvStartKer2, csStream2);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream2>>>(d_A2, d_A2, d_C2, N);
    cudaEventRecord(ceEvStopKer2, csStream2);
    cudaMemcpyAsync(h_A, d_C, size, cudaMemcpyDeviceToHost, csStream1);
    cudaEventRecord(ceEvStopCpyHst1, csStream1);
    cudaEventRecord(ceEvStartCpyHst2, csStream2);
    cudaMemcpyAsync(h_A2, d_C2, size, cudaMemcpyDeviceToHost, csStream2);
    cudaEventRecord(ceEvStopCpyHst2, csStream2);
    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();
    cudaEventElapsedTime( &fTimOverall2, ceEvStart, ceEvStop);
    printf("Scenario2 overall time= %10f\n", fTimOverall2);
    //Scenario3
    cudaDeviceSynchronize();
    cudaEventRecord(ceEvStart, 0);
    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);
    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);
    cudaStreamWaitEvent(csStream3, ceEvStopCpyDev1, 0);
    cudaEventRecord(ceEvStartKer1, csStream3);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream3>>>(d_A, d_A, d_C, N);
    cudaEventRecord(ceEvStopKer1, csStream3);
    cudaStreamWaitEvent(csStream4, ceEvStopCpyDev2, 0);
    cudaEventRecord(ceEvStartKer2, csStream4);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream4>>>(d_A2, d_A2, d_C2, N);
    cudaEventRecord(ceEvStopKer2, csStream4);
    cudaStreamWaitEvent(csStream1, ceEvStopKer1, 0);
    cudaEventRecord(ceEvStartCpyHst1, csStream1);
    cudaMemcpyAsync(h_A, d_C, size, cudaMemcpyDeviceToHost, csStream1);
    cudaEventRecord(ceEvStopCpyHst1, csStream1);
    cudaStreamWaitEvent(csStream2, ceEvStopKer2, 0);
    cudaEventRecord(ceEvStartCpyHst2, csStream2);
    cudaMemcpyAsync(h_A2, d_C2, size, cudaMemcpyDeviceToHost, csStream2);
    cudaEventRecord(ceEvStopCpyHst2, csStream2);
    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();
    cudaEventElapsedTime( &fTimOverall3, ceEvStart, ceEvStop);
    printf("Scenario3 overall time = %10f\n", fTimOverall3);
    cudaStreamDestroy(csStream1);
    cudaStreamDestroy(csStream2);
    cudaStreamDestroy(csStream3);
    cudaStreamDestroy(csStream4);
    cudaFree(d_A);
    cudaFree(d_C);
    cudaFreeHost(h_A);
    cudaFree(d_A2);
    cudaFree(d_C2);
    cudaFreeHost(h_A2);
}
int main()
{
    overlap();
}
Thank you very much in advance for your time!
Answer (score 0):
(Note: I am more familiar with the Tesla-series devices and don't actually have a GT 555M to experiment with, so my results are specifically for a C2070. I don't know how many copy engines the 555M has, but I expect the issue described below is what causes the behavior you are seeing.)
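(As an aside: if you want to check how many copy engines your own device reports, a minimal sketch using the runtime API's cudaGetDeviceProperties is below; an asyncEngineCount of 1 means the device has a single copy engine, while 2 means H2D and D2H copies can themselves overlap.)
// Minimal sketch: query the number of asynchronous copy engines on device 0.
#include <stdio.h>
#include <cuda_runtime.h>
int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("%s: asyncEngineCount = %d\n", prop.name, prop.asyncEngineCount);
    return 0;
}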
The issue is the little-known fact that cudaEventRecord calls are themselves CUDA operations, and they too must be placed in one of the hardware queues before they are launched/executed. (A complicating factor is that, since a cudaEventRecord is neither a copy operation nor a compute kernel, it can actually go into any hardware queue. My understanding is that it usually goes into the same hardware queue as the preceding CUDA operation of the same stream, but since this is not specified in the documentation, the actual behavior may be device/driver dependent.)
If I may extend your notation to use 'E' for 'event record', and detail how the hardware queues are filled (similar to what is done in the 'CUDA C/C++ Streams and Concurrency' webinar), then, in your Scenario1 example, you have:
Issue order for CUDA operations:
ED1
D1
ED1
ED2
D2
ED2
ER1
R1
ER1
...
These fill the hardware queues as follows:
Hardware Queues:    copyH2D      Kernel
                    -------      ------
                    ED1       *  R1
                    D1       /   ER1
                    ED1     /    ...
                    ED2    /
                    D2    /
                    ED2  /
                    ER1 *
And you can see that R1, since it is in stream 1, will not execute until ER1 completes, which will not happen until both D1 and D2 have completed, since they are all serialized in the H2D copy queue.
Moving the cudaEventRecord ER1 up in Scenario2 avoids this, because all CUDA operations in stream 1 that precede R1 then complete before D2. This allows R1 to launch concurrently with D2.
Hardware Queues:    copyH2D      Kernel
                    -------      ------
                    ED1       *  R1
                    D1       /   ER1
                    ED1     /    ...
                    ER1    *
                    ED2
                    D2
                    ED2
In your Scenario3, the ER1 is replaced by an ER3. Since this is the first operation in stream 3, it can go anywhere and (my guess) goes into the Kernel or D2H copy queue, from which it could launch immediately (were it not for the
cudaStreamWaitEvent(csStream3, ceEvStopCpyDev1, 0);
that you use to synchronize with stream 1), so it does not cause false serialization of D2.
Hardware Queues:    copyH2D      Kernel
                    -------      ------
                    ED1       *  ER3
                    D1       /   R3
                    ED1     *    ER3
                    ED2          ...
                    D2
                    ED2
One final comment: you should note that the upcoming Kepler GK110 (Tesla K20) devices make significant improvements in reducing this kind of false serialization by providing 32 hardware queues. See the GK110 whitepaper (page 17) for details.
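(A purely illustrative note for readers on Kepler-or-newer hardware with a CUDA release newer than the 4.1 used above: my understanding is that the number of hardware connections the driver actually uses can be tuned with the CUDA_DEVICE_MAX_CONNECTIONS environment variable, which must be set before the CUDA context is created. A rough sketch, not applicable to the CUDA 4.1 setup in the question:)
// Rough sketch (assumes a Kepler-or-newer GPU and a CUDA release that honors
// CUDA_DEVICE_MAX_CONNECTIONS).
#include <stdlib.h>
#include <cuda_runtime.h>
int main()
{
    setenv("CUDA_DEVICE_MAX_CONNECTIONS", "32", 1); // request more hardware work queues
    cudaFree(0); // first CUDA call creates the context after the variable is set
    // ... issue work in multiple streams here ...
    return 0;
}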
Hope this helps.