这是执行CUDA时间的标准方式:
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// Something to be timed
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf ("Time: %f ms\n", time);
在CUDA simpleP2P
(点对点)示例中,以这种方式执行计时:
cudaEvent_t start, stop;
float time;
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&start,eventflags);
cudaEventCreateWithFlags(&stop,eventflags);
cudaEventRecord(start,0);
// Something to be timed
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
我的问题是:
cudaEventCreateWithFlags
使用cudaEventBlockingSync
执行了时间安排?感谢。
答案 0 :(得分:2)
差不多三年后,我回答了自己的问题。
为此,我将在Concurrency in CUDA multi-GPU executions中考虑我的示例,其中强调了如何使用异步副本实现真正的多GPU并发。特别是,我会考虑该帖子的测试用例#8 。
为了清楚起见,此处报告了测试用例#8的完整代码以及分析器时间表。
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
T *d_data;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Breadth-first" approach - async
double *inputMatrices; gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
定时异步副本 - 并发性被破坏
现在,让我们从异步副本的计时开始。一种可行的方法是使用以下代码段:
float time[numGPUs];
cudaEvent_t start[numGPUs], stop[numGPUs];
// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time: %3.1f ms \n", time[k]);
不幸的是,这种计时方式破坏了并发性,因为可以从下面的探查器时间线中理解:
定时异步副本 - 保留并发
为避免此问题,可能会将GPU任务作为OpenMP线程启动,如下所示:
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;
// --- "Breadth-first" approach - async
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
unsigned int k = omp_get_thread_num();
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
printf("Thread nr. %i; Elapsed time: %3.1f ms \n", k, time[k]);
}
从profiler时间轴可以看出,并发性得以保留。
定时内核启动 - 并发性被破坏
在计时内核启动时会发生同样的情况。使用以下代码段,并发性将被破坏。
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time: %3.1f ms \n", time[k]);
启动内核的时间 - 保留并发
与上述相反,使用OpenMP,可以保留并发性。
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
unsigned int k = omp_get_thread_num();
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
printf("Thread nr. %i; Elapsed time: %3.1f ms \n", k, time[k]);
}