我正在尝试进行一些基准测试,以确保使用CUDA的统一内存(UM)方法不会损害我们的性能。
我正在执行FFT。我使用UM的一种方式,一种方式我使用cudaMalloc
我比较之后的结果并且它们都匹配(这很好)。
然而,我获得UM方法的时间约为.5ms,而cudaMalloc方式为〜.04(执行多次平均运行后)
我正在使用事件记录来完成计时。我在cufftExecC2C电话之前和之后都有一个。
此外,我添加了两个事件记录来测量任何内存传输到设备之前的时间,并在使用数据之后将其从设备中恢复。
执行此操作时,我看到UM方法需要大约1.6毫秒,而cudaMalloc方法需要大约.7。
以下是执行UM方法的代码片段:
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 1);
cudaDeviceSynchronize();
cudaEventRecord(start_kernel);
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
cudaFree(outData);
cudaFree(inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
以下是cudaMalloc方法
cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
cudaMemcpyDefault);
cudaEventRecord(stop_after_memDtoH);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
cudaFree(outData);
cudaFree(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
使用统一内存方法加速时,我还能做些什么吗?我预计UM会变慢,但不是这么多。
我们在使用Cuda 9的redhat 7.3上使用P100
答案 0 :(得分:2)
您发布的代码存在的一个问题是,您没有对来自FFT的输出数据执行cudaMemPrefetchAsync
。根据我的测试,这会产生显着的差异。您的代码还存在其他一些问题,例如我们不会在分配有cudaFree
的指针上调用malloc
。
这是一个围绕您所展示的内容构建的完整代码。当我在CentOS7.4,CUDA 9.1,Tesla P100上运行时,在可管理存储器情况下(3.52ms)执行FFT与在非管理存储器情况下执行的FFT(3.45ms)相比,我获得了相似的时间: / p>
$ cat t43.cu
#include <cufft.h>
#include <iostream>
#include <string>
//using namespace std;
const int dataSize = 1048576*32;
void setupWave(const int ds, cufftComplex *d){
for (int i = 0; i < ds; i++){
d[i].x = 1.0f;
d[i].y = 0.0f;}
}
int main(){
cufftComplex *inData, *outData;
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 0);
cudaMemPrefetchAsync(outData, dataSize * sizeof(cufftComplex), 0);
cudaDeviceSynchronize();
cudaEventRecord(start_kernel);
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
std::string resultString_um = std::to_string(dataSize) + " samples took " + std::to_string(umTime) + "ms, Overall: " + std::to_string(overallUmTime) + "\n";
std::cout << resultString_um;
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaFree(inData);
cudaFree(outData);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
//cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
//cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
// stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
cudaMemcpyDefault);
sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for non-UM is " << sum << std::endl;
//float umTime = 0;
//float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um = std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
std::cout << resultString_um;
free(outData);
free(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
}
$ nvcc -std=c++11 -arch=sm_60 -o t43 t43.cu -lcufft
$ ./t43
sum for UM is 3.35544e+07
33554432 samples took 3.520640ms, Overall: 221.909988
sum for non-UM is 3.35544e+07
33554432 samples took 3.456160ms, Overall: 278.099426
$