我有一段代码,除了普通的一维数组之外,还需要用到二维和三维对象。我应该对二维对象使用 cudaMallocPitch、对三维对象使用 cudaMalloc3D,还是可以对两者都使用 cudaMalloc3D?对于二维对象,使用 cudaMallocPitch 相比使用 cudaMalloc3D 是否有任何性能优势?
答案 0 :(得分:3)
正如上面的评论中所建议的,我根据编程指南中的代码片段构建了一个示例,代码如下。在该代码中,我分别使用 cudaMallocPitch 和 cudaMalloc3D 来处理二维对象。
目前,我已在笔记本电脑的显卡(GeForce GT 540M)上运行了该算法。对于 256x256 的对象,两者的耗时大致相同,cudaMalloc3D 仅略高一些:cudaMallocPitch 的耗时约为 24ms,cudaMalloc3D 的耗时约为 24.5ms 至 25ms。
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<conio.h>
// Thread-block extents used by main() to build the kernel launch configuration.
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16
// Dimensions of the host test array, which main() declares as a[N][M].
#define N 256
#define M 256
/*****************/
/* CUDA MEMCHECK */
/*****************/
// Wraps a CUDA runtime call and reports a failure together with its call site.
// The do/while(0) form makes the macro expand to exactly one statement, so
// `if (c) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site; it receives __FILE__, which is a
//           string literal, so the parameter must be pointer-to-const
//   line  - source line of the call site
//   abort - when true (the default), waits for a key press and then terminates
//           the process using `code` as the exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Returns a / b, plus one when the division leaves a remainder.
// For positive operands this is the number of size-b chunks needed to cover a items.
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}
/******************/
/* TEST KERNEL 2D */
/******************/
// Squares, in place, every element of a pitched 2D float array that has
// N rows of M elements each.
//   d_a   - base device pointer of the pitched allocation
//   pitch - distance in bytes between the starts of consecutive rows
// Expected launch: a 2D grid whose x extent covers the M columns and whose
// y extent covers the N rows; threads outside the array do nothing.
__global__ void test_kernel_2D(float* d_a, size_t pitch)
{
    const int col = blockIdx.x*blockDim.x+threadIdx.x;
    const int row = blockIdx.y*blockDim.y+threadIdx.y;
    if ((col<M) && (row<N)) {
        // The pitch is the stride between rows, so it must scale the row index
        // (bounded by N). Taking the column from the x index also makes
        // neighbouring threads touch neighbouring floats of the same row.
        float* row_a = (float*)((char*)d_a + row*pitch);
        row_a[col] = row_a[col] * row_a[col];
    }
}
/******************/
/* TEST KERNEL 3D */
/******************/
// Squares, in place, every element of a single-slice pitched allocation that
// has N rows of M floats each.
//   devPitchedPtr - pitched pointer describing the allocation; only its
//                   `ptr` and `pitch` members are read
// Expected launch: a 2D grid whose x extent covers the M columns and whose
// y extent covers the N rows; threads outside the array do nothing.
__global__ void test_kernel_3D(cudaPitchedPtr devPitchedPtr)
{
    const int col = blockIdx.x*blockDim.x+threadIdx.x;
    const int row = blockIdx.y*blockDim.y+threadIdx.y;
    char* devPtr = (char*)devPitchedPtr.ptr;
    if ((col<M) && (row<N)) {
        // The pitch is the stride between rows, so it must scale the row index
        // (bounded by N). Taking the column from the x index also makes
        // neighbouring threads touch neighbouring floats of the same row.
        float* rowPtr = (float*)(devPtr + row*devPitchedPtr.pitch);
        rowPtr[col] = rowPtr[col] * rowPtr[col];
    }
}
/********/
/* MAIN */
/********/
// Times two ways of handling the same N x M float array on the device:
//   1) cudaMallocPitch + cudaMemcpy2D + test_kernel_2D
//   2) cudaMalloc3D (depth 1) + cudaMemcpy3D + test_kernel_3D
// Each measurement covers allocation, host->device copy and kernel execution.
// Returns 0 on success; any CUDA failure is reported and handled by gpuErrchk.
int main()
{
    // static storage keeps the N*M float buffer off the stack
    static float a[N][M];
    float *d_a;
    size_t pitch;
    for (int i=0; i<N; i++)
        for (int j=0; j<M; j++) {
            a[i][j] = 3.f;
            //printf("row %i column %i value %f \n",i,j,a[i][j]);
        }

    float time;
    cudaEvent_t start, stop;
    // One pair of events is created here and reused for both measurements.
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // --- 2D pitched allocation and host->device memcopy
    gpuErrchk(cudaEventRecord(start, 0));
    gpuErrchk(cudaMallocPitch(&d_a,&pitch,M*sizeof(float),N));
    gpuErrchk(cudaMemcpy2D(d_a,pitch,a,M*sizeof(float),M*sizeof(float),N,cudaMemcpyHostToDevice));
    // The grid is sized with BLOCKSIZE_x along x and BLOCKSIZE_y along y, so the
    // block must use the same order or part of the array would not be covered
    // whenever the two block sizes differ.
    dim3 GridSize1(iDivUp(M,BLOCKSIZE_x),iDivUp(N,BLOCKSIZE_y));
    dim3 BlockSize1(BLOCKSIZE_x,BLOCKSIZE_y);
    test_kernel_2D<<<GridSize1,BlockSize1>>>(d_a,pitch);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    printf("Elapsed time 2D: %3.1f ms \n", time);
    //gpuErrchk(cudaMemcpy2D(a,M*sizeof(float),d_a,pitch,M*sizeof(float),N,cudaMemcpyDeviceToHost));
    //for (int i=0; i<N; i++) for (int j=0; j<M; j++) printf("row %i column %i value %f\n",i,j,a[i][j]);

    // --- 3D pitched allocation and host->device memcopy
    gpuErrchk(cudaEventRecord(start, 0));
    cudaExtent extent = make_cudaExtent(M * sizeof(float), N, 1);
    cudaPitchedPtr devPitchedPtr;
    gpuErrchk(cudaMalloc3D(&devPitchedPtr, extent));
    cudaMemcpy3DParms p = { 0 };
    p.srcPtr.ptr = a;
    p.srcPtr.pitch = M * sizeof(float);
    p.srcPtr.xsize = M;
    p.srcPtr.ysize = N;
    p.dstPtr.ptr = devPitchedPtr.ptr;
    p.dstPtr.pitch = devPitchedPtr.pitch;
    p.dstPtr.xsize = M;
    p.dstPtr.ysize = N;
    p.extent.width = M * sizeof(float);
    p.extent.height = N;
    p.extent.depth = 1;
    p.kind = cudaMemcpyHostToDevice;
    gpuErrchk(cudaMemcpy3D(&p));
    dim3 GridSize2(iDivUp(M,BLOCKSIZE_x),iDivUp(N,BLOCKSIZE_y));
    dim3 BlockSize2(BLOCKSIZE_x,BLOCKSIZE_y);
    test_kernel_3D<<<GridSize2,BlockSize2>>>(devPitchedPtr);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    // This measurement belongs to the cudaMalloc3D path, so label it 3D.
    printf("Elapsed time 3D: %3.1f ms \n", time);

    // --- device->host copy of the 3D result (same descriptor, source and destination swapped)
    p.srcPtr.ptr = devPitchedPtr.ptr;
    p.srcPtr.pitch = devPitchedPtr.pitch;
    p.dstPtr.ptr = a;
    p.dstPtr.pitch = M * sizeof(float);
    p.kind = cudaMemcpyDeviceToHost;
    gpuErrchk(cudaMemcpy3D(&p));
    //for (int i=0; i<N; i++) for (int j=0; j<M; j++) printf("row %i column %i value %f\n",i,j,a[i][j]);

    // --- release device memory and timing events
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(devPitchedPtr.ptr));
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    getch();
    return 0;
}