在CUDA 5.0中,NVIDIA添加了一个“纹理对象”(cudaTextureObject_t
),使纹理更容易使用。以前,有必要将纹理定义为全局变量。
我使用cudaTextureObject_t
关注了this NVIDIA example。它适用于1D情况。我试图扩展这个例子来处理2D音调内存:
#define WIDTH 6
#define HEIGHT 2
int width = WIDTH; int height = HEIGHT;
float h_buffer[12] = {1,2,3,4,5,6,7,8,9,10,11,12};
float* d_buffer;
size_t pitch;
cudaMallocPitch(&d_buffer, &pitch, sizeof(float)*width, height);
cudaMemcpy2D(d_buffer, pitch, &h_buffer, sizeof(float)*width, sizeof(float)*width, height, cudaMemcpyHostToDevice);
printf("pitch = %d \n", pitch);
//CUDA 5 texture objects: https://developer.nvidia.com/content/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_buffer;
resDesc.res.pitch2D.pitchInBytes = pitch;
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.desc.f = cudaChannelFormatKindFloat;
resDesc.res.pitch2D.desc.x = 32; // bits per channel
resDesc.res.pitch2D.desc.y = 32;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
为了查看数据是否确实可以通过纹理缓存访问,我在这个内核中打印了几个字节:
__global__ void printGpu_tex(cudaTextureObject_t tex) {
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;
if(tidx < WIDTH && tidy < HEIGHT){
float x = tex2D<float>(tex, tidy, tidx);
printf("tex2D<float>(tex, %d, %d) = %f \n", tidy, tidx, x);
}
}
我预计这个输出是“1,2,3,......,12”。但是,它打印“1,7,7,7,... 3,9,......”:
tex2D<float>(tex, 0, 0) = 1.000000
tex2D<float>(tex, 0, 1) = 7.000000
tex2D<float>(tex, 0, 2) = 7.000000
tex2D<float>(tex, 0, 3) = 7.000000
tex2D<float>(tex, 0, 4) = 7.000000
tex2D<float>(tex, 0, 5) = 7.000000
tex2D<float>(tex, 1, 0) = 3.000000
tex2D<float>(tex, 1, 1) = 9.000000
tex2D<float>(tex, 1, 2) = 9.000000
tex2D<float>(tex, 1, 3) = 9.000000
tex2D<float>(tex, 1, 4) = 9.000000
tex2D<float>(tex, 1, 5) = 9.000000
为了验证d_buffer
数据是否设置正确,我还为原始d_buffer
数组创建了“打印内核”,而不使用纹理缓存:
__global__ void printGpu_vanilla(float* d_buffer, int pitch) {
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;
if(tidx < WIDTH && tidy < HEIGHT){
float x = d_buffer[tidy*pitch + tidx];
printf("d_buffer[%d][%d] = %f \n", tidy, tidx, x);
}
}
输出看起来不错(与纹理缓存版本不同):
d_buffer[0][0] = 1.000000
d_buffer[0][2] = 2.000000
d_buffer[0][3] = 3.000000
d_buffer[0][4] = 4.000000
d_buffer[0][5] = 5.000000
d_buffer[0][5] = 6.000000
d_buffer[1][0] = 7.000000
d_buffer[1][6] = 8.000000
d_buffer[1][7] = 9.000000
d_buffer[1][8] = 10.000000
d_buffer[1][9] = 11.000000
d_buffer[1][5] = 12.000000
关于纹理缓存版本可能出现什么问题的任何想法?
下载:
答案 0 :(得分:3)
cudaChannelFormatDesc
resDesc.res.pitch2D.desc
中的y
错误:0
应为FormatDesc
。
设置CreateChannelDesc<>()
正确使用resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
功能,例如resDesc.res.pitch2D.desc.y = 32
,而不是手动设置。
float2
对{{1}}纹理有效。
答案 1 :(得分:-1)
除了cudaChannelFormatDesc
之外,似乎您的代码中有一个逻辑问题,这没什么大不了的,但是如果您不谨慎的话,可能会引起误解。如果您想按照CUDA线程组织成块和网格并安排换行方式(此外,如果您希望代码与C ++中“行优先”的概念保持一致),最好考虑将x
作为变化最快的维度(类似于行专业)。由于您的代码显示y
的变化速度快于x
,因此更正确的方法是切换代码中的索引:
float x = tex2D<float>(tex, tidx, tidy);
printf("tex2D<float>(tex, %d, %d) = %f \n", tidx, tidy, x);
...
printf("d_buffer[%d][%d] = %f \n", tidx, tidy, x);
再次值得一提的是,这不是一个大问题,但与此同时可能会造成很大的混乱,尤其是当您要将此内核与代码的其他部分集成时。