I'm trying to learn how to use texture memory by binding a texture to a linear device array (rather than to a cudaArray). My code is simple (below): I have a float* array of 8 numbers that I try to bind to a 1D texture, and then in my kernel I try to read the texture and write each value into an output array. But when I run this test, every value in the output array is zero:

Input = 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000
Output = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

What am I missing here?
#include <stdio.h>
#include <helper_cuda.h>

texture<float, 1, cudaReadModeElementType> texInput;

__global__ void copyKernel(float *output, int n) {
    for (int i = 0; i < n; i++) {
        output[i] = tex1D(texInput, (float)i);
    }
}

int main(int argc, char *argv[]) {
    const int WIDTH = 8;
    float *hInput  = (float *)malloc(sizeof(float) * WIDTH);
    float *hOutput = (float *)malloc(sizeof(float) * WIDTH);
    for (int i = 0; i < WIDTH; i++) {
        hInput[i] = (float)i;
    }

    float *dInput = NULL, *dOutput = NULL;
    size_t offset = 0;

    texInput.addressMode[0] = cudaAddressModeBorder;
    texInput.addressMode[1] = cudaAddressModeBorder;
    texInput.filterMode = cudaFilterModePoint;
    texInput.normalized = false;

    checkCudaErrors(cudaMalloc((void **)&dInput,  sizeof(float) * WIDTH));
    checkCudaErrors(cudaMalloc((void **)&dOutput, sizeof(float) * WIDTH));
    cudaMemcpy(dInput, hInput, sizeof(float) * WIDTH, cudaMemcpyHostToDevice);
    cudaBindTexture(&offset, texInput, dInput, sizeof(float) * WIDTH);

    copyKernel<<<1,1>>>(dOutput, WIDTH);

    cudaMemcpy(hOutput, dOutput, sizeof(float) * WIDTH, cudaMemcpyDeviceToHost);

    printf("\nInput = ");
    for (int i = 0; i < WIDTH; i++) {
        printf("%f\t", hInput[i]);
    }
    printf("\nOutput = ");
    for (int i = 0; i < WIDTH; i++) {
        printf("%f\t", hOutput[i]);
    }
    return 0;
}
Answer (score: 2)
According to the documentation, tex1D() is used when the underlying allocation is a CUDA array. For a texture bound to linear memory, the correct texture function is tex1Dfetch().
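In the kernel that amounts to a one-line change, sketched here with the same texInput and loop index as in the listings:

    output[i] = tex1D(texInput, (float)i);   // expects a cudaArray-backed texture; reads come back as 0.0f here (as seen above)
    output[i] = tex1Dfetch(texInput, i);     // integer-indexed fetch for a texture bound to linear memory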
With that modification (only) to your code, it works for me:
$ cat t1139.cu
#include <stdio.h>
#include <helper_cuda.h>

texture<float, 1, cudaReadModeElementType> texInput;

__global__ void copyKernel(float *output, int n) {
    for (int i = 0; i < n; i++) {
        output[i] = tex1Dfetch(texInput, i);
    }
}

int main(int argc, char *argv[]) {
    const int WIDTH = 8;
    float *hInput  = (float *)malloc(sizeof(float) * WIDTH);
    float *hOutput = (float *)malloc(sizeof(float) * WIDTH);
    for (int i = 0; i < WIDTH; i++) {
        hInput[i] = (float)i;
    }

    float *dInput = NULL, *dOutput = NULL;
    size_t offset = 0;

    texInput.addressMode[0] = cudaAddressModeBorder;
    texInput.addressMode[1] = cudaAddressModeBorder;
    texInput.filterMode = cudaFilterModePoint;
    texInput.normalized = false;

    checkCudaErrors(cudaMalloc((void **)&dInput,  sizeof(float) * WIDTH));
    checkCudaErrors(cudaMalloc((void **)&dOutput, sizeof(float) * WIDTH));
    cudaMemcpy(dInput, hInput, sizeof(float) * WIDTH, cudaMemcpyHostToDevice);
    cudaBindTexture(&offset, texInput, dInput, sizeof(float) * WIDTH);

    copyKernel<<<1,1>>>(dOutput, WIDTH);

    cudaMemcpy(hOutput, dOutput, sizeof(float) * WIDTH, cudaMemcpyDeviceToHost);

    printf("\nInput = ");
    for (int i = 0; i < WIDTH; i++) {
        printf("%f\t", hInput[i]);
    }
    printf("\nOutput = ");
    for (int i = 0; i < WIDTH; i++) {
        printf("%f\t", hOutput[i]);
    }
    return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1139.cu -o t1139
$ cuda-memcheck ./t1139
========= CUDA-MEMCHECK
Input = 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000
Output = 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000
========= ERROR SUMMARY: 0 errors
$
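Side note (beyond the original answer): the texture<...> reference API used above is deprecated and was removed in CUDA 12, so on a current toolkit the same linear-memory read goes through a texture object instead. Below is a minimal, untested sketch of that variant; copyKernelObj and the descriptor setup are illustrative, not part of the original code:

#include <stdio.h>
#include <string.h>

// Kernel receives the texture object as a parameter instead of using a global texture reference.
__global__ void copyKernelObj(cudaTextureObject_t tex, float *output, int n) {
    for (int i = 0; i < n; i++) {
        output[i] = tex1Dfetch<float>(tex, i);   // integer index, linear memory
    }
}

int main() {
    const int WIDTH = 8;
    float hInput[WIDTH], hOutput[WIDTH];
    for (int i = 0; i < WIDTH; i++) hInput[i] = (float)i;

    float *dInput = NULL, *dOutput = NULL;
    cudaMalloc((void **)&dInput,  sizeof(float) * WIDTH);
    cudaMalloc((void **)&dOutput, sizeof(float) * WIDTH);
    cudaMemcpy(dInput, hInput, sizeof(float) * WIDTH, cudaMemcpyHostToDevice);

    // Describe the linear allocation that backs the texture.
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType                = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr      = dInput;
    resDesc.res.linear.desc        = cudaCreateChannelDesc<float>();
    resDesc.res.linear.sizeInBytes = sizeof(float) * WIDTH;

    // Element-type reads; filtering and addressing modes are irrelevant for tex1Dfetch.
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);

    copyKernelObj<<<1,1>>>(tex, dOutput, WIDTH);
    cudaMemcpy(hOutput, dOutput, sizeof(float) * WIDTH, cudaMemcpyDeviceToHost);

    printf("Output =");
    for (int i = 0; i < WIDTH; i++) printf(" %f", hOutput[i]);
    printf("\n");

    cudaDestroyTextureObject(tex);
    cudaFree(dInput);
    cudaFree(dOutput);
    return 0;
}

The texture object is just a handle passed by value to the kernel, so no global texture state and no cudaBindTexture call are needed.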