tex3D()调用始终返回零

时间:2013-07-05 16:56:22

标签: cuda textures multidimensional-array

我正在尝试从CUDA C中的3D纹理获取数据,但函数调用tex3D()始终返回零。以下是相关代码:

HOST:

// Edge length, in texels, of the cubic volume (the array is L x L x L).
#define L 64
// Texel type stored in the texture: the CUDA built-in uint2 vector type.
typedef uint2 splitspin_t;
// File-scope 3D texture reference that device code samples with tex3D().
texture<splitspin_t, 3> texRef;

// Allocate an L x L x L CUDA array whose channel format is derived from splitspin_t.
// NOTE(review): cudaChk is not defined in this snippet; presumably an error-checking wrapper.
cudaArray *arrayPointer;
cudaExtent LLLextent = make_cudaExtent(L, L, L);    
cudaChannelFormatDesc cf = cudaCreateChannelDesc<splitspin_t>();
cudaChk(cudaMalloc3DArray( &arrayPointer, &cf, LLLextent ));

// Describe a host-to-device copy covering the whole extent; zero-initialise every field first.
cudaMemcpy3DParms params = {0};
params.extent = LLLextent;
params.kind = cudaMemcpyHostToDevice;

// Source: the host buffer h, with a row pitch of L texels expressed in bytes.
params.srcPtr.ptr = h; // size L*L*L*sizeof(splitspin_t) allocated by malloc
params.srcPtr.pitch = sizeof(splitspin_t) * L;
params.srcPtr.xsize = L;
params.srcPtr.ysize = L;
params.srcPos.x = 0;
params.srcPos.y = 0;
params.srcPos.z = 0;

// Destination: the CUDA array, written starting at its origin.
params.dstArray = arrayPointer;
params.dstPos.x = 0;
params.dstPos.y = 0;
params.dstPos.z = 0;

cudaChk(cudaMemcpy3D( &params ));

// Texture sampling state: unnormalized coordinates, point filtering,
// and clamp addressing on all three dimensions; then bind the array to texRef.
texRef.normalized = 0;                     
texRef.filterMode = cudaFilterModePoint;      
texRef.addressMode[0] = cudaAddressModeClamp; 
texRef.addressMode[1] = cudaAddressModeClamp;
texRef.addressMode[2] = cudaAddressModeClamp;
cudaChk(cudaBindTextureToArray( texRef, arrayPointer, cf ));

// NOTE(review): the array is released here while texRef is still bound to it.
// No kernel launch appears before this line in the snippet; if the kernel that
// samples texRef runs afterwards, it reads from a freed array - confirm ordering.
cudaFreeArray(arrayPointer);

DEVICE:

 // Global thread index along x, y and z: index within the block plus the block's offset.
 #define GX (threadIdx.x + blockIdx.x*blockDim.x)
 #define GY (threadIdx.y + blockIdx.y*blockDim.y)
 #define GZ (threadIdx.z + blockIdx.z*blockDim.z)

 // Fetch the texel at this thread's (GX, GY, GZ) coordinate and print its .y and .x components in hex.
 // NOTE(review): %lX / %lx are long conversions, while the components of uint2 are
 // unsigned int - confirm the format specifiers match the argument types.
 printf("%lX %lx\n", tex3D(texRef, GX, GY, GZ).y, tex3D(texRef, GX, GY, GZ).x); // always prints zeros

我已经验证了h指向的内存被初始化为非零值。我还验证了cudaMemcpy3D是成功的:在第一次cudaMemcpy3D之后将h清零,再用第二次cudaMemcpy3D把数据从arrayPointer复制回h,然后检查h,其中包含与之前相同的数据。我想问题也可能出在我使用了非标准类型(uint2),但是通过typedef把splitspin_t改为其他类型也没有解决问题。

因此我怀疑问题出在cudaBindTextureToArray函数调用上,但我看不出这里有任何错误。

提前致谢。

1 个答案:

答案 0 :(得分:1)

我认为你不想这样做:

cudaFreeArray(arrayPointer);

直到您的程序(或至少是执行纹理访问的内核)完成。

如果您查看其中一个cuda样本,例如simpleCubemapTexture,您会看到典型的序列是:

  1. 创建数组
  2. 复制到设备
  3. 绑定到纹理
  4. 调用执行纹理访问的内核
  5. 释放设备数组
  6. 此外,tex3D(...).x 和 .y 返回的值属于 int 类型。如果在 printf 中使用 long 格式说明符(l),可能会得到令人费解的结果。

    以下代码在我这里可以正常运行,上述两点是我对您发布的代码所做的仅有的两处重大修改:

    #include <stdio.h>
    #include <stdlib.h>
    
    // cudaCheckErrors(msg): reads (and clears) the CUDA runtime's last error via
    // cudaGetLastError(). If it is not cudaSuccess, prints msg, the runtime's error
    // string and the call site's file/line to stderr, then terminates with exit(1).
    //   msg - C string naming the operation that was just attempted.
    // Wrapped in do { } while (0) so the macro behaves as a single statement.
    // The local is named without a leading double underscore because such
    // identifiers are reserved for the implementation.
    #define cudaCheckErrors(msg) \
        do { \
            const cudaError_t cuda_check_status_ = cudaGetLastError(); \
            if (cuda_check_status_ != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cuda_check_status_), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    
    // Edge length, in texels, of the cubic volume (the array is L x L x L).
    #define L 64
    // Texel type stored in the texture: the CUDA built-in uint2 vector type.
    typedef uint2 splitspin_t;
    // File-scope 3D texture reference; bound to a CUDA array in main() and
    // sampled in my_kernel() through tex3D().
    texture<splitspin_t, 3> texRef;
    
    // Debug kernel: reads the texel at coordinate (4, 4, 4) from texRef and prints
    // its .y component followed by its .x component in hexadecimal.
    // Takes no arguments; every launched thread prints the same line.
    __global__ void my_kernel(){
        // Fetch the texel once and reuse it for both printed components.
        const splitspin_t texel = tex3D(texRef, 4, 4, 4);
        printf("%X %x\n", texel.y, texel.x);
    }
    
    // Host driver: fills an L x L x L host buffer with a constant texel, copies it
    // into a 3D CUDA array, binds that array to texRef, launches my_kernel to read
    // one texel back, and only then releases the resources.
    // Returns 0 on success and 1 if the host allocation fails; any CUDA error
    // terminates the process through cudaCheckErrors.
    // With temp = {x: 16, y: 65536} the kernel prints "10000 10".
    int main(){
        const size_t texelCount = (size_t)L * L * L;

        // Constant texel value written to every element of the volume.
        splitspin_t temp;
        temp.x = 16;
        temp.y = 65536;

        splitspin_t *h = (splitspin_t *)malloc(texelCount * sizeof(splitspin_t));
        if (h==0) {printf("malloc fail\n"); return 1;}
        for (size_t i = 0; i < texelCount; i++)
            h[i] = temp;

        // Allocate the 3D CUDA array that will back the texture.
        cudaArray *arrayPointer;
        cudaExtent LLLextent = make_cudaExtent(L, L, L);
        cudaChannelFormatDesc cf = cudaCreateChannelDesc<splitspin_t>();
        cudaMalloc3DArray( &arrayPointer, &cf, LLLextent );
        cudaCheckErrors("cudaMalloc3DArray");

        // Describe a host-to-device copy of the whole extent; all fields start zeroed.
        cudaMemcpy3DParms params = {0};
        params.extent = LLLextent;
        params.kind = cudaMemcpyHostToDevice;

        params.srcPtr.ptr = h; // size L*L*L*sizeof(splitspin_t) allocated by malloc
        params.srcPtr.pitch = sizeof(splitspin_t) * L;
        params.srcPtr.xsize = L;
        params.srcPtr.ysize = L;
        params.srcPos.x = 0;
        params.srcPos.y = 0;
        params.srcPos.z = 0;

        params.dstArray = arrayPointer;
        params.dstPos.x = 0;
        params.dstPos.y = 0;
        params.dstPos.z = 0;

        cudaMemcpy3D( &params );
        cudaCheckErrors("cudaMemcpy3D");

        // Unnormalized coordinates, point filtering, clamp addressing on all axes.
        texRef.normalized = 0;
        texRef.filterMode = cudaFilterModePoint;
        texRef.addressMode[0] = cudaAddressModeClamp;
        texRef.addressMode[1] = cudaAddressModeClamp;
        texRef.addressMode[2] = cudaAddressModeClamp;
        cudaBindTextureToArray( texRef, arrayPointer, cf );
        cudaCheckErrors("cudaBind");

        // The array must stay allocated until the kernel that samples it has
        // finished, so synchronize before any cleanup.
        my_kernel<<<1,1>>>();
        cudaDeviceSynchronize();
        cudaCheckErrors("kernel");

        // Cleanup: unbind the texture before freeing its backing array, then
        // release the host buffer.
        cudaUnbindTexture(texRef);
        cudaCheckErrors("cudaUnbindTexture");
        cudaFreeArray(arrayPointer);
        cudaCheckErrors("cudaFreeArray");
        free(h);
        return 0;
    }
    

    当我编译并运行它时,我得到的打印输出是:

    $ ./t192
    
    10000 10
    

    我认为这是正确的。