如何在CUDA中为1D数组使用纹理内存

时间:2016-10-01 18:59:52

标签: cuda

我编写了下面的代码,想了解如何为1D数组使用纹理内存。但是tex1D函数没有从数组中取到与线程id对应的值。请更正此代码,并告诉我如何高效地为1D数组使用纹理内存。

// Kernel: each thread samples the 1D texture once and stores the fetched
// value in b[threadIdx.x]. Only threadIdx.x is used for indexing, so every
// block of a launch would address the same output elements.
// NOTE(review): the raw thread index is passed as the texture coordinate,
// while main() in this listing sets texDesc.normalizedCoords = 1.
__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    // Sample the texture at coordinate threadIdx.x and write the result out.
    b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
    // Disabled debug print of the value fetched for this thread.
    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }
    // Host driver: fills a 5-element float array with 0..4, uploads it into a
    // CUDA array, wraps that array in a texture object, launches sum<<<1,5>>>
    // and prints five elements of the host buffer b.
    // None of the CUDA runtime return codes are checked in this listing.
    int main()
    {
    float *a,*b;
    // d_a is declared but never used below.
    float *d_a,*d_b;
    int i;
    // Host buffers of 5 floats each; they are not freed before returning.
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    // Input data: a[i] = i.
    for(i=0;i<5;i++)
        a[i]=i;

    // Channel format: a single 32-bit float component.
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    // CUDA array of width 5 and height 0 that backs the texture.
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    // Upload the 5 host floats into the CUDA array.
    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    // Resource descriptor: the texture reads from cuArray.
    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      // Texture descriptor: wrap addressing, linear filtering, element-type
      // reads, and normalized coordinates enabled.
      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    // Device output buffer of 5 floats.
    cudaMalloc(&d_b, 5* sizeof(float));

    // One block of 5 threads; each thread writes one element of d_b.
    sum<<<1,5>>>(d_b,texObj);



        // Copy the result back to the host.
        // NOTE(review): the size argument is sizeof(float), i.e. one element,
        // while the loop below prints five elements of b.
    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);

     // Print five values separated by tabs.
     for(i=0;i<5;i++)
        printf("%f\t",b[i]);
      // Release the texture object, the CUDA array and the device buffer.
      cudaDestroyTextureObject(texObj); 
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }

1 个答案:

答案 0 :(得分:2)

至少有两个问题:

  1. 您在最后只把一个float值从设备复制回了主机:

    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                     ^^^^^^^^^^^^^
    

    如果要打印5个值,则应复制5个值:

    cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);
    
  2. 您选择了归一化坐标(normalized coordinates):

    texDesc.normalizedCoords = 1;
    

    这意味着您应该将0到1之间的浮点坐标作为索引,而不是0到4之间的整数坐标:

     b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                        ^^^^^^^^^^^
    

    使用类似的东西:

     b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));
    
  3. 做了这些修改之后,我得到了合理的结果。下面是一份完整的可运行代码:

    $ cat t3.cu
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    // Samples the 1D texture at coordinate (threadIdx.x + 1) / 5 and stores the
    // fetched value in b[threadIdx.x], one output element per thread.
    // Only threadIdx.x is used for indexing, so the kernel is written for a
    // single-block launch; b must hold at least blockDim.x floats.
    __global__ void sum(float *b, cudaTextureObject_t texObj)
    {
        const unsigned int lane = threadIdx.x;
        // The divisor 5.0f is the number of texels uploaded by main().
        const float coord = (float)(lane + 1) / 5.0f;
        b[lane] = tex1D<float>(texObj, coord);
    }
    
    
    // Reports a failed CUDA runtime call and terminates the program.
    //   status: return code of the call being checked
    //   what:   short label printed in front of the CUDA error string
    static void checkCuda(cudaError_t status, const char *what)
    {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    // Host driver: uploads the values 0..4 into a 1D CUDA array, binds it to a
    // texture object (linear filtering, normalized coordinates), launches sum()
    // with one thread per output sample and prints the fetched values on one
    // tab-separated line. Returns 0 on success; exits with a non-zero status if
    // a host allocation or any CUDA call fails.
    int main()
    {
        const int numTexels  = 5;              // must match the 5.0f divisor in sum()
        const int numSamples = numTexels - 1;  // one sample between each pair of texels
        const size_t texelBytes  = sizeof(float) * numTexels;
        const size_t sampleBytes = sizeof(float) * numSamples;

        float *a = (float*)malloc(texelBytes);
        float *b = (float*)malloc(sampleBytes);
        if (a == NULL || b == NULL) {
            fprintf(stderr, "host allocation failed\n");
            free(a);
            free(b);
            return 1;
        }

        for (int i = 0; i < numTexels; i++)
            a[i] = (float)i;

        // Single 32-bit float channel.
        cudaChannelFormatDesc channelDesc =
            cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

        cudaArray *cuArray = NULL;
        checkCuda(cudaMallocArray(&cuArray, &channelDesc, numTexels, 0),
                  "cudaMallocArray");

        // cudaMemcpyToArray is deprecated; copy the data as a single row instead.
        checkCuda(cudaMemcpy2DToArray(cuArray, 0, 0, a, texelBytes, texelBytes, 1,
                                      cudaMemcpyHostToDevice),
                  "cudaMemcpy2DToArray");

        // The texture reads from cuArray.
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;

        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        checkCuda(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL),
                  "cudaCreateTextureObject");

        float *d_b = NULL;
        checkCuda(cudaMalloc(&d_b, sampleBytes), "cudaMalloc");

        // One block, one thread per output sample.
        sum<<<1, numSamples>>>(d_b, texObj);
        checkCuda(cudaGetLastError(), "sum kernel launch");

        // Copy back exactly the elements the kernel wrote; this blocking copy
        // also surfaces any error raised while the kernel was running.
        checkCuda(cudaMemcpy(b, d_b, sampleBytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy");

        for (int i = 0; i < numSamples; i++)
            printf("%f\t", b[i]);
        printf("\n");

        // Release device resources, then the host buffers.
        checkCuda(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject");
        checkCuda(cudaFreeArray(cuArray), "cudaFreeArray");
        checkCuda(cudaFree(d_b), "cudaFree");
        free(a);
        free(b);

        return 0;
    }
    $ nvcc -arch=sm_61 -o t3 t3.cu
    $ cuda-memcheck ./t3
    ========= CUDA-MEMCHECK
    0.500000        1.500000        2.500000        3.500000
    ========= ERROR SUMMARY: 0 errors
    $
    

    请注意,我还做了一些其他改动。具体来说,我调整了采样点的位置和数量,使每个采样点都落在您的5个数据点(0,1,2,3,4)中相邻两点的正中间,并在那里做线性插值,因此总共输出4个值(0.5,1.5,2.5,3.5),即5个数据点之间的4个中点。

    如果您想进一步了解归一化坐标索引以及边界模式(border mode)等其他概念,请参阅CUDA编程指南(the programming guide)。此外,还有多个CUDA示例代码(sample codes)演示了纹理的正确用法。