CUDA:从设备内存创建 3D 纹理和 cudaArray(3D)

时间:2014-07-27 13:14:01

标签: c++ c arrays cuda textures

我试图用设备数组的一部分来创建 3D 纹理。

要做到这一点,这些是我的步骤:

  1. 分配设备数组(cudaMalloc)
  2. 写入设备数组
  3. 创建 cudaArray(3D)
  4. 将纹理绑定到 cudaArray

我的做法不会产生编译错误,但是用 cuda-memcheck 运行时,程序在尝试从纹理中读取数据时失败了:

      

    Invalid __global__ read of size 8 ... Address 0x10dfaf3a0 is out of bounds(大小为 8 的全局内存读取无效,地址 0x10dfaf3a0 越界)

    因此我猜测是我声明纹理数组的方式有误。我是这样访问纹理的:

      

    tex3D(NoiseTextures[i], x, y, z)

    我正在采取上述步骤的方式:

    1.Malloc Device Array

    // Allocate a device buffer of sqrSizeNoise*nNoise floats; the later copy loop
    // addresses it in nNoise consecutive chunks of sqrSizeNoise floats each.
    // NOTE(review): the cudaMalloc return code is not checked here.
    cudaMalloc((void **)&d_Noise, sqrSizeNoise*nNoise*sizeof(float));
    

    2.Write Device Array

    // Fill d_Noise with sqrSizeNoise*nNoise uniform random floats using a cuRAND
    // pseudo-random generator seeded with Seed, then destroy the generator.
    // NOTE(review): none of the curand* return statuses are checked here.
    curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen,Seed);
    curandGenerateUniform(gen, d_Noise, sqrSizeNoise*nNoise);
    curandDestroyGenerator(gen);
    

    3 + 4. 创建 CUDA 数组并将其绑定到纹理(我猜错误就出在这里)

    // Channel descriptor for float texels (the commented-out call is the explicit
    // 32-bit single-channel equivalent the author considered).
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();//cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    // A single SizeNoise x SizeNoise x SizeNoise cudaArray, allocated once and
    // reused as the copy destination and bind target in every loop iteration.
    // NOTE(review): the return code of cudaMalloc3DArray is not checked.
    cudaArray *d_cuArr;
    cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoise,SizeNoise,SizeNoise), 0);
    cudaMemcpy3DParms copyParams = {0};
    
    // One iteration per noise texture (nNoise = 4).
    for(int i = 0; i < nNoise; i++){
    
        // Re-initialize the texture reference: normalized coordinates, linear
        // filtering, wrap addressing, float channel format.
        NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(1,cudaFilterModeLinear,cudaAddressModeWrap,channelDesc);
    
        // Copy chunk i of d_Noise into the cudaArray.
        // +(sqrSizeNoise*i) selects the i-th sub-block of the noise buffer; the
        // source is described as tightly packed rows of SizeNoise floats.
        copyParams.srcPtr   = make_cudaPitchedPtr(d_Noise+(sqrSizeNoise*i), SizeNoise*sizeof(float), SizeNoise, SizeNoise);
        copyParams.dstArray = d_cuArr;
        copyParams.extent   = make_cudaExtent(SizeNoise,SizeNoise,SizeNoise);
        copyParams.kind     = cudaMemcpyDeviceToDevice;
        checkCudaErrors(cudaMemcpy3D(&copyParams));
        // End of array copy.
    
        // Texture parameters for this reference.
        NoiseTextures[i].normalized = true;                      // access with normalized texture coordinates
        NoiseTextures[i].filterMode = cudaFilterModeLinear;      // linear interpolation
        NoiseTextures[i].addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
        NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
        NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;
    
        // Bind the array to the 3D texture.
        // NOTE(review): every NoiseTextures[i] is bound to the same d_cuArr, so
        // after the loop all of them refer to whatever the last copy wrote.
        checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
        // End of bind.
    }
    // NOTE(review): the array is freed here while the textures are still bound to
    // it; any later fetch would presumably read released memory - confirm.
    cudaFreeArray(d_cuArr);
    

    我已将此代码段粘贴到Pastebin,因此更容易查看颜色等。 http://pastebin.com/SM3dYd38

    我希望我清楚地描述了我的问题。如果没有请评论!

    你能帮我解决这个问题吗? 谢谢你的阅读,

    Cery

    编辑: 这是一个完整的代码,您可以在自己的机器上试用它:

    #include <helper_cuda.h>  
    #include <helper_functions.h>
    #include <helper_cuda_gl.h>
    #include <texture_types.h>
    #include <cuda_runtime.h>
    #include <curand.h>
    
    // File-scope array of four 3D float texture references.
    // NOTE(review): legacy texture references are normally declared as individual
    // file-scope variables; whether an array of them is supported should be
    // confirmed against the CUDA programming guide.
    static texture<float, 3, cudaReadModeElementType> NoiseTextures[4];//texture Array
    float *d_NoiseTest;//Device Array with random floats
    // Used below as the size of each dimension of one noise volume.
    int SizeNoiseTest = 32;
    // Number of floats per noise volume (32768 = 32*32*32).
    int sqrSizeNoiseTest = 32768;
    
    // Defined after main(); allocates the noise data and sets up the textures.
    void CreateTexture();
    
    // Kernel that fetches one value from the first texture at the calling
    // thread's (threadIdx.x, threadIdx.y, threadIdx.z) position. The result is
    // stored in a local int and never used.
    // The parameter shadows the file-scope NoiseTextures array; CreateTexture()
    // launches this kernel with that file-scope array as the argument.
    // NOTE(review): this passes a pointer to host-declared texture references into
    // device code, which is presumably the source of the out-of-bounds read
    // reported by cuda-memcheck - confirm.
    __global__ void AccesTexture(texture<float, 3, cudaReadModeElementType>* NoiseTextures)
    {
            int test = tex3D(NoiseTextures[0],threadIdx.x,threadIdx.y,threadIdx.z);//by using this the error occurs
    }
    
    // Program entry point: only calls CreateTexture(), which also launches the
    // test kernel. Command-line arguments are unused and no value is returned
    // explicitly.
    int
    main(int argc, char **argv)
    {
            CreateTexture();
    }
    // Allocates and fills the random noise buffer, copies it chunk by chunk into
    // a 3D cudaArray, binds each texture reference in NoiseTextures to that array,
    // frees the array and finally launches the AccesTexture test kernel.
    void CreateTexture()
    {
            //curand Random Generator (needs compiler link -lcurand)
            // NOTE(review): the cudaMalloc and curand* return codes below are not checked.
            curandGenerator_t gen;
            cudaMalloc((void **)&d_NoiseTest, sqrSizeNoiseTest*4*sizeof(float));//Allocation of device Array
            curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
            curandSetPseudoRandomGeneratorSeed(gen,1234ULL);
            curandGenerateUniform(gen, d_NoiseTest, sqrSizeNoiseTest*4);//writing data to d_NoiseTest
            curandDestroyGenerator(gen);
    
            //cudaArray Descriptor
            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
            //cuda Array
            // NOTE(review): the width is given as SizeNoiseTest*sizeof(float) (a byte
            // count) while height and depth are element counts; the CUDA runtime
            // documentation describes cudaArray extents in elements - confirm.
            cudaArray *d_cuArr;
            cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0);
            cudaMemcpy3DParms copyParams = {0};
    
            // One iteration per noise texture (four in total).
            for(int i = 0; i < 4; i++){
    
                    // Re-initialize the texture reference: normalized coordinates,
                    // linear filtering, wrap addressing, float channel format.
                    NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(1,cudaFilterModeLinear,cudaAddressModeWrap,channelDesc);
    
                    // Copy chunk i of d_NoiseTest into the cudaArray.
                    // +(sqrSizeNoiseTest*i) selects the i-th sub-block of the noise
                    // buffer; rows are described as SizeNoiseTest tightly packed floats.
                    copyParams.srcPtr   = make_cudaPitchedPtr(d_NoiseTest+(sqrSizeNoiseTest*i), SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
                    copyParams.dstArray = d_cuArr;
                    // NOTE(review): the copy width is again SizeNoiseTest*sizeof(float),
                    // four times the SizeNoiseTest elements per row described by srcPtr;
                    // this is the call that reports cudaErrorInvalidValue.
                    copyParams.extent   = make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest);
                    copyParams.kind     = cudaMemcpyDeviceToDevice;
                    checkCudaErrors(cudaMemcpy3D(&copyParams));
                    // End of array copy.
    
                    // Texture parameters for this reference.
                    NoiseTextures[i].normalized = true;                      // access with normalized texture coordinates
                    NoiseTextures[i].filterMode = cudaFilterModeLinear;      // linear interpolation
                    NoiseTextures[i].addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
                    NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
                    NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;
    
                    // Bind the array to the 3D texture.
                    // NOTE(review): all four references are bound to the same d_cuArr.
                    checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
                    // End of bind.
            }
            // NOTE(review): the array is freed before the kernel below samples the
            // textures that are still bound to it.
            cudaFreeArray(d_cuArr);
    
            // Launch one block of 4x4x4 threads, passing the file-scope texture
            // reference array as the kernel argument; no error check follows the launch.
            AccesTexture<<<1,dim3(4,4,4)>>>(NoiseTextures);
    }
    

    编译时需要链接 -lcurand,并把 CUDA-6.0/samples/common/inc 加入头文件搜索路径。

    我现在在此代码中出现了不同的错误

      

    code=11(cudaErrorInvalidValue) "cudaMemcpy3D(&copyParams)"

    即使它与我的原始代码完全相同。 - 我开始变得完全糊涂了。 谢谢你的帮助

2 个答案:

答案 0 :(得分:4)

下面是一个可以运行的示例,演示如何创建一组纹理对象(texture object),大致沿用了你所给代码的思路。与我放在 here 的纹理引用(texture reference)版本代码相比较,可以看到:第一个纹理对象的第一组纹理读取(即第一次内核调用)在数值上与纹理引用示例中第一个纹理的读取结果相同(你可能需要调整两个示例代码的网格大小,使二者一致)。

纹理对象的使用需要3.0或更高的计算能力。

示例:

$ cat t507.cu
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <helper_cuda.h>
#include <curand.h>
#define NUM_TEX 4

// Size of each dimension of one cubic noise volume.
const int SizeNoiseTest = 32;
// Number of floats in one SizeNoiseTest^3 volume.
const int cubeSizeNoiseTest = SizeNoiseTest*SizeNoiseTest*SizeNoiseTest;
// One texture object handle per noise volume; filled in by CreateTexture().
static cudaTextureObject_t texNoise[NUM_TEX];

// Debug kernel: each thread samples the given 3D texture object at its own
// (threadIdx.x, threadIdx.y, threadIdx.z) position, converted to float
// coordinates, and prints the fetched value with device-side printf.
// Only threadIdx is used, so the kernel is meant to be launched with one block.
__global__ void AccesTexture(cudaTextureObject_t my_tex)
{
        const float x = (float)threadIdx.x;
        const float y = (float)threadIdx.y;
        const float z = (float)threadIdx.z;

        const float sample = tex3D<float>(my_tex, x, y, z);

        printf("thread: %d,%d,%d, value: %f\n", threadIdx.x, threadIdx.y, threadIdx.z, sample);
}

// Aborts with a diagnostic on stderr if a cuRAND call did not succeed.
static void checkCurandStatus(curandStatus_t status, const char *what)
{
    if (status != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "cuRAND error %d in %s\n", (int)status, what);
        exit(EXIT_FAILURE);
    }
}

// Creates NUM_TEX 3D texture objects and stores their handles in texNoise[].
//
// For each texture, a staging buffer of cubeSizeNoiseTest floats is filled with
// uniform random numbers (seed 1235 + i), copied into a freshly allocated
// SizeNoiseTest^3 float cudaArray, and a texture object is created on that
// array. The texture objects use unnormalized coordinates, linear filtering,
// clamp addressing and element-type reads.
//
// Every runtime call is checked; on failure the process exits.
//
// Ownership: the staging buffer is released before returning. The cudaArrays
// must stay alive for as long as the texture objects are used, so they are
// intentionally NOT freed here (their handles are not retained either, so they
// live until the process ends).
void CreateTexture()
{
    const cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    // Extents for a cudaArray (allocation and copies involving one) are given
    // in elements, not bytes, so the width is SizeNoiseTest rather than
    // SizeNoiseTest*sizeof(float).
    const cudaExtent volumeExtent = make_cudaExtent(SizeNoiseTest, SizeNoiseTest, SizeNoiseTest);

    // Staging buffer with random floats, reused for every texture.
    float *d_NoiseTest = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_NoiseTest, cubeSizeNoiseTest*sizeof(float)));

    for (int i = 0; i < NUM_TEX; i++) {
        // cuRAND generator (needs -lcurand); a fresh one per texture with its
        // own seed so every volume holds different noise.
        curandGenerator_t gen;
        checkCurandStatus(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT), "curandCreateGenerator");
        checkCurandStatus(curandSetPseudoRandomGeneratorSeed(gen, 1235ULL+i), "curandSetPseudoRandomGeneratorSeed");
        checkCurandStatus(curandGenerateUniform(gen, d_NoiseTest, cubeSizeNoiseTest), "curandGenerateUniform");
        checkCurandStatus(curandDestroyGenerator(gen), "curandDestroyGenerator");

        // Backing storage for texture i.
        cudaArray *d_cuArr = NULL;
        checkCudaErrors(cudaMalloc3DArray(&d_cuArr, &channelDesc, volumeExtent, 0));

        // Copy the staging buffer into the array. The source is linear memory
        // with tightly packed rows, so its pitch is the row size in bytes.
        cudaMemcpy3DParms copyParams = {0};
        copyParams.srcPtr   = make_cudaPitchedPtr(d_NoiseTest, SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
        copyParams.dstArray = d_cuArr;
        copyParams.extent   = volumeExtent;
        copyParams.kind     = cudaMemcpyDeviceToDevice;
        checkCudaErrors(cudaMemcpy3D(&copyParams));

        // Resource descriptor: the texture reads from the cudaArray.
        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArr;

        // Sampling parameters.
        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModeLinear;
        texDescr.addressMode[0] = cudaAddressModeClamp;   // clamp
        texDescr.addressMode[1] = cudaAddressModeClamp;
        texDescr.addressMode[2] = cudaAddressModeClamp;
        texDescr.readMode = cudaReadModeElementType;

        checkCudaErrors(cudaCreateTextureObject(&texNoise[i], &texRes, &texDescr, NULL));
    }

    // The staging buffer is no longer needed once all volumes are copied.
    checkCudaErrors(cudaFree(d_NoiseTest));
}

// Program entry point: builds the texture objects, launches the sampling
// kernel once for each of the first three textures with a single 2x2x2 thread
// block, then checks for launch errors and waits for the kernels to finish.
int main(int argc, char **argv)
{
        CreateTexture();

        const dim3 block(2, 2, 2);
        for (int t = 0; t < 3; ++t) {
                AccesTexture<<<1, block>>>(texNoise[t]);
        }

        checkCudaErrors(cudaPeekAtLastError());
        checkCudaErrors(cudaDeviceSynchronize());
        return 0;
}

编译:

$ nvcc -arch=sm_30 -I/shared/apps/cuda/CUDA-v6.0.37/samples/common/inc -lcurand -o t507 t507.cu

输出:

$ cuda-memcheck ./t507
========= CUDA-MEMCHECK
thread: 0,0,0, value: 0.310691
thread: 1,0,0, value: 0.627906
thread: 0,1,0, value: 0.638900
thread: 1,1,0, value: 0.665186
thread: 0,0,1, value: 0.167465
thread: 1,0,1, value: 0.565227
thread: 0,1,1, value: 0.397606
thread: 1,1,1, value: 0.503013
thread: 0,0,0, value: 0.809163
thread: 1,0,0, value: 0.795669
thread: 0,1,0, value: 0.808565
thread: 1,1,0, value: 0.847564
thread: 0,0,1, value: 0.853998
thread: 1,0,1, value: 0.688446
thread: 0,1,1, value: 0.733255
thread: 1,1,1, value: 0.649379
thread: 0,0,0, value: 0.040824
thread: 1,0,0, value: 0.087417
thread: 0,1,0, value: 0.301392
thread: 1,1,0, value: 0.298669
thread: 0,0,1, value: 0.161962
thread: 1,0,1, value: 0.316443
thread: 0,1,1, value: 0.452077
thread: 1,1,1, value: 0.477722
========= ERROR SUMMARY: 0 errors

在这个例子中,我用同一个内核(多次调用)分别从各个纹理对象中读取。把多个纹理对象传给同一个内核应该也是可行的,但如果代码中能够避免,不建议让同一个 warp 同时从多个纹理中读取。实际的限制存在于 quad(四线程组)级别,这里不展开讨论。最好把代码安排成:在任一给定周期内,一个 warp 只从同一个纹理对象读取。

请注意,为了简化演示,此CreateTexture()函数在循环处理期间覆盖先前分配的设备指针,例如d_cuArr。这不是非法或功能问题,但它会增加内存泄漏的可能性。

我假设您可以修改代码以处理那些问题的解除分配。此代码的目的是演示使事情正常工作的方法。

答案 1 :(得分:0)

在 cudaMalloc3DArray 中,范围参数应该写成 make_cudaExtent(SizeNoiseTest,SizeNoiseTest,SizeNoiseTest),而不是 make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest)。