将数组结构数组加载到cuda上

时间:2012-11-20 21:57:14

标签: struct cuda

我正在尝试创建一个包含数组的数组结构,然后将它们加载到GPU上。我想我按照步骤正确地做了这件事。

  1. 使用malloc在CPU上创建结构。
  2. cudamalloc数组到结构。
  3. 使用cudamalloc
  4. 在GPU上创建结构
  5. 将CPU结构复制到GPU结构上。
  6. 当我运行此代码时,只要我不更改内核函数中的值p[i].c[0],它就会正常工作。如果我删除行p[i].c[0] = 3.3;,则会输出预期结果。当我保持原样时,它输出所有值的随机数。我希望能够使用内核函数更新数组中的值。

    可能出现什么问题?

    这是我的代码:

    #include <stdio.h>
    #include <cuda_runtime.h>
    #include <iostream>
    #include <fstream>
    #include <sstream>
    #include <cstdio>
    #include <fcntl.h>
    #include <unistd.h>
    #include <assert.h>
    #include <omp.h>
    #include <vector>
    #include <sys/time.h>
    
        float cData[]
                    {
                            1,
                            2,
                            3,
                            4,
                            5,
                            6,
                            7,
                            8,
                            9,
                            10,
                            11,
                            12,
                            13,
                            14,
                            15,
                            16
                    };
        float dData[]
                    {
                            1,
                            2,
                            3,
                            4,
                            5,
                            6,
                            7,
                            8,
                            9,
                            10,
                            11,
                            12,
                            13,
                            14,
                            15,
                            16
                    };
    
        typedef struct
                {
                    float a, b;
                    float* c;
                    float* d;
                } point;
    
    __global__ void testKernel(point *p){
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        p[i].a = 1.1;
        p[i].b = 2.2;
        p[i].c[0] = 3.3;
    }
    
    void checkerror(cudaError_t error, char* descrp){
        if (error != 0){
    
            printf("%s error code: %d \n", descrp, error);
        }
    
    }
    
    extern "C" int main()
    {
        printf("starting gpuCode\n");
        int *dev_a;
                // set number of points
            int numPoints    = 16,
                gpuBlockSize = 4,
                pointSize    = sizeof(point),
                numBytes     = numPoints * pointSize,
                gpuGridSize  = numPoints / gpuBlockSize;
        cudaError_t err = cudaSuccess;
        printf("initialized variables\n");
                // allocate memory
            point *cpuPointArray,
                  *gpuPointArray,
                  *outPointArray;
            cpuPointArray = (point*)malloc(numBytes);  //create the cpuPointArray struct on the cpu
            outPointArray = (point*)malloc(numBytes);  //create the outPointArray struct on the cpu
            printf("load cpuPointArray struct with default values\n");
    
            for (int k=0; k<16; k++){
                err = cudaMalloc( (void**)&cpuPointArray[k].c, 16*sizeof(float) );
                checkerror(err, "assigning cuda pointer c");
                err = cudaMalloc( (void**)&cpuPointArray[k].d, 16*sizeof(float) );
                checkerror(err, "assigning cuda pointer d");
                cpuPointArray[k].a = 16;
                cpuPointArray[k].b = 16;
            }
    
    
            for (int k=0; k<16; k++){
                printf("top loop %d\n", k);
                err = cudaMemcpy(cpuPointArray[k].c, cData, 16*sizeof(float), cudaMemcpyHostToDevice);
                printf("after cdata\n");
                checkerror(err, "copying cdata to gpu array c" );
                err = cudaMemcpy(cpuPointArray[k].d, dData, 16*sizeof(float), cudaMemcpyHostToDevice);
                printf("after ddata\n");
                checkerror(err, "copying ddata to gpu array d");
                printf("bottom of loop %d\n", k);
            }
    
            err = cudaMalloc((void**)&gpuPointArray, numBytes);  //allocate memory on the gpu for the cpu point array
            checkerror(err, "allocating memory for gpuPointArray");
            err = cudaMemcpy(gpuPointArray,cpuPointArray,sizeof(cpuPointArray), cudaMemcpyHostToDevice); //copy the cpu point array onto the gpu
            checkerror(err, "copying cpuPointArray to gpuPointArray");
    
            printf("loaded the struct into the kernel\n");
    
            for(int i = 0; i < numPoints; ++i)
                    {
                        printf("point.a: %f, point.b: %f ************************\n",cpuPointArray[i].a,cpuPointArray[i].b);
    
                            printf("cuda mem location point.c: %d point.d: %d\n",&cpuPointArray[i].c, &cpuPointArray[i].d);
    
                    }
    
                // launch kernel
            testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
    
            printf("returned the struct from the kernel\n");
            err = cudaMemcpy(outPointArray,gpuPointArray,numBytes, cudaMemcpyDeviceToHost);
            checkerror(err, "copying gpuPointArray to cpuPointArray");
            printf("after gpu copy to cpu\n");
            for (int k=0; k<16; k++){
                printf("creating memory on cpu for array c\n");
                outPointArray[k].c = (float*)malloc(16*sizeof(float));
                printf("creating memory on cpu for array d\n");
                outPointArray[k].d = (float*)malloc(16*sizeof(float));
                printf("copying memory values onto cpu array c\n");
                err = cudaMemcpy(outPointArray[k].c, cpuPointArray[k].c, 16*sizeof(float), cudaMemcpyDeviceToHost);
                checkerror(err, "copy array c from gpu to cpu");
                printf("copying memory values onto cpu array c\n");
                err = cudaMemcpy(outPointArray[k].d, cpuPointArray[k].d, 16*sizeof(float), cudaMemcpyDeviceToHost);
                checkerror(err, "copy array d from gpu to cpu");
                printf("bottom of loop %d\n", k);
            }
    
                // retrieve the results
    
            printf("testKernel results:\n");
            for(int i = 0; i < numPoints; ++i)
            {
                printf("point.a: %f, point.b: %f ************************\n",outPointArray[i].a,outPointArray[i].b);
                for (int j=0; j<16; j++){
                    printf("point.c: %f point.d: %f\n",outPointArray[i].c[j], outPointArray[i].d[j]);
                }
            }
    
                // deallocate memory
            free(cpuPointArray);
            cudaFree(gpuPointArray);
    
            return 0;
        }
    

1 个答案:

答案 0 :(得分:1)

您好像可能会错误地将结构数组复制到设备中。尝试更改:

err = cudaMemcpy(gpuPointArray,cpuPointArray,sizeof(cpuPointArray), cudaMemcpyHostToDevice); 

err = cudaMemcpy(gpuPointArray,cpuPointArray,numBytes, cudaMemcpyHostToDevice); 

因为cpuPointArray的类型为point *,所以sizeof(cpuPointArray)实际上会返回机器上指针的大小。你想要的是整个结构数组的大小。实际上,使用以下命令从设备复制时,它甚至看起来都是正确的。

err = cudaMemcpy(outPointArray,gpuPointArray,numBytes, cudaMemcpyDeviceToHost);

希望有所帮助!