无法在struct - cuda c中获得正确的值赋值

时间:2014-06-23 23:56:50

标签: c++ c struct cuda malloc

我有问题。我有两个不同的代码实现,但两者都有相同的问题:数组内部的值,结构内部,数组内的值的赋值在代码中不起作用(我希望你仍然遵循我)。

以下是两段代码:

版本1:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define ITER 4000

typedef struct Map{
    int length;
    double *A;
    int *x;
    int *dx;
    int *y;
    int *dy;
    int *delta;
    int *phi;
}Map;

typedef struct Coefs{
    int length;
    double *x;
    double *dx;
    double *y;
    double *dy;
    double *delta;
    double *phi;
}Coefs;

void cudaMallocMap(Map **m, int p){
    *m = (Map*) malloc(sizeof(Map));
    (**m).length = p;
    if(p>0){
        cudaMalloc((void**)&((**m).A), p*sizeof(double));
        cudaMalloc((void**)&((**m).x), p*sizeof(int));
        cudaMalloc((void**)&((**m).dx), p*sizeof(int));
        cudaMalloc((void**)&((**m).y), p*sizeof(int));
        cudaMalloc((void**)&((**m).dy), p*sizeof(int));
        cudaMalloc((void**)&((**m).delta), p*sizeof(int));
        cudaMalloc((void**)&((**m).phi), p*sizeof(int));
    }
}

void cudaFreeMap(Map **m){
    if((**m).length > 0){
        cudaFree((**m).A);
        cudaFree((**m).x);
        cudaFree((**m).dx);
        cudaFree((**m).y);
        cudaFree((**m).dy);
        cudaFree((**m).delta);
        cudaFree((**m).phi);
    }
    free(*m);
}

void cudaMallocCoefs(Coefs **c, int iter, int p){
    if(iter>0){
        int i;
        *c = (Coefs*) malloc(p*sizeof(Coefs));
        (*c)[0].length = iter;
        for(i=0;i<p;i++){
            cudaMalloc((void**)&((*c)[i].x), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].dx), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].y), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].dy), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].delta), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].phi), iter*sizeof(double));
        }
    }
}

void cudaFreeCoefs(Coefs **c, int p){
    int i;
    for(i=0;i<p;i++){
        if((**c).length > 0){
            cudaFree((*c)[i].x);
            cudaFree((*c)[i].dx);
            cudaFree((*c)[i].y);
            cudaFree((*c)[i].dy);
            cudaFree((*c)[i].delta);
            cudaFree((*c)[i].phi);
        }
    }
    free(*c);
}

__global__ void testVals(Map *m, Coefs *c){
    m->length = 42;
    m->A[0] = 1.5;
    m->dx[20] = 5;
    c[0].delta[4] = 3.14159265;
}

int main(int argc, char **argv){
    int xSize = 31, particleCount = 1, iter = ITER;
    Map *dev_x;
    Coefs *dev_c;

    // allocate memory for the map
    cudaMallocMap(&dev_x, xSize);

    // malloc the coefficients
    cudaMallocCoefs(&dev_c, iter, particleCount);

    // cuda test kernel
    testVals<<<1, 1>>>(dev_x, dev_c);
    int testval1;
    double testval2;
    double testval3;
    int length1;
    int length2;
    cudaMemcpy(&testval1, &(dev_x->dx[20]), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval2, &(dev_x->A[0]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval3, &(dev_c[0].delta[4]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length1, &(dev_x->length), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length2, &(dev_c->length), sizeof(int), cudaMemcpyDeviceToHost);

    // print test results
    fprintf(stderr, "Length map: %d\nLength coefs: %d\nValue map A[0]: %lf\nValue map dx[20]: %d\nValue coefs[0] delta[4]: %lf\n", length1, length2, testval2, testval1, testval3);

    // clean up the heap and tell that the computation is finished
    cudaFreeMap(&dev_x);
    cudaFreeCoefs(&dev_c, particleCount);
    getchar();
    return 0;
}

第2版:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define ITER 4000

typedef struct Map{
    int length;
    double *A;
    int *x;
    int *dx;
    int *y;
    int *dy;
    int *delta;
    int *phi;
}Map;

typedef struct Coefs{
    int length;
    double *x;
    double *dx;
    double *y;
    double *dy;
    double *delta;
    double *phi;
}Coefs;

void cudaMallocMap(Map **m, int p){
    cudaMalloc((void**)m, sizeof(Map));
    cudaMemset(&((**m).length), p, sizeof(int));
    if(p>0){
        double **h_arr1 = (double**)malloc(sizeof(double*));
        int **h_arr2 = (int**)malloc(sizeof(int*));
        cudaMemcpy(h_arr1, &((**m).A), sizeof(double*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr1, p*sizeof(double));
        cudaMemcpy(h_arr2, &((**m).x), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).dx), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).y), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).dy), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).delta), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).phi), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        free(h_arr1);
        free(h_arr2);
    }
}

void cudaFreeMap(Map **m){
    Map h_map;
    cudaMemcpy(&h_map, *m, sizeof(Map), cudaMemcpyDeviceToHost);

    if(h_map.length > 0){
        cudaFree(h_map.A);
        cudaFree(h_map.x);
        cudaFree(h_map.dx);
        cudaFree(h_map.y);
        cudaFree(h_map.dy);
        cudaFree(h_map.delta);
        cudaFree(h_map.phi);
    }
    cudaFree(*m);
}

void cudaMallocCoefs(Coefs **c, int iter, int p){
    if(iter>0){
        int i;
        cudaMalloc((void**)c, p*sizeof(Coefs));
        for(i=0;i<p;i++){
            double **h_arr = (double**)malloc(sizeof(double*));
            cudaMemset(&((*c)[i].length), iter, sizeof(int));
            cudaMemcpy(h_arr, &((*c)[i].x), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].dx), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].y), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].dy), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].delta), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].phi), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            free(h_arr);
        }
    }
}

void cudaFreeCoefs(Coefs **c, int p){
    Coefs h_coefs;
    int i;
    for(i=0;i<p;i++){
        cudaMemcpy(&h_coefs, &((*c)[i]), sizeof(Coefs), cudaMemcpyDeviceToHost);
        if(h_coefs.length > 0){
            cudaFree(h_coefs.x);
            cudaFree(h_coefs.dx);
            cudaFree(h_coefs.y);
            cudaFree(h_coefs.dy);
            cudaFree(h_coefs.delta);
            cudaFree(h_coefs.phi);
        }
    }
    cudaFree(*c);
}

__global__ void testVals(Map *m, Coefs *c){
    m->length = 42;
    m->A[0] = 1.5;
    m->dx[20] = 5;
    c[0].delta[4] = 3.14159265;
}


int main(int argc, char **argv){
    int xSize = 31, iter = ITER, particleCount = 1;
    Map *dev_x;
    Coefs *dev_c;

    //malloc map
    cudaMallocMap(&dev_x, xSize);

    // malloc the coefficients 
    cudaMallocCoefs(&dev_c, iter, particleCount);


    // cuda test kernel
    testVals<<<1, 1>>>(dev_x, dev_c);
    int testval1;
    double testval2;
    double testval3;
    int length1;
    int length2;
    Map testmap;
    mallocMap(&testmap, xSize);
    Coefs *testcoefs;
    mallocCoefs(&testcoefs,iter, particleCount);
    cudaMemcpy(&testmap, &dev_x, sizeof(Map), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testcoefs, &dev_c, sizeof(Coefs), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval1, &(testmap.dx[20]), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval2, &(testmap.A[0]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval3, &(testcoefs[0].delta[4]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length1, &(testmap.length), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length2, &(testcoefs[0].length), sizeof(int), cudaMemcpyDeviceToHost);


    // print test results
    fprintf(stderr, "Length map: %d\nLength coefs: %d\nValue map A[0]: %lf\nValue map dx[20]: %d\nValue coefs[0] delta[4]: %lf\n", length1, length2, testval2, testval1, testval3);


    // clean up the heap and tell that the computation is finished
    cudaFreeMap(&dev_x);
    cudaFreeCoefs(&dev_c, particleCount);
    getchar();
    return 0;
}

我认为malloc不能按照我想要的方式工作,或者测试内核在没有通知我的情况下做错了指针,但我似乎无法找到错误的地方。 两个版本都运行没有错误并显示非常负值,它们应该是42(在内核中设置,在mallocing时覆盖值),31(在mallocing期间设置),1.5,5和3.141592(在内核中设置)

有人可以帮我解释一下如何正确地malloc包含数组的结构数组和那些数组的长度(Coefs / Map)以及如何传递它们的指针?

编辑:另外,这里是地图和coefs的图形表示:

Coefs *c: 
┌──────────────────────┬─────┬─────┬─────┬─────┬─────┬─
│ Particles/Iterations │  1  │  2  │  3  │  4  │  5  │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 1: c[0].    x        │ x[0]│ x[1]│ x[2]│ x[3]│ x[4]│
│            dx        │dx[0]│dx[1]│dx[2]│dx[3]│dx[4]│
│             y        │ y[0]│ y[1]│ y[2]│ y[3]│ y[4]│
│            dy        │ ... │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 2: c[1].    x        │     │     │     │     │     │
│            dx        │     │     │     │     │     │
│             y        │     │     │     │     │     │
│            dy        │     │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 3: c[2].    x        │     │     │     │     │     │
│            dx        │     │     │     │     │     │
│             y        │     │     │     │     │     │
│            dy        │     │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 4: c[2].    x        │     │     │     │     │     │
│            dx        │     │     │     │     │     │
│             y        │     │     │     │     │     │
│            dy        │     │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─

Map *m: 
┌──────────────────────┬─────┬─────┬─────┬─────┬─────┬─
│   Mapnr \ Mapline    │  1  │  2  │  3  │  4  │  5  │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=31     │     │     │     │     │     │
│ 1 (*m).     x        │ x[0]│ x[1]│ x[2]│ x[3]│ x[4]│
│            dx        │dx[0]│dx[1]│dx[2]│dx[3]│dx[4]│
│             y        │ y[0]│ y[1]│ y[2]│ y[3]│ y[4]│
│            dy        │ ... │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
└──────────────────────┴─────┴─────┴─────┴─────┴─────┴─

正如您所看到的,目前我只使用一张地图并使用可变数量的地图线。我还使用Coefs的可变长度数组,它具有可变长度ITER的成员(默认为4000)。这就是我在GPU上需要malloc,以便在内核中使用。

0 个答案:

没有答案