我有问题。我有两个不同的代码实现,但两者都有相同的问题:数组内部的值,结构内部,数组内的值的赋值在代码中不起作用(我希望你仍然遵循我)。
以下是两段代码:
版本1:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define ITER 4000
typedef struct Map{
int length;
double *A;
int *x;
int *dx;
int *y;
int *dy;
int *delta;
int *phi;
}Map;
typedef struct Coefs{
int length;
double *x;
double *dx;
double *y;
double *dy;
double *delta;
double *phi;
}Coefs;
void cudaMallocMap(Map **m, int p){
*m = (Map*) malloc(sizeof(Map));
(**m).length = p;
if(p>0){
cudaMalloc((void**)&((**m).A), p*sizeof(double));
cudaMalloc((void**)&((**m).x), p*sizeof(int));
cudaMalloc((void**)&((**m).dx), p*sizeof(int));
cudaMalloc((void**)&((**m).y), p*sizeof(int));
cudaMalloc((void**)&((**m).dy), p*sizeof(int));
cudaMalloc((void**)&((**m).delta), p*sizeof(int));
cudaMalloc((void**)&((**m).phi), p*sizeof(int));
}
}
void cudaFreeMap(Map **m){
if((**m).length > 0){
cudaFree((**m).A);
cudaFree((**m).x);
cudaFree((**m).dx);
cudaFree((**m).y);
cudaFree((**m).dy);
cudaFree((**m).delta);
cudaFree((**m).phi);
}
free(*m);
}
void cudaMallocCoefs(Coefs **c, int iter, int p){
if(iter>0){
int i;
*c = (Coefs*) malloc(p*sizeof(Coefs));
(*c)[0].length = iter;
for(i=0;i<p;i++){
cudaMalloc((void**)&((*c)[i].x), iter*sizeof(double));
cudaMalloc((void**)&((*c)[i].dx), iter*sizeof(double));
cudaMalloc((void**)&((*c)[i].y), iter*sizeof(double));
cudaMalloc((void**)&((*c)[i].dy), iter*sizeof(double));
cudaMalloc((void**)&((*c)[i].delta), iter*sizeof(double));
cudaMalloc((void**)&((*c)[i].phi), iter*sizeof(double));
}
}
}
void cudaFreeCoefs(Coefs **c, int p){
int i;
for(i=0;i<p;i++){
if((**c).length > 0){
cudaFree((*c)[i].x);
cudaFree((*c)[i].dx);
cudaFree((*c)[i].y);
cudaFree((*c)[i].dy);
cudaFree((*c)[i].delta);
cudaFree((*c)[i].phi);
}
}
free(*c);
}
__global__ void testVals(Map *m, Coefs *c){
m->length = 42;
m->A[0] = 1.5;
m->dx[20] = 5;
c[0].delta[4] = 3.14159265;
}
int main(int argc, char **argv){
int xSize = 31, particleCount = 1, iter = ITER;
Map *dev_x;
Coefs *dev_c;
// allocate memory for the map
cudaMallocMap(&dev_x, xSize);
// malloc the coefficients
cudaMallocCoefs(&dev_c, iter, particleCount);
// cuda test kernel
testVals<<<1, 1>>>(dev_x, dev_c);
int testval1;
double testval2;
double testval3;
int length1;
int length2;
cudaMemcpy(&testval1, &(dev_x->dx[20]), sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&testval2, &(dev_x->A[0]), sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&testval3, &(dev_c[0].delta[4]), sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&length1, &(dev_x->length), sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&length2, &(dev_c->length), sizeof(int), cudaMemcpyDeviceToHost);
// print test results
fprintf(stderr, "Length map: %d\nLength coefs: %d\nValue map A[0]: %lf\nValue map dx[20]: %d\nValue coefs[0] delta[4]: %lf\n", length1, length2, testval2, testval1, testval3);
// clean up the heap and tell that the computation is finished
cudaFreeMap(&dev_x);
cudaFreeCoefs(&dev_c, particleCount);
getchar();
return 0;
}
第2版:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define ITER 4000
typedef struct Map{
int length;
double *A;
int *x;
int *dx;
int *y;
int *dy;
int *delta;
int *phi;
}Map;
typedef struct Coefs{
int length;
double *x;
double *dx;
double *y;
double *dy;
double *delta;
double *phi;
}Coefs;
void cudaMallocMap(Map **m, int p){
cudaMalloc((void**)m, sizeof(Map));
cudaMemset(&((**m).length), p, sizeof(int));
if(p>0){
double **h_arr1 = (double**)malloc(sizeof(double*));
int **h_arr2 = (int**)malloc(sizeof(int*));
cudaMemcpy(h_arr1, &((**m).A), sizeof(double*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr1, p*sizeof(double));
cudaMemcpy(h_arr2, &((**m).x), sizeof(int*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr2, p*sizeof(int));
cudaMemcpy(h_arr2, &((**m).dx), sizeof(int*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr2, p*sizeof(int));
cudaMemcpy(h_arr2, &((**m).y), sizeof(int*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr2, p*sizeof(int));
cudaMemcpy(h_arr2, &((**m).dy), sizeof(int*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr2, p*sizeof(int));
cudaMemcpy(h_arr2, &((**m).delta), sizeof(int*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr2, p*sizeof(int));
cudaMemcpy(h_arr2, &((**m).phi), sizeof(int*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr2, p*sizeof(int));
free(h_arr1);
free(h_arr2);
}
}
void cudaFreeMap(Map **m){
Map h_map;
cudaMemcpy(&h_map, *m, sizeof(Map), cudaMemcpyDeviceToHost);
if(h_map.length > 0){
cudaFree(h_map.A);
cudaFree(h_map.x);
cudaFree(h_map.dx);
cudaFree(h_map.y);
cudaFree(h_map.dy);
cudaFree(h_map.delta);
cudaFree(h_map.phi);
}
cudaFree(*m);
}
void cudaMallocCoefs(Coefs **c, int iter, int p){
if(iter>0){
int i;
cudaMalloc((void**)c, p*sizeof(Coefs));
for(i=0;i<p;i++){
double **h_arr = (double**)malloc(sizeof(double*));
cudaMemset(&((*c)[i].length), iter, sizeof(int));
cudaMemcpy(h_arr, &((*c)[i].x), sizeof(double*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr, iter*sizeof(double));
cudaMemcpy(h_arr, &((*c)[i].dx), sizeof(double*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr, iter*sizeof(double));
cudaMemcpy(h_arr, &((*c)[i].y), sizeof(double*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr, iter*sizeof(double));
cudaMemcpy(h_arr, &((*c)[i].dy), sizeof(double*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr, iter*sizeof(double));
cudaMemcpy(h_arr, &((*c)[i].delta), sizeof(double*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr, iter*sizeof(double));
cudaMemcpy(h_arr, &((*c)[i].phi), sizeof(double*), cudaMemcpyDeviceToHost);
cudaMalloc((void**)h_arr, iter*sizeof(double));
free(h_arr);
}
}
}
void cudaFreeCoefs(Coefs **c, int p){
Coefs h_coefs;
int i;
for(i=0;i<p;i++){
cudaMemcpy(&h_coefs, &((*c)[i]), sizeof(Coefs), cudaMemcpyDeviceToHost);
if(h_coefs.length > 0){
cudaFree(h_coefs.x);
cudaFree(h_coefs.dx);
cudaFree(h_coefs.y);
cudaFree(h_coefs.dy);
cudaFree(h_coefs.delta);
cudaFree(h_coefs.phi);
}
}
cudaFree(*c);
}
__global__ void testVals(Map *m, Coefs *c){
m->length = 42;
m->A[0] = 1.5;
m->dx[20] = 5;
c[0].delta[4] = 3.14159265;
}
int main(int argc, char **argv){
int xSize = 31, iter = ITER, particleCount = 1;
Map *dev_x;
Coefs *dev_c;
//malloc map
cudaMallocMap(&dev_x, xSize);
// malloc the coefficients
cudaMallocCoefs(&dev_c, iter, particleCount);
// cuda test kernel
testVals<<<1, 1>>>(dev_x, dev_c);
int testval1;
double testval2;
double testval3;
int length1;
int length2;
Map testmap;
mallocMap(&testmap, xSize);
Coefs *testcoefs;
mallocCoefs(&testcoefs,iter, particleCount);
cudaMemcpy(&testmap, &dev_x, sizeof(Map), cudaMemcpyDeviceToHost);
cudaMemcpy(&testcoefs, &dev_c, sizeof(Coefs), cudaMemcpyDeviceToHost);
cudaMemcpy(&testval1, &(testmap.dx[20]), sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&testval2, &(testmap.A[0]), sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&testval3, &(testcoefs[0].delta[4]), sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&length1, &(testmap.length), sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&length2, &(testcoefs[0].length), sizeof(int), cudaMemcpyDeviceToHost);
// print test results
fprintf(stderr, "Length map: %d\nLength coefs: %d\nValue map A[0]: %lf\nValue map dx[20]: %d\nValue coefs[0] delta[4]: %lf\n", length1, length2, testval2, testval1, testval3);
// clean up the heap and tell that the computation is finished
cudaFreeMap(&dev_x);
cudaFreeCoefs(&dev_c, particleCount);
getchar();
return 0;
}
我认为malloc不能按照我想要的方式工作,或者测试内核在没有通知我的情况下做错了指针,但我似乎无法找到错误的地方。 两个版本都运行没有错误并显示非常负值,它们应该是42(在内核中设置,在mallocing时覆盖值),31(在mallocing期间设置),1.5,5和3.141592(在内核中设置)
有人可以帮我解释一下如何正确地malloc包含数组的结构数组和那些数组的长度(Coefs / Map)以及如何传递它们的指针?
编辑:另外,这里是地图和coefs的图形表示:
Coefs *c:
┌──────────────────────┬─────┬─────┬─────┬─────┬─────┬─
│ Particles/Iterations │ 1 │ 2 │ 3 │ 4 │ 5 │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│ length=4000 │ │ │ │ │ │
│ 1: c[0]. x │ x[0]│ x[1]│ x[2]│ x[3]│ x[4]│
│ dx │dx[0]│dx[1]│dx[2]│dx[3]│dx[4]│
│ y │ y[0]│ y[1]│ y[2]│ y[3]│ y[4]│
│ dy │ ... │ │ │ │ │
│ delta │ │ │ │ │ │
│ phi │ │ │ │ │ │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│ length=4000 │ │ │ │ │ │
│ 2: c[1]. x │ │ │ │ │ │
│ dx │ │ │ │ │ │
│ y │ │ │ │ │ │
│ dy │ │ │ │ │ │
│ delta │ │ │ │ │ │
│ phi │ │ │ │ │ │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│ length=4000 │ │ │ │ │ │
│ 3: c[2]. x │ │ │ │ │ │
│ dx │ │ │ │ │ │
│ y │ │ │ │ │ │
│ dy │ │ │ │ │ │
│ delta │ │ │ │ │ │
│ phi │ │ │ │ │ │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│ length=4000 │ │ │ │ │ │
│ 4: c[2]. x │ │ │ │ │ │
│ dx │ │ │ │ │ │
│ y │ │ │ │ │ │
│ dy │ │ │ │ │ │
│ delta │ │ │ │ │ │
│ phi │ │ │ │ │ │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
Map *m:
┌──────────────────────┬─────┬─────┬─────┬─────┬─────┬─
│ Mapnr \ Mapline │ 1 │ 2 │ 3 │ 4 │ 5 │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│ length=31 │ │ │ │ │ │
│ 1 (*m). x │ x[0]│ x[1]│ x[2]│ x[3]│ x[4]│
│ dx │dx[0]│dx[1]│dx[2]│dx[3]│dx[4]│
│ y │ y[0]│ y[1]│ y[2]│ y[3]│ y[4]│
│ dy │ ... │ │ │ │ │
│ delta │ │ │ │ │ │
│ phi │ │ │ │ │ │
└──────────────────────┴─────┴─────┴─────┴─────┴─────┴─
正如您所看到的,目前我只使用一张地图并使用可变数量的地图线。我还使用Coefs的可变长度数组,它具有可变长度ITER的成员(默认为4000)。这就是我在GPU上需要malloc,以便在内核中使用。