我正在从主机到设备传递矩阵并尝试将其存储在gpu内存中。 测试它是否真的有用我正在将第一行从gpu中的矩阵复制到主机。 但是它从包装器返回后会打印出垃圾值。
C档案:
int col;
srand(time(NULL));
matrix = (int**) malloc(10*sizeof(int*));
for(int j = 0; j < 10; j++)
{
col = 3 + (rand() % 7);
matrix[j] = (int*) malloc(sizeof(int)*col);
matrix[j][0] = col-1;
for(int i = 1; i < col; i++)
{
matrix[j][i] = i;
}
}
int first_row[10];
int rows = 10;
pass_matrix_kernel_wrapper(matrix, &rows);
foo_wrapper(first_row); // get the first row of the matrix from the gpu
for(int i = 0; i < matrix[0][0]; i++)
{
printf("%d, ", first_row[i]);
}
Cuda文件:
__shared__ int **gpu_matrix;
__shared__ int gpu_rows;
void pass_matrix_kernel_wrapper(int** matrix, int* rows)
{
cudaMalloc((void***)(&gpu_matrix), sizeof(int*) * (*rows));
for (int i = 0; i < *rows; i++)
{
int cols = matrix[i][0] + 1;
int* temp;
cudaMalloc( (void**) &(temp), sizeof(int) * cols); // allocate for 1 int in each int pointer
cudaMemcpy(temp, matrix[i], sizeof(int) * cols, cudaMemcpyHostToDevice); // copy data
cudaMemcpy(gpu_matrix+i, &temp, sizeof(int*) * cols, cudaMemcpyHostToDevice);
}
}
void foo_wrapper(int* back)
{
int* temp;
cudaMalloc( (void**) &(temp), sizeof(int) * 11); // allocate for 1 int in each int pointer
test_kernel<<<1,1>>>(temp); // just checking if it works
cudaDeviceSynchronize();
int size = temp[0] + 1;
cudaMemcpy(back, &temp, sizeof(int) * size, cudaMemcpyDeviceToHost);
}
__global__ void test_kernel(int* back)
{
for(int i = 0;i < gpu_matrix[0][0] + 1;i++) // gpu_matrix[0][0] stores num of cols in row
{
back[i] = gpu_matrix[0][i];
}
}