Question

我正在从主机到设备传递矩阵并尝试将其存储在gpu内存中。测试它是否真的有用我正在将第一行从gpu中的矩阵复制到主机。但是它从包装器返回后会打印出垃圾值。

C档案：

int col;
srand(time(NULL));
matrix = (int**) malloc(10*sizeof(int*));

for(int j = 0; j < 10; j++)
{
    col = 3 + (rand() % 7);
    matrix[j] = (int*) malloc(sizeof(int)*col);
    matrix[j][0] = col-1;
    for(int i = 1; i < col; i++)
    {
       matrix[j][i] = i;
    }
}

int first_row[10];
int rows = 10;
pass_matrix_kernel_wrapper(matrix, &rows); 

foo_wrapper(first_row); // get the first row of the matrix from the gpu

for(int i = 0; i < matrix[0][0]; i++)
{
    printf("%d, ", first_row[i]);
}

Cuda文件：

__shared__ int **gpu_matrix;
__shared__ int gpu_rows;


void pass_matrix_kernel_wrapper(int** matrix, int* rows)
{
    cudaMalloc((void***)(&gpu_matrix), sizeof(int*) * (*rows));
    for (int i = 0; i < *rows; i++)
    {
    int cols = matrix[i][0] + 1;
        int* temp;

        cudaMalloc( (void**)  &(temp), sizeof(int) * cols); // allocate for 1 int in    each int pointer
        cudaMemcpy(temp, matrix[i], sizeof(int) * cols, cudaMemcpyHostToDevice); // copy data
        cudaMemcpy(gpu_matrix+i, &temp, sizeof(int*) * cols, cudaMemcpyHostToDevice);
    }
}

void foo_wrapper(int* back)
{
    int* temp;
    cudaMalloc( (void**)  &(temp), sizeof(int) * 11); // allocate for 1 int in each int pointer

    test_kernel<<<1,1>>>(temp); // just checking if it works
    cudaDeviceSynchronize();

    int size = temp[0] + 1;
    cudaMemcpy(back, &temp, sizeof(int) * size, cudaMemcpyDeviceToHost);
}

__global__ void test_kernel(int* back)
{
    for(int i = 0;i < gpu_matrix[0][0] + 1;i++) // gpu_matrix[0][0] stores num of cols in row
    {
        back[i] = gpu_matrix[0][i];
    }
}

将值从主机传递到设备，反之亦然Cuda

0 个答案: