Copying an array of pointers into device memory and back (CUDA)

Time: 2015-01-13 21:14:37

Tags: arrays pointers cuda cublas

I am trying to use the cuBLAS function cublasSgemmBatched in a toy example. In this example I first allocate 2D arrays: h_AA and h_BB of size [6][5], and h_CC of size [6][1]. After that I copy them to the device, run cublasSgemmBatched, and try to copy the array d_CC back to the host array h_CC. However, I get an error (cudaErrorLaunchFailure) on the device-to-host copy, and I am not sure that I copied the arrays to the device correctly:

int main(){
    cublasHandle_t handle;
    cudaError_t cudaerr;
    cudaEvent_t start, stop;
    cublasStatus_t stat;
    const float alpha = 1.0f;
    const float beta = 0.0f;

    float **h_AA, **h_BB, **h_CC;
    h_AA = new float*[6];
    h_BB = new float*[6];
    h_CC = new float*[6];
    for (int i = 0; i < 6; i++){
        h_AA[i] = new float[5];
        h_BB[i] = new float[5];
        h_CC[i] = new float[1];
        for (int j = 0; j < 5; j++){
            h_AA[i][j] = j;
            h_BB[i][j] = j;
        }
        h_CC[i][0] = 1;
    }

    float **d_AA, **d_BB, **d_CC;
    cudaMalloc(&d_AA, 6 * sizeof(float*));
    cudaMalloc(&d_BB, 6 * sizeof(float*));
    cudaMalloc(&d_CC, 6 * sizeof(float*));
    cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);

    stat = cublasCreate(&handle);
    stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
             (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
    cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
    cublasDestroy(handle);
}

So this code runs, but the last cudaMemcpy returns cudaErrorLaunchFailure. I was trying to follow this sample code on Github.

Thanks

P.S. What I do not understand is what sizeof(float*) is, and how cudaMalloc knows how much memory each array needs (here I only specify the size of one dimension).
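For context, a minimal illustration of what these calls actually reserve (the buffer names below are only illustrative, not from the code above): sizeof(float*) is simply the size of a pointer, and cudaMalloc allocates exactly the number of bytes requested, so it never needs to know the second dimension; each row the pointers refer to has to be allocated by its own call.

float **d_PP;                            // illustrative name: a device array of 6 pointers
cudaMalloc(&d_PP, 6 * sizeof(float*));   // reserves 6 pointer slots (typically 6*8 bytes),
                                         // not the float data those pointers will reference
float *d_row;
cudaMalloc(&d_row, 5 * sizeof(float));   // each 5-float row needs its own allocation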

UPDATE: I did it!! (see the answer below)


1 answer:

Answer 0 (score: 5)

So, I figured out the answer (thanks to @Robert Crovella): to create a device array of pointers to device arrays (as required by the batched functions), one should first build a host array of pointers to device arrays and then copy it into the device array of pointers to device arrays. The same holds for transferring the results back to the host: one should go through an intermediate host array of pointers to device arrays:

cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;

// host input data: one 5-element vector reused for every A and B in the batch,
// plus room on the host for the 6 scalar results
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}

// host arrays of pointers; each entry will receive a *device* pointer
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    // allocate the per-batch device buffers and fill A and B
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
// device arrays of pointers; copy the host pointer arrays into them
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
// batched GEMM: 6 independent (1x5)*(5x1) products
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
         (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
// copy the device array of pointers back into the host array of pointers ...
cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
// ... then fetch each scalar result through the intermediate host pointer array
for (int i = 0; i < 6; i++)
    cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
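
For completeness, a minimal follow-up sketch, assuming the variables above are still in scope and that <cstdio> is included for printf: it prints the six results gathered into h_C and frees the allocations. With alpha = 1 and beta = 0, each result should be the dot product 0*0 + 1*1 + 2*2 + 3*3 + 4*4 = 30.

// check the six batched results copied into h_C above
for (int i = 0; i < 6; i++)
    printf("h_C[%d] = %f\n", i, h_C[i]);   // expected: 30.0 for every batch entry

// release the device buffers (h_AA[i] etc. hold device pointers), then host memory
for (int i = 0; i < 6; i++){
    cudaFree(h_AA[i]);
    cudaFree(h_BB[i]);
    cudaFree(h_CC[i]);
}
cudaFree(d_AA); cudaFree(d_BB); cudaFree(d_CC);
free(h_AA); free(h_BB); free(h_CC);
delete[] h_A; delete[] h_B; delete[] h_C;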