Question

我正在尝试在 GPU 上运行稠密矩阵的批处理 QR 分解。我已经能够成功运行 cuBLAS 的批处理 dgeqrf 代码，但是当我对 MAGMA 的批处理 dgeqrf（magma_dgeqrf_batched）执行相同的步骤时，却得不到正确的输出。下面贴出我使用 MAGMA 批处理 dgeqrf 执行批处理 QR 的代码。谁能告诉我下面的代码哪里出错了？谢谢

int main() {

//mxn: size of Array[i]

const int m = 3;
const int n = 2;
const int batchSize = 2;//2 small matrices
//const int ltau = 10; //ltau = max(1,min(m,n))
int lda = 3;


double **A,**tau;

A = (double**)malloc(batchSize * sizeof(double*));
for (int i = 0; i < batchSize; i++) {
    A[i] = (double*)malloc(m*n * sizeof(double));
}


tau = (double**)malloc(batchSize * sizeof(double*));        
for (int i = 0; i < batchSize; i++) 
{
        tau[i] = (double*)malloc(n * sizeof(double));
}
  int *info;
info = (int*)malloc(batchSize * sizeof(int));



double **d_A, **h_d_A;
h_d_A = (double**)malloc(batchSize * sizeof(double*));

for (int i = 0; i < batchSize; i++) {
    cudaMalloc((void**)&h_d_A[i], m*n * sizeof(double));

}

cudaMalloc((void**)&d_A, batchSize * sizeof(double*));
cudaMemcpy(d_A, h_d_A, batchSize * sizeof(double*), cudaMemcpyHostToDevice);

for (int k = 0; k < batchSize; k++) {
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            int index = j * m + i;//not tested
            if (i == j) {
                (A[k])[index] = 2;
            }
            else {
                (A[k])[index] = 1.0;

            }
        } // i  
    } // j
} // k



for (int i = 0; i < batchSize; i++)
{
    cudaMemcpy(h_d_A[i], A[i], m *n * sizeof(double), cudaMemcpyHostToDevice);

}
magma_queue_t queue; 

magma_queue_create( &queue);

magma_dgeqrf_batched( m, n, d_A, lda, tau, info, batchSize,queue);

for (int i = 0; i < batchSize; i++)
    cudaMemcpy(A[i], h_d_A[i], m*n * sizeof(double), cudaMemcpyDeviceToHost);



for (int k = 0; k < batchSize; k++) {
    for (int j = 0; j < m; j++) {
        for (int i = 0; i < n; i++) {
            int index = j * m + i;//not tested
            //count = count + 1;
            printf("\n %d The values are %lf",k+index, A[k][index]);
        } // i
    } // j
} // k

free(tau);
free(A);
free(h_d_A);

cudaFree(d_A);

}`

无法获得magma_dgeqrf_batched的正确输出

0 个答案: