我正在尝试在 GPU 上运行稠密矩阵的批处理 QR 分解。我已经能够成功运行 cuBLAS 的批处理 dgeqrf 代码,但是当我用 MAGMA 的批处理 dgeqrf(magma_dgeqrf_batched)执行相同的步骤时,却得不到正确的输出。下面是我使用 magma_dgeqrf_batched 执行批处理 QR 的代码。谁能告诉我下面的代码哪里出错了?谢谢。
int main() {
//mxn: size of Array[i]
const int m = 3;
const int n = 2;
const int batchSize = 2;//2 small matrices
//const int ltau = 10; //ltau = max(1,min(m,n))
int lda = 3;
double **A,**tau;
A = (double**)malloc(batchSize * sizeof(double*));
for (int i = 0; i < batchSize; i++) {
A[i] = (double*)malloc(m*n * sizeof(double));
}
tau = (double**)malloc(batchSize * sizeof(double*));
for (int i = 0; i < batchSize; i++)
{
tau[i] = (double*)malloc(n * sizeof(double));
}
int *info;
info = (int*)malloc(batchSize * sizeof(int));
double **d_A, **h_d_A;
h_d_A = (double**)malloc(batchSize * sizeof(double*));
for (int i = 0; i < batchSize; i++) {
cudaMalloc((void**)&h_d_A[i], m*n * sizeof(double));
}
cudaMalloc((void**)&d_A, batchSize * sizeof(double*));
cudaMemcpy(d_A, h_d_A, batchSize * sizeof(double*), cudaMemcpyHostToDevice);
for (int k = 0; k < batchSize; k++) {
for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) {
int index = j * m + i;//not tested
if (i == j) {
(A[k])[index] = 2;
}
else {
(A[k])[index] = 1.0;
}
} // i
} // j
} // k
for (int i = 0; i < batchSize; i++)
{
cudaMemcpy(h_d_A[i], A[i], m *n * sizeof(double), cudaMemcpyHostToDevice);
}
magma_queue_t queue;
magma_queue_create( &queue);
magma_dgeqrf_batched( m, n, d_A, lda, tau, info, batchSize,queue);
for (int i = 0; i < batchSize; i++)
cudaMemcpy(A[i], h_d_A[i], m*n * sizeof(double), cudaMemcpyDeviceToHost);
for (int k = 0; k < batchSize; k++) {
for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
int index = j * m + i;//not tested
//count = count + 1;
printf("\n %d The values are %lf",k+index, A[k][index]);
} // i
} // j
} // k
free(tau);
free(A);
free(h_d_A);
cudaFree(d_A);
}`