I am using cublasDgeqrfBatched in cuBLAS to compute the QR factorization of many small matrices. For example, I use the 4x4 matrix A:
A= 2 9 8 9
10 2 10 10
10 5 7 7
5 10 1 8
For batchsize = 1, the output Array[1] of cublasDgeqrfBatched is:
Array=-15.1658 0.5243 0.4660 0.5243
-13.7151 10.7655 0.0496 0.1148
-12.1326 7.7656 3.9365 0.4519
-11.8688 -0.2585 5.3365 4.5371
and Tauarray:
Tauarray[4]=1.1319
1.9692
1.6609
0.0000
The lower part of Array as printed above is R (Array is in column-major storage). I checked this with Matlab:
`[Q,R]=qr(A')` gives:
R =
-15.1658 -13.7151 -12.1326 -11.8688
0 10.7655 7.7656 -0.2585
0 0 3.9365 5.3365
0 0 0 4.5371
and
Q =
-0.1319 0.7609 0.6329 0.0560
-0.5934 -0.5703 0.5661 -0.0467
-0.5275 0.2569 -0.3543 -0.7282
-0.5934 0.1729 -0.3918 0.6815
To find Q, the documentation says:
Q[j] = H[j][1] H[j][2] . . . H[j](k), where k = min(m,n).
Each H[j][i] has the form
H[j][i] = I - tau[j] * v * v'
where tau[j] is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in Aarray[j][i+1:m,i],
and tau in TauArray[j][i]
So for H1, I did:
v1=[1; Array(1,2); Array(1,3); Array(1,4)]
v1=[1; 0.5243; 0.4660; 0.5243]
H1=eye(4)-tau(1)*v1*v1'
H1 =
-0.1319 -0.5935 -0.5275 -0.5935
-0.5935 0.6889 -0.2766 -0.3111
-0.5275 -0.2766 0.7542 -0.2766
-0.5935 -0.3111 -0.2766 0.6889
But for H2, I tried:
v2=[0; 1; -0.2766; -0.3111]
H2=eye(4)-tau(1)*v2*v2'
but I cannot get the correct result.
I am a beginner with CUDA and cuBLAS. Can you help me?
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Error-check helper for cuBLAS calls
#define cublascall(call)                                                \
    do {                                                                \
        cublasStatus_t status = (call);                                 \
        if (status != CUBLAS_STATUS_SUCCESS) {                          \
            fprintf(stderr, "cuBLAS error %d at %s:%d\n",               \
                    (int)status, __FILE__, __LINE__);                   \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

int main(int argc, char* argv[])
{
    // A' laid out row by row here, i.e. the column-major image of A passed to cuBLAS
    double h_A[4*4] = { 2,  9,  8,  9,
                       10,  2, 10, 10,
                       10,  5,  7,  7,
                        5, 10,  1,  8};
    int batch_count = 2;
    int m = 4;
    int n = 4;
    int ltau = 4; // ltau = max(1, min(m, n))

    // Host-side copies of each matrix and tau vector in the batch
    double **Aarray, **Tauarray;
    Aarray = (double**)malloc(batch_count*sizeof(double*));
    Tauarray = (double**)malloc(batch_count*sizeof(double*));
    for(int i = 0; i < batch_count; i++) {
        Aarray[i] = (double*)malloc(m*n*sizeof(double));
        Tauarray[i] = (double*)malloc(ltau*sizeof(double));
    }

    // Create host pointer arrays to device matrix storage
    double **d_Aarray, **d_Tauarray, **h_d_Array, **h_d_Tauarray;
    h_d_Array = (double**)malloc(batch_count*sizeof(double*));
    h_d_Tauarray = (double**)malloc(batch_count*sizeof(double*));
    for(int i = 0; i < batch_count; i++) {
        cudaMalloc((void**)&h_d_Array[i], m*n*sizeof(double));
        cudaMalloc((void**)&h_d_Tauarray[i], ltau*sizeof(double));
    }

    // Copy the host arrays of device pointers to the device
    cudaMalloc((void**)&d_Aarray, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_Tauarray, batch_count*sizeof(double*));
    cudaMemcpy(d_Aarray, h_d_Array, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Tauarray, h_d_Tauarray, batch_count*sizeof(double*), cudaMemcpyHostToDevice);

    // Fill every matrix of the batch with h_A
    int index;
    for(int k = 0; k < batch_count; k++) {
        for(int j = 0; j < m; j++) {
            for(int i = 0; i < n; i++) {
                index = j*n + i;
                (Aarray[k])[index] = h_A[index];
            } // i
        } // j
    } // k

    // Create cuBLAS instance
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasStatus_t stat;

    // Set input matrices on device
    for(int i = 0; i < batch_count; i++) {
        cublascall(cublasSetMatrix(m, n, sizeof(double), Aarray[i], m, h_d_Array[i], m));
        cublascall(cublasSetVector(ltau, sizeof(double), Tauarray[i], 1, h_d_Tauarray[i], 1));
    }

    // Batched QR: on exit each d_Aarray[i] holds R in its upper triangle and the
    // Householder vectors below the diagonal; d_Tauarray[i] holds the scalars tau
    int info;
    int lda = m;
    stat = cublasDgeqrfBatched(handle, m, n, d_Aarray, lda, d_Tauarray, &info, batch_count);
    if (stat != CUBLAS_STATUS_SUCCESS)
        printf("\n cublasDgeqrfBatched failed");

    // Retrieve result matrices from device
    for(int i = 0; i < batch_count; i++) {
        cublascall(cublasGetMatrix(m, n, sizeof(double), h_d_Array[i], m, Aarray[i], m));
        cublascall(cublasGetVector(ltau, sizeof(double), h_d_Tauarray[i], 1, Tauarray[i], 1));
    }

    // Clean up resources
    for(int i = 0; i < batch_count; i++) {
        free(Aarray[i]);
        free(Tauarray[i]);
        cudaFree(h_d_Array[i]);
        cudaFree(h_d_Tauarray[i]);
    }
    free(Aarray);
    free(Tauarray);
    free(h_d_Array);
    free(h_d_Tauarray);
    cudaFree(d_Aarray);
    cudaFree(d_Tauarray);
    cublascall(cublasDestroy(handle));
}
Answer 0 (score: 1):
Increment the tau index for each H: use Tau(1) for H1, Tau(2) for H2, and so on.
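For reference, a minimal MATLAB sketch of the full reconstruction, using the Array and Tauarray values printed in the question (M and Tau are just local names used here; the rows printed above are taken as the column-major columns of the stored matrix, so reflector i sits below the diagonal of column i and is paired with Tau(i)):

% cublasDgeqrfBatched output, interpreted in column-major order
M = [-15.1658 -13.7151 -12.1326 -11.8688;
       0.5243  10.7655   7.7656  -0.2585;
       0.4660   0.0496   3.9365   5.3365;
       0.5243   0.1148   0.4519   4.5371];
Tau = [1.1319; 1.9692; 1.6609; 0.0000];

[m, n] = size(M);
Q = eye(m);
for i = 1:min(m, n)
    v = [zeros(i-1,1); 1; M(i+1:m, i)];   % reflector i, stored below the diagonal of column i
    H = eye(m) - Tau(i) * (v * v');       % H_i = I - tau_i * v_i * v_i'
    Q = Q * H;                            % Q = H_1 * H_2 * ... * H_k
end
R = triu(M);                              % upper triangle of the output is R

With these values, Q and R match Matlab's [Q,R] = qr(A') shown in the question. In particular, following the documentation quoted above, the vector for H2 is read from column 2 of Array, v2 = [0; 1; 0.0496; 0.1148], and it goes with Tau(2) = 1.9692.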