Cublas中的批量QR功能的结果

时间:2015-07-24 18:24:37

标签: cuda cublas

我在Cublas中使用cublasDgeqrfBatched来对许多小矩阵进行QR分解。 例如,我使用4x4矩阵A

 A=  2     9     8     9
    10     2    10    10
    10     5     7     7
     5    10     1     8

cublasDgeqrfBatched的输出 Array[1](batchsize=1):

Array=-15.1658    0.5243    0.4660    0.5243
      -13.7151   10.7655    0.0496    0.1148
      -12.1326    7.7656    3.9365    0.4519
      -11.8688   -0.2585    5.3365    4.5371

Tauarray

Tauarray[4]=1.1319
            1.9692
            1.6609
            0.0000

Array的下半部分是指R(列主存储器中)。这被Matlab检查为:

`[Q,R]=qr(A')` gives:
    R =

  -15.1658  -13.7151  -12.1326  -11.8688 
         0   10.7655    7.7656   -0.2585 
         0         0    3.9365    5.3365 
         0         0         0    4.5371 

   Q =

   -0.1319    0.7609    0.6329    0.0560
   -0.5934   -0.5703    0.5661   -0.0467
   -0.5275    0.2569   -0.3543   -0.7282
   -0.5934    0.1729   -0.3918    0.6815

要查找Q,文档中提到:

Q[j] = H[j][1] H[j][2] . . . H[j](k), where k = min(m,n).
Each H[j][i] has the form
H[j][i] = I - tau[j] * v * v'
where tau[j] is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in Aarray[j][i+1:m,i],
and tau in TauArray[j][i]

因此对于H1,我做了:

v1=[1; Array(1,2); Array(1,3); Array(1,4)]
v1=[1;  0.5243;  0.4660;   0.5243]

H1=eye(4)-tau(1)*v1*v1'

 H1 =

   -0.1319   -0.5935   -0.5275   -0.5935
   -0.5935    0.6889   -0.2766   -0.3111
   -0.5275   -0.2766    0.7542   -0.2766
   -0.5935   -0.3111   -0.2766    0.6889

但对于H2,我试过了:

v2=[0; 1; -0.2766;  -0.3111]
H2=eye(4)-tau(1)*v2*v2'

但找不到正确的结果。我是CUDA和cuBLAS的初学者。你能帮帮我吗?这是我的代码:

int main(int argc, char* argv[])
{
    // Input 4x4 test matrix. cuBLAS is column-major, so this initializer is
    // consumed column-by-column (column 0 = {2,9,8,9}, ...). That is why the
    // factorization matches Matlab's qr(A') rather than qr(A).
    double h_A[4*4] = { 2,  9,  8,  9,
                       10,  2, 10, 10,
                       10,  5,  7,  7,
                        5, 10,  1,  8};

    int batch_count = 2;  // number of matrices factored in one batched call
    int m = 4;            // rows per matrix
    int n = 4;            // columns per matrix
    int ltau = 4;         // ltau = max(1, min(m,n)): Householder scalars per matrix

    // Host-side storage: one matrix buffer and one tau buffer per batch entry.
    double **Aarray, **Tauarray;
    Aarray   = (double**)malloc(batch_count*sizeof(double*));
    Tauarray = (double**)malloc(batch_count*sizeof(double*));
    for (int i = 0; i < batch_count; i++) {
        Aarray[i] = (double*)malloc(m*n*sizeof(double));
        // calloc instead of malloc: Tauarray is copied to the device before
        // the factorization runs, so avoid uploading uninitialized bytes.
        Tauarray[i] = (double*)calloc(ltau, sizeof(double));
    }

    // Host arrays of DEVICE pointers (one device buffer per batch entry).
    double **d_Aarray, **d_Tauarray, **h_d_Array, **h_d_Tauarray;
    h_d_Array    = (double**)malloc(batch_count*sizeof(double*));
    h_d_Tauarray = (double**)malloc(batch_count*sizeof(double*));
    for (int i = 0; i < batch_count; i++) {
        cudaMalloc((void**)&h_d_Array[i],    m*n*sizeof(double));
        cudaMalloc((void**)&h_d_Tauarray[i], ltau*sizeof(double));
    }

    // The batched API expects a device-resident array of device pointers;
    // copy the pointer arrays themselves to the device.
    cudaMalloc((void**)&d_Aarray,   batch_count*sizeof(double*));
    cudaMalloc((void**)&d_Tauarray, batch_count*sizeof(double*));
    cudaMemcpy(d_Aarray,   h_d_Array,    batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Tauarray, h_d_Tauarray, batch_count*sizeof(double*), cudaMemcpyHostToDevice);

    // Fill every batch entry with the same test matrix.
    for (int k = 0; k < batch_count; k++) {
        for (int j = 0; j < m; j++) {
            for (int i = 0; i < n; i++) {
                int index = j*n + i;
                Aarray[k][index] = h_A[index];
            } // i
        } // j
    } // k

    // Create the cuBLAS handle.
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasStatus_t stat;

    // Upload the input matrices (and the zeroed tau scratch) to the device.
    for (int i = 0; i < batch_count; i++) {
        cublascall(cublasSetMatrix(m, n, sizeof(double), Aarray[i], m, h_d_Array[i], m));
        cublascall(cublasSetVector(ltau, sizeof(double), Tauarray[i], 1, h_d_Tauarray[i], 1));
    }

    // Batched QR: on exit each A[i] holds R in its upper triangle and the
    // Householder vectors v below the diagonal; tau[i] holds the scalars.
    int info = 0;
    int lda = m;
    // BUG FIX: pass batch_count (was hard-coded 2) so the batch size always
    // matches the number of buffers allocated above.
    stat = cublasDgeqrfBatched(handle, m, n, d_Aarray, lda, d_Tauarray, &info, batch_count);
    if (stat != CUBLAS_STATUS_SUCCESS)
        printf("\n cublasDgeqrfBatched failed");
    if (info != 0)  // info < 0 means parameter number -info was invalid
        printf("\n cublasDgeqrfBatched: parameter %d is invalid", -info);

    // Download the factored matrices and tau vectors back to the host.
    for (int i = 0; i < batch_count; i++) {
        cublascall(cublasGetMatrix(m, n, sizeof(double), h_d_Array[i], m, Aarray[i], m));
        cublascall(cublasGetVector(ltau, sizeof(double), h_d_Tauarray[i], 1, Tauarray[i], 1));
    }

    // Clean up resources.
    // BUG FIX: Tauarray[i] and Tauarray were previously leaked.
    for (int i = 0; i < batch_count; i++) {
        free(Aarray[i]);
        free(Tauarray[i]);
        cudaFree(h_d_Array[i]);
        cudaFree(h_d_Tauarray[i]);
    }
    free(Aarray);
    free(Tauarray);
    free(h_d_Array);
    free(h_d_Tauarray);

    cudaFree(d_Aarray);
    cudaFree(d_Tauarray);

    cublascall(cublasDestroy(handle));
    return 0;
}

1 个答案:

答案 0 :(得分:1)

对每个H递增tau的索引。例如,对于H1使用Tau(1),对于H2使用Tau(2),依此类推。