Conflict between cublasDgetrfBatched and cublasDtrsmBatched when solving an array of linear systems with cuBLAS

Asked: 2017-10-24 19:13:16

Tags: cuda gpgpu gpu-programming cublas

I have a large number of dense linear systems that I want to solve with the cuBLAS batched API. My plan is to

  1. do the batched LU factorization with cublasDgetrfBatched, and then

  2. solve the batched lower-triangular and the batched upper-triangular systems one after the other with cublasDtrsmBatched.

  3. The code is:

     #include<stdio.h>
     #include<stdlib.h>
     #include<cuda_runtime.h> 
     #include<device_launch_parameters.h>
     #include<cublas_v2.h>
    
     const int N = 32;
     const int Nmatrices = N;
    
     __global__ void initiate(double *d_A, double *d_B)
        {
         int i = threadIdx.x;       int j = blockIdx.x;
    
         int id = j*N*N + i*N;      int idb = j*N + i;  
    
         for(int k = 0; k< N ; k++)
             {
            d_A[id + k] = 0.0;
    
            if(k == i-2)    d_A[id + k] = 1.0; 
            if(k == i-1)    d_A[id + k] = 2.0; 
            if(k == i)      d_A[id + k] = 8.0;
            if(k == i+1)    d_A[id + k] = 2.0;
            if(k == i+2)    d_A[id + k] = 1.0;  
            } 
         d_B[idb] = 8.0;   
        }
    
    
    int main()
     {
        cublasHandle_t handle;      cublasSafeCall(cublasCreate(&handle));
    
    // Allocate device space for the input matrices
      double *d_A_sys; cudaMalloc((void**)&d_A_sys, N*N*Nmatrices*sizeof(double));
      double *d_B_sys; cudaMalloc((void**)&d_B_sys, N*Nmatrices  *sizeof(double));
    
    // Allocate host space for the solution
      double *h_B_sys = (double *)malloc(N*Nmatrices*sizeof(double));
    
    // kernel to initialize d_A_sys and d_B_sys
      initiate<<<Nmatrices, N>>>(d_A_sys, d_B_sys);
    
    //Creating the array of pointers needed as input/output to the batched getrf
      double **h_A_pointers = (double **)malloc(Nmatrices*sizeof(double *));
      for (int i = 0; i < Nmatrices; i++) h_A_pointers[i] = d_A_sys + i*N*N;
    
      double **h_b_pointers = (double **)malloc(Nmatrices*sizeof(double *));
      for (int i = 0; i < Nmatrices; i++) h_B_pointers[i] = d_B_sys + i*N;
    
      double **d_A_pointers;
      cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));
      cudaMemcpy(d_A_pointers, h_A_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);
    
    
      double **d_b_pointers;
      cudaMalloc((void**)&d_b_pointers, Nmatrices*sizeof(double *));
      cudaMemcpy(d_b_pointers, h_b_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);
    
      int *d_InfoArrays; cudaMalloc((void**)&d_InfoArrays,  Nmatrices*sizeof(int));
      int *h_InfoArrays = (int *)malloc(Nmatrices*sizeof(int));
    
    //Batched LU decomposition
       cublasDgetrfBatched(handle, N, d_A_pointers, N, NULL, d_InfoArrays, Nmatrices));
    
      //Batched Lower triangular part
      cublasDtrsmBatched(handle, 
                       CUBLAS_SIDE_LEFT, 
                       CUBLAS_FILL_MODE_LOWER,
                       CUBLAS_OP_N,
                       CUBLAS_DIAG_UNIT,
                       N,
                       N,
                       &alpha,
                       d_A_pointers,
                       N,
                       d_b_pointers,
                       N,
                       Nmatrices);
    
      //Batched Upper triangular part
      cublasDtrsmBatched(handle,
                       CUBLAS_SIDE_LEFT,
                       CUBLAS_FILL_MODE_UPPER,
                       CUBLAS_OP_N,
                       CUBLAS_DIAG_NON_UNIT,
                       N,
                       N,
                       &alpha,
                       d_A_pointers,
                       N,
                       d_b_pointers,
                       N,
                       Nmatrices);
    
      cudaMemcpy(h_B_sys, d_B_sys, N*Nmatrices*sizeof(double), cudaMemcpyDeviceToHost);
      printf("Print out the solutions \n");
    
      cublasDestroy(handle);
      gpuErrchk(cudaDeviceReset());
      return 0;
     }
    

Both cublasDgetrfBatched and cublasDtrsmBatched take d_A_pointers, the array of pointers to the matrices, but they expect it with different const qualification, and when I build this the latter gives me a compilation error like { {3}}:

How can I overcome this problem? Any help is appreciated.

1 Answer:

Answer 0 (score: 1)

You can work around the const-correctness problem by doing something like this:

const double **d_A_pointers;
cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));

....

//Batched LU decomposition
cublasDgetrfBatched(handle, N, const_cast<double**>(d_A_pointers), N, NULL, d_InfoArrays, Nmatrices);

i.e. cast away the const-ness within the call to cublasDgetrfBatched. With d_A_pointers declared as const double **, the cublasDtrsmBatched calls further down can then take it directly.
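As a side note, here is a sketch of an equivalent workaround (an assumption on my part, not what the answer above shows): keep double **d_A_pointers exactly as declared in the question, pass it to cublasDgetrfBatched unchanged, and add the const qualification only where cublasDtrsmBatched needs it (const_cast can add const as well as remove it). Variable names are taken from the question's listing, and alpha is assumed to be declared as const double alpha = 1.0;:

// Equivalent alternative: add const at the trsm call site instead of removing it at getrf
double **d_A_pointers;
cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));

// ...

cublasDgetrfBatched(handle, N, d_A_pointers, N, NULL, d_InfoArrays, Nmatrices);

cublasDtrsmBatched(handle,
        CUBLAS_SIDE_LEFT,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_UNIT,
        N,
        1,                  // RHS width; see the dimension note below
        &alpha,
        const_cast<const double**>(d_A_pointers),
        N,
        d_b_pointers,
        N,
        Nmatrices);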

The other thing that is obviously wrong in the code you have posted is the dimensions of the inputs in the cublasDtrsmBatched calls. I believe you want something like:

//Batched Lower triangular part
cublasDtrsmBatched(handle, 
        CUBLAS_SIDE_LEFT, 
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_UNIT,
        N,
        1,
        &alpha,
        d_A_pointers,
        N,
        d_b_pointers,
        N,
        Nmatrices);

i.e. the input size of the RHS matrix in your example is Nx1, not NxN (you are not solving N RHS problems for each factorized matrix, only one).
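The same Nx1 correction applies to the upper-triangular solve. A sketch of that second call, reusing the question's variable names and assuming alpha has been declared (the posted code passes &alpha but never defines it):

// Assumed declaration, missing from the question's listing:
const double alpha = 1.0;

// Batched upper-triangular part, again with an Nx1 RHS per system
cublasDtrsmBatched(handle,
        CUBLAS_SIDE_LEFT,
        CUBLAS_FILL_MODE_UPPER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_NON_UNIT,
        N,
        1,
        &alpha,
        d_A_pointers,
        N,
        d_b_pointers,
        N,
        Nmatrices);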

There might be other problems in your code (keep in mind that cuBLAS, like most reference BLAS implementations, expects inputs in column-major order by default), and the code you posted in your question does not actually compile for other reasons, so it is impossible to say anything more definitive.
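One of those other reasons is that the posted code calls cublasSafeCall and gpuErrchk without ever defining them. A minimal sketch of what such error-checking helpers might look like; these definitions are an assumption, not part of the original question or answer:

// Hypothetical error-checking helpers matching the names used in the question's code.
// gpuErrchk wraps CUDA runtime calls, cublasSafeCall wraps cuBLAS status codes.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define gpuErrchk(call) { gpuAssert((call), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(1);
    }
}

#define cublasSafeCall(call) { cublasAssert((call), __FILE__, __LINE__); }
inline void cublasAssert(cublasStatus_t status, const char *file, int line)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "CUBLASassert: status %d %s %d\n", (int)status, file, line);
        exit(1);
    }
}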