Conflict between cublasDgetrfBatched and cublasDtrsmBatched when solving an array of linear systems with cuBLAS

Asked: 2017-10-24 19:13:16

Tags: cuda gpgpu gpu-programming cublas

I have a large number of dense linear systems that I want to solve with the cuBLAS batched API. My plan is to

  1. do the batched LU factorization with cublasDgetrfBatched, and then

  2. solve the batched lower-triangular and the batched upper-triangular systems one after the other with cublasDtrsmBatched.

  3. The code is:

     #include<stdio.h>
     #include<stdlib.h>
     #include<cuda_runtime.h> 
     #include<device_launch_parameters.h>
     #include<cublas_v2.h>
    
     const int N = 32;
     const int Nmatrices = N;
    
     __global__ void initiate(double *d_A, double *d_B)
        {
         int i = threadIdx.x;       int j = blockIdx.x;
    
         int id = j*N*N + i*N;      int idb = j*N + i;  
    
         for(int k = 0; k< N ; k++)
             {
            d_A[id + k] = 0.0;
    
            if(k == i-2)    d_A[id + k] = 1.0; 
            if(k == i-1)    d_A[id + k] = 2.0; 
            if(k == i)      d_A[id + k] = 8.0;
            if(k == i+1)    d_A[id + k] = 2.0;
            if(k == i+2)    d_A[id + k] = 1.0;  
            } 
         d_B[idb] = 8.0;   
        }
    
    
    int main()
     {
        cublasHandle_t handle;      cublasSafeCall(cublasCreate(&handle));
    
    // Allocate device space for the input matrices
      double *d_A_sys; cudaMalloc((void**)&d_A_sys, N*N*Nmatrices*sizeof(double));
      double *d_B_sys; cudaMalloc((void**)&d_B_sys, N*Nmatrices  *sizeof(double));
    
    // Allocate host space for the solution
      double *h_B_sys = (double *)malloc(N*Nmatrices*sizeof(double));
    
    // kernel to initialize d_A_sys and d_B_sys
      initiate<<<Nmatrices, N>>>(d_A_sys, d_B_sys);
    
    //Creating the array of pointers needed as input/output to the batched getrf
      double **h_A_pointers = (double **)malloc(Nmatrices*sizeof(double *));
      for (int i = 0; i < Nmatrices; i++) h_A_pointers[i] = d_A_sys + i*N*N;
    
      double **h_b_pointers = (double **)malloc(Nmatrices*sizeof(double *));
      for (int i = 0; i < Nmatrices; i++) h_B_pointers[i] = d_B_sys + i*N;
    
      double **d_A_pointers;
      cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));
      cudaMemcpy(d_A_pointers, h_A_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);
    
    
      double **d_b_pointers;
      cudaMalloc((void**)&d_b_pointers, Nmatrices*sizeof(double *));
      cudaMemcpy(d_b_pointers, h_b_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);
    
      int *d_InfoArrays; cudaMalloc((void**)&d_InfoArrays,  Nmatrices*sizeof(int));
      int *h_InfoArrays = (int *)malloc(Nmatrices*sizeof(int));
    
    //Batched LU decomposition
       cublasDgetrfBatched(handle, N, d_A_pointers, N, NULL, d_InfoArrays, Nmatrices));
    
      //Batched Lower triangular part
      cublasDtrsmBatched(handle, 
                       CUBLAS_SIDE_LEFT, 
                       CUBLAS_FILL_MODE_LOWER,
                       CUBLAS_OP_N,
                       CUBLAS_DIAG_UNIT,
                       N,
                       N,
                       &alpha,
                       d_A_pointers,
                       N,
                       d_b_pointers,
                       N,
                       Nmatrices);
    
      //Batched Upper triangular part
      cublasDtrsmBatched(handle,
                       CUBLAS_SIDE_LEFT,
                       CUBLAS_FILL_MODE_UPPER,
                       CUBLAS_OP_N,
                       CUBLAS_DIAG_NON_UNIT,
                       N,
                       N,
                       &alpha,
                       d_A_pointers,
                       N,
                       d_b_pointers,
                       N,
                       Nmatrices);
    
      cudaMemcpy(h_B_sys, d_B_sys, N*Nmatrices*sizeof(double), cudaMemcpyDeviceToHost);
      printf("Print out the solutions \n");
    
      cublasDestroy(handle);
      gpuErrchk(cudaDeviceReset());
      return 0;
     }
    

Both cublasDgetrfBatched and cublasDtrsmBatched take d_A_pointers, the array of pointers to the matrices, but they expect it with different const qualification, and when I build this the latter gives me a compilation error like { {3}}:

How can I overcome this problem? Any help is appreciated.

1 Answer:

Answer 0 (score: 1)

You can work around the const-correctness problem by doing something like this:

const double **d_A_pointers;
cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));

....

//Batched LU decomposition
cublasDgetrfBatched(handle, N, const_cast<double**>(d_A_pointers), N, NULL, d_InfoArrays, Nmatrices);

i.e. cast away the const-ness within the call to cublasDgetrfBatched. With d_A_pointers declared as const double **, the cublasDtrsmBatched calls further down can then take it directly.
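As a side note, here is a sketch of an equivalent workaround (an assumption on my part, not what the answer above shows): keep double **d_A_pointers exactly as declared in the question, pass it to cublasDgetrfBatched unchanged, and add the const qualification only where cublasDtrsmBatched needs it (const_cast can add const as well as remove it). Variable names are taken from the question's listing, and alpha is assumed to be declared as const double alpha = 1.0;:

// Equivalent alternative: add const at the trsm call site instead of removing it at getrf
double **d_A_pointers;
cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));

// ...

cublasDgetrfBatched(handle, N, d_A_pointers, N, NULL, d_InfoArrays, Nmatrices);

cublasDtrsmBatched(handle,
        CUBLAS_SIDE_LEFT,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_UNIT,
        N,
        1,                  // RHS width; see the dimension note below
        &alpha,
        const_cast<const double**>(d_A_pointers),
        N,
        d_b_pointers,
        N,
        Nmatrices);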

The other thing that is obviously wrong in the code you have posted is the dimensions of the inputs in the cublasDtrsmBatched calls. I believe you want something like:

//Batched Lower triangular part
cublasDtrsmBatched(handle, 
        CUBLAS_SIDE_LEFT, 
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_UNIT,
        N,
        1,
        &alpha,
        d_A_pointers,
        N,
        d_b_pointers,
        N,
        Nmatrices);

i.e. the input size of the RHS matrix in your example is Nx1, not NxN (you are not solving N RHS problems for each factorized matrix, only one).
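The same Nx1 correction applies to the upper-triangular solve. A sketch of that second call, reusing the question's variable names and assuming alpha has been declared (the posted code passes &alpha but never defines it):

// Assumed declaration, missing from the question's listing:
const double alpha = 1.0;

// Batched upper-triangular part, again with an Nx1 RHS per system
cublasDtrsmBatched(handle,
        CUBLAS_SIDE_LEFT,
        CUBLAS_FILL_MODE_UPPER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_NON_UNIT,
        N,
        1,
        &alpha,
        d_A_pointers,
        N,
        d_b_pointers,
        N,
        Nmatrices);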

There might be other problems in your code (keep in mind that cuBLAS, like most reference BLAS implementations, expects inputs in column-major order by default), and the code you posted in your question does not actually compile for other reasons, so it is impossible to say anything more definitive.
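One of those other reasons is that the posted code calls cublasSafeCall and gpuErrchk without ever defining them. A minimal sketch of what such error-checking helpers might look like; these definitions are an assumption, not part of the original question or answer:

// Hypothetical error-checking helpers matching the names used in the question's code.
// gpuErrchk wraps CUDA runtime calls, cublasSafeCall wraps cuBLAS status codes.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define gpuErrchk(call) { gpuAssert((call), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(1);
    }
}

#define cublasSafeCall(call) { cublasAssert((call), __FILE__, __LINE__); }
inline void cublasAssert(cublasStatus_t status, const char *file, int line)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "CUBLASassert: status %d %s %d\n", (int)status, file, line);
        exit(1);
    }
}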