Solving many "small/medium" linear systems with CUDA

Asked: 2017-07-22 21:48:00

Tags: matlab cuda gpu linear-algebra gpgpu

Some background on the problem I am trying to speed up with CUDA:

I need to solve a large number of small/medium-sized linear systems independently. Each system is square, real, dense, invertible, and non-symmetric. These are actually matrix systems, so each one has the form AX = B, where A, X, and B are all (n x n) matrices.

In a previous question, CUBLAS batch and matrix sizes, I learned that cuBLAS batched operations give the best performance for matrices of size 100x100 or smaller.

I still have a problem because my matrices are in the range 100 < n < 700. So the matrices are of moderate size: the cuBLAS batched operations do not give the best performance for them, and the regular (non-batched) dense solver routines (cusolverDnDgetrf, cusolverDnDgetrs) are not faster than MATLAB either (see the timings below).
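For reference, the batched route I am referring to is cuBLAS's getrfBatched/getrsBatched interface. A minimal sketch of one batched solve looks roughly like this; it is not my actual code: d_Aptrs and d_Bptrs are hypothetical device arrays of device pointers (one (n x n) matrix per system), the pivot/info buffers are assumed pre-allocated, and error checking is omitted.

#include "cublas_v2.h"

// Sketch of a batched LU solve (assumed setup, not the timed code further down):
// d_Aptrs/d_Bptrs are DEVICE arrays of device pointers, one (n x n) matrix each;
// d_pivots holds n*batchSize ints, d_infoArray holds batchSize ints (both on device).
void batchedLUSolve(cublasHandle_t handle, int n, int batchSize,
                    double **d_Aptrs, double **d_Bptrs,
                    int *d_pivots, int *d_infoArray)
{
    int h_info = 0;

    // factor every A_i in a single call; each A_i is overwritten with its LU factors
    cublasDgetrfBatched(handle, n, d_Aptrs, n, d_pivots, d_infoArray, batchSize);

    // solve every system in a single call; nrhs = n because each B_i is an (n x n) matrix.
    // Each B_i is overwritten with the solution X_i. Note that the info argument of
    // getrsBatched is a HOST pointer, unlike the device infoArray of getrfBatched.
    cublasDgetrsBatched(handle, CUBLAS_OP_N, n, n,
                        (const double *const *)d_Aptrs, n, d_pivots,
                        d_Bptrs, n, &h_info, batchSize);
}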

I did some timing against MATLAB for solving a single system and found that the regular routines only beat MATLAB for matrices of size (4096x4096) or larger. For each n = 64, 256, 512, 1024, 4096, 16384 I generate a random (n x n) matrix and time only the factorization and back/forward solve, with no transfers over PCIe.

DOUBLE PRECISION, CUDA (GTX 1080 Ti) vs MATLAB (backslash):

(GPU) 64:    0.001157 sec      (MATLAB) 64:    0.000205 sec
(GPU) 256:   0.01161 sec       (MATLAB) 256:   0.007762 sec
(GPU) 512:   0.026348 sec      (MATLAB) 512:   0.008550 sec
(GPU) 1024:  0.064357 sec      (MATLAB) 1024:  0.036280 sec
(GPU) 4096:  0.734908 sec      (MATLAB) 4096:  1.174442 sec
(GPU) 16384: 32.962229 sec     (MATLAB) 16384: 68.691236 sec

These timings lead me to conclude that iterating over my matrices one at a time with the non-batched factorization/solve routines would be slower than MATLAB. Also, per CUBLAS batch and matrix sizes, the batched cuBLAS routines will not perform well for my medium-sized matrices.

Is there another approach I should consider for speeding up this code with CUDA, or am I misunderstanding something? The code I am timing, adapted from NVIDIA's cuSolverDn_LinearSolver sample, follows.

/*  How to use
 *      ./cuSolverDn_LinearSolver                      // Default: cholesky
 *      ./cuSolverDn_LinearSolver -R=chol -file<file>  // cholesky factorization
 *      ./cuSolverDn_LinearSolver -R=lu -file<file>    // LU with partial pivoting
 *      ./cuSolverDn_LinearSolver -R=qr -file<file>    // QR factorization
 *
 *  Remark: the absolute error on solution x is meaningless without knowing condition number of A.
 *      The relative error on residual should be close to machine zero, i.e. 1.e-15.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include <cuda_runtime.h>

#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"

#include "helper_cusolver.h"
int linearSolverLU(
    cusolverDnHandle_t handle,
    int n,
    const double *Acopy,
    int lda,
    const double *b,
    double *x)
{
    int bufferSize = 0;
    int *info = NULL;
    double *buffer = NULL;
    double *A = NULL;
    int *ipiv = NULL; // pivoting sequence
    int h_info = 0;
    double start, stop;
    double time_solve;

    checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize));

    checkCudaErrors(cudaMalloc(&info, sizeof(int)));
    checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
    checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
    checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));


    // prepare a copy of A because getrf will overwrite A with its LU factors
    checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaMemset(info, 0, sizeof(int)));

    start = second();
    start = second(); // intentionally called twice, as in the original NVIDIA sample (timer warm-up)

    checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
    checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));

    if ( 0 != h_info ){
        fprintf(stderr, "Error: LU factorization failed\n");
    }

    // b is an n-by-n matrix here (n right-hand sides), so copy all of it into x
    // and solve every column at once (the sample's original single-RHS lines are kept commented out)
    //checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
    //checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
    checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, n, A, lda, ipiv, x, n, info));
    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();

    time_solve = stop - start;
    fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);

    if (info  ) { checkCudaErrors(cudaFree(info  )); }
    if (buffer) { checkCudaErrors(cudaFree(buffer)); }
    if (A     ) { checkCudaErrors(cudaFree(A)); }
    if (ipiv  ) { checkCudaErrors(cudaFree(ipiv));}

    return 0;
}

void generate_random_dense_matrix(int M, int N, double **outA)
{
    int i, j;
    double rMax = (double)RAND_MAX;
    double *A   = (double *)malloc(sizeof(double) * M * N);

    // For each column
    for (j = 0; j < N; j++)
    {
        // For each row
        for (i = 0; i < M; i++)
        {
            double dr = (double)rand();
            A[j * M + i] = (dr / rMax) * 100.0;
            //printf("A[j * M + i] = %f \n", A[j * M + i]);
        }
    }

    *outA = A;
}

int main (int argc, char *argv[])
{
    struct testOpts opts;
    cusolverDnHandle_t handle = NULL;
    cublasHandle_t cublasHandle = NULL; // used in residual evaluation
    cudaStream_t stream = NULL;

    int rowsA = 0; // number of rows of A
    int colsA = 0; // number of columns of A
    int nnzA  = 0; // number of nonzeros of A
    int baseA = 0; // base index in CSR format
    int lda   = 0; // leading dimension in dense matrix

    // CSR(A) from I/O
    int *h_csrRowPtrA = NULL;
    int *h_csrColIndA = NULL;
    double *h_csrValA = NULL;

    double *h_A = NULL; // dense matrix from CSR(A)
    double *h_x = NULL; // a copy of d_x
    double *h_b = NULL; // b = ones(m,1)
    double *h_r = NULL; // r = b - A*x, a copy of d_r

    double *d_A = NULL; // a copy of h_A
    double *d_x = NULL; // x = A \ b
    double *d_b = NULL; // a copy of h_b
    double *d_r = NULL; // r = b - A*x

    // the constants are used in residual evaluation, r = b - A*x
    const double minus_one = -1.0;
    const double one = 1.0;

    double x_inf = 0.0;
    double r_inf = 0.0;
    double A_inf = 0.0;
    int errors = 0;


    // test problem size: a single n-by-n system with n = 660 (inside the 100 < n < 700 range of interest)
    colsA = 660;
    rowsA = colsA;
    int NN = colsA;
    int MM = rowsA;
    lda = rowsA;

    // Generate inputs
    srand(9384);  
    generate_random_dense_matrix(MM, NN, &h_A);
    generate_random_dense_matrix(MM, NN, &h_b);

    parseCommandLineArguments(argc, argv, opts);

    if (NULL == opts.testFunc)
    {
        //opts.testFunc = "chol"; // the sample's default is Cholesky when no -R option is given
        opts.testFunc = "lu";
        //opts.testFunc = "qr";
    }

    findCudaDevice(argc, (const char **)argv);

    /*
    printf("step 1: read matrix market format\n");

    if (opts.sparse_mat_filename == NULL)
    {
        opts.sparse_mat_filename =  sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
        if (opts.sparse_mat_filename != NULL)
            printf("Using default input file [%s]\n", opts.sparse_mat_filename);
        else
            printf("Could not find gr_900_900_crg.mtx\n");
    }
    else
    {
        printf("Using input file [%s]\n", opts.sparse_mat_filename);
    }

    if (opts.sparse_mat_filename == NULL)
    {
        fprintf(stderr, "Error: input matrix is not provided\n");
        return EXIT_FAILURE;
    }

    if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
               &nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true))
    {
        exit(EXIT_FAILURE);
    }
    baseA = h_csrRowPtrA[0]; // baseA = {0,1}

    printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);

    if ( rowsA != colsA )
    {
        fprintf(stderr, "Error: only support square matrix\n");
        exit(EXIT_FAILURE);
    }

    printf("step 2: convert CSR(A) to dense matrix\n");

    lda = opts.lda ? opts.lda : rowsA;
    if (lda < rowsA)
    {
        fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
        exit(EXIT_FAILURE);
    }
    */
    //h_A = (double*)malloc(sizeof(double)*lda*colsA); // h_A and h_b were already allocated by generate_random_dense_matrix
    h_x = (double*)malloc(sizeof(double)*lda*colsA);
    //h_b = (double*)malloc(sizeof(double)*rowsA);
    h_r = (double*)malloc(sizeof(double)*lda*rowsA);
    assert(NULL != h_A);
    assert(NULL != h_x);
    assert(NULL != h_b);
    assert(NULL != h_r);

    /*
    memset(h_A, 0, sizeof(double)*lda*colsA); 
    for(int row = 0 ; row < rowsA ; row++)
    {
        const int start = h_csrRowPtrA[row  ] - baseA;
        const int end   = h_csrRowPtrA[row+1] - baseA;
        for(int colidx = start ; colidx < end ; colidx++)
        {
            const int col = h_csrColIndA[colidx] - baseA;
            const double Areg = h_csrValA[colidx];
            h_A[row + col*lda] = Areg;
        }
    }

    printf("step 3: set right hand side vector (b) to 1\n");
    for(int row = 0 ; row < rowsA ; row++)
    {
        h_b[row] = 1.0;
    }
    */
    // verify if A is symmetric or not.
    if ( 0 == strcmp(opts.testFunc, "chol") )
    {
        int issym = 1;
        for(int j = 0 ; j < colsA ; j++)
        {
            for(int i = j ; i < rowsA ; i++)
            {
                double Aij = h_A[i + j*lda];
                double Aji = h_A[j + i*lda];
                if ( Aij != Aji )
                {
                    issym = 0;
                    break;
                }
            }
        }
        if (!issym)
        {
            printf("Error: A has no symmetric pattern, please use LU or QR \n");
            exit(EXIT_FAILURE);
        }
    }

    checkCudaErrors(cusolverDnCreate(&handle));
    checkCudaErrors(cublasCreate(&cublasHandle));
    checkCudaErrors(cudaStreamCreate(&stream));

    checkCudaErrors(cusolverDnSetStream(handle, stream));
    checkCudaErrors(cublasSetStream(cublasHandle, stream));


    checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
    checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*lda*colsA));
    checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*lda*rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*lda*rowsA));

    printf("step 4: prepare data on device\n");
    checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*lda*rowsA, cudaMemcpyHostToDevice));

    printf("step 5: solve A*x = b \n");
    // d_A and d_b are read-only
    if ( 0 == strcmp(opts.testFunc, "chol") )
    {
        linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
    }
    else if ( 0 == strcmp(opts.testFunc, "lu") )
    {
        //printf("hi \n");
        linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
    }
    else if ( 0 == strcmp(opts.testFunc, "qr") )
    {
        linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
    }
    else
    {
        fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
        exit(EXIT_FAILURE);
    }
    printf("step 6: evaluate residual\n");
    checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToDevice));

    // r = b - A*x
    checkCudaErrors(cublasDgemm_v2(
        cublasHandle,
        CUBLAS_OP_N,
        CUBLAS_OP_N,
        rowsA,
        colsA,
        colsA,
        &minus_one,
        d_A,
        lda,
        d_x,
        rowsA,
        &one,
        d_r,
        rowsA));

    checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*lda*colsA, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToHost));

    // note: with matrix-valued x and r, these norms only cover the first column
    x_inf = vec_norminf(colsA, h_x);
    r_inf = vec_norminf(rowsA, h_r);
    A_inf = mat_norminf(rowsA, colsA, h_A, lda);


    printf("x[0] = %f\n", h_x[0]);
    printf("r[0] = %f\n", h_r[0]);
    printf("|b - A*x| = %E \n", r_inf);
    printf("|A| = %E \n", A_inf);
    printf("|x| = %E \n", x_inf);
    printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));

    if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
    if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
    if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }

    if (h_csrValA   ) { free(h_csrValA); }
    if (h_csrRowPtrA) { free(h_csrRowPtrA); }
    if (h_csrColIndA) { free(h_csrColIndA); }

    if (h_A) { free(h_A); }
    if (h_x) { free(h_x); }
    if (h_b) { free(h_b); }
    if (h_r) { free(h_r); }

    if (d_A) { checkCudaErrors(cudaFree(d_A)); }
    if (d_x) { checkCudaErrors(cudaFree(d_x)); }
    if (d_b) { checkCudaErrors(cudaFree(d_b)); }
    if (d_r) { checkCudaErrors(cudaFree(d_r)); }

    return 0;
}

2 Answers:

Answer 0 (score: 3):

Try using two or more parallel streams on the GPU, one linear system per stream; this may help utilize a larger portion of the GPU. See the sketch below.
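A minimal sketch of that idea, under assumed setup (hypothetical names: d_A[i] holds A_i, d_X[i] starts as a copy of B_i, d_work[i] is a workspace sized via cusolverDnDgetrf_bufferSize, and d_ipiv[i]/d_info[i] are pre-allocated per system; error checks omitted):

#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"

#define NSTREAMS 4

// Solve numSystems independent (n x n) systems, round-robined over NSTREAMS streams.
void solveOnStreams(int n, int numSystems,
                    double **d_A, double **d_X,
                    double **d_work, int **d_ipiv, int **d_info)
{
    cudaStream_t streams[NSTREAMS];
    cusolverDnHandle_t handles[NSTREAMS];

    for (int s = 0; s < NSTREAMS; s++) {
        cudaStreamCreate(&streams[s]);
        cusolverDnCreate(&handles[s]);
        cusolverDnSetStream(handles[s], streams[s]); // one cuSOLVER handle per stream
    }

    for (int i = 0; i < numSystems; i++) {
        int s = i % NSTREAMS; // distribute the systems over the streams
        // the factorization and solve for system i are queued on stream s,
        // so work from different streams can overlap on the GPU
        cusolverDnDgetrf(handles[s], n, n, d_A[i], n, d_work[i], d_ipiv[i], d_info[i]);
        cusolverDnDgetrs(handles[s], CUBLAS_OP_N, n, n, d_A[i], n, d_ipiv[i], d_X[i], n, d_info[i]);
    }
    cudaDeviceSynchronize(); // wait for all streams to drain

    for (int s = 0; s < NSTREAMS; s++) {
        cusolverDnDestroy(handles[s]);
        cudaStreamDestroy(streams[s]);
    }
}

How much actually overlaps depends on how much of the GPU a single factorization already saturates; the profiler mentioned next will show the achieved concurrency.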

For timing measurements and hardware-utilization analysis, use the Visual Profiler instead of CPU-side time measurements.
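For instance, assuming the binary name from the usage comment above, the command-line profiler that ships with the CUDA toolkit gives a quick overview of kernel times and utilization:

    nvprof ./cuSolverDn_LinearSolver -R=lu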

Another point: GTX (consumer) GPUs perform rather poorly in double precision. If you get the chance, try a Tesla GPU.

Answer 1 (score: 0):

MATLAB provides a way to call the cuBLAS batched interface on GPU arrays via pagefun; for example, something like X = pagefun(@mldivide, A, B), with A and B stored as (n x n x k) gpuArrays, solves all k systems in a batch.