cublasDtrsm after LU factorization with pivoting

Date: 2015-07-27 14:07:18

Tags: cuda lapack cublas

I am facing a small problem. I have to solve a linear system A * x = b.

The matrix A is factorized by an LU decomposition (LAPACK). As the result I get the factorized matrix together with the pivot array. After that I want to solve the two triangular systems L * y = b and U * x = y on the GPU with cublasDtrsm. But because of the row interchanges done by dgetrf (LAPACK) I would have to pass the pivot array to cublas, and the cublasDtrsm function does not offer anything for this. Without the pivot array I get wrong results.

I already looked into disabling pivoting in LAPACK, but with regard to numerical stability that is not an option. Is there any hint on how to solve a linear system of equations with an LU factorization?

2 answers:

Answer 0: (score: 0)

If you want to use this particular approach (cublas trsm after LAPACK getrf), I believe you should be able to use cublas trsm with the L,U output of LAPACK getrf by rearranging your b vector (or matrix) to match the rearrangement order performed by LAPACK during pivoting. I believe this order is given by the ipiv description in the LAPACK documentation:

  

    IPIV    is INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

For example, with n = 3 and IPIV = {3,3,3}, b is rearranged by swapping b(1)↔b(3) and then b(2)↔b(3): the interchanges are applied sequentially, not as a one-shot permutation. Here is a sample code that demonstrates the idea for a simple 3x3 test case with a single RHS vector:

$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define cudacall(call)                                                                                                          \
    do                                                                                                                          \
    {                                                                                                                           \
        cudaError_t err = (call);                                                                                               \
        if(cudaSuccess != err)                                                                                                  \
        {                                                                                                                       \
            fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err));    \
            cudaDeviceReset();                                                                                                  \
            exit(EXIT_FAILURE);                                                                                                 \
        }                                                                                                                       \
    }                                                                                                                           \
    while (0)

#define cublascall(call)                                                                                        \
    do                                                                                                          \
    {                                                                                                           \
        cublasStatus_t status = (call);                                                                         \
        if(CUBLAS_STATUS_SUCCESS != status)                                                                     \
        {                                                                                                       \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);     \
            cudaDeviceReset();                                                                                  \
            exit(EXIT_FAILURE);                                                                                 \
        }                                                                                                       \
                                                                                                                \
    }                                                                                                           \
    while(0)


void LU_device(float *src_d, int n, int *pivot)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;

    cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = n;

    float *A[] = { src_d };
    float **A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(pivot,P,n*batchSize*sizeof(int),cudaMemcpyDeviceToHost));
#ifdef DEBUG_PRINT
    for (int qq = 0; qq < n*batchSize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); }
#endif

    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
    cudaFree(P); cudaFree(INFO); cudaFree(A_d); cublasDestroy(handle);
}

void LU(float* src, float* L, float *U, int n, int *pivot)
{
    float *src_d;

    cudacall(cudaMalloc<float>(&src_d, n*n * sizeof(float)));
    cudacall(cudaMemcpy(src_d,src,n*n * sizeof(float),cudaMemcpyHostToDevice));

    LU_device(src_d,n,pivot);

    cudacall(cudaMemcpy(L,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(U,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
    for (int i = 0; i < n; i ++){
      for (int j = 0; j < i; j++)   L[i*n+j] = 0.0;
      for (int j = i+1; j < n; j++) U[i*n+j] = 0.0;}

    cudaFree(src_d);
}

void rearrange(float *vec, int *pivot, int n, int dir){
#define DIR_FORWARD 0
#define DIR_REVERSE 1
#define SWAP(x,y) {float swaptmp=(*(y)); (*(y))=(*(x)); (*(x))=swaptmp;}
  if (dir == DIR_FORWARD)
    for (int i = 0; i < n; i++)    SWAP((vec+i),(vec+pivot[i]-1))
  else
    for (int i = n-1; i >= 0; i--) SWAP((vec+i),(vec+pivot[i]-1))
}


void TRSM(float *A, float *x, float *b, int n, cublasFillMode_t uplo, cublasDiagType_t diagt ){

    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));
    float *A_d, *b_d;
    cudacall(cudaMalloc<float>(&A_d, n*n * sizeof(float)));
    cudacall(cudaMalloc<float>(&b_d, n * sizeof(float)));
    cudacall(cudaMemcpy(b_d, b,   n*sizeof(float), cudaMemcpyHostToDevice));
    cudacall(cudaMemcpy(A_d, A, n*n*sizeof(float), cudaMemcpyHostToDevice));
    const float alpha = 1.0f;
    cublascall(cublasStrsm(handle, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt, n, 1, &alpha, A_d, n, b_d, n));
    cudacall(cudaMemcpy(x, b_d, n*sizeof(float), cudaMemcpyDeviceToHost));
    cudaFree(A_d); cudaFree(b_d); cublasDestroy(handle);
}

void test_solve()
{
 // solve Ax=b
 // 1. Perform LU on A
 // 2. using pivot sequence, rearrange b -> b'
 // 3. perform TRSM on Ly=b'
 // 4. perform TRSM on Ux=y
 // A = |0 1  4 |
 //     |3 3  9 |
 //     |4 10 16|
 // x = |1|
 //     |2|
 //     |3|
 // b = |14|
 //     |36|
 //     |72|

    const int n = 3;

// has 3,2,3 pivot order
    float          A_col_major[n*n] = { 0, 3, 4,
                                        1, 3, 10,
                                        4, 9, 16 };
    float b1[n] = {14, 36, 72};
/* another example - has 3,3,3 pivot order
    float          A_transpose[n*n] = { 0, 1,  4,
                                        3, 3,  9,
                                        4, 10, 16 };
    float b2[n] = {18, 37, 70};
*/
    float result_x[n];
    int pivot[n];
    float L[n*n];
    float U[n*n];
    float y[n];

    //Select matrix by setting "a"
    float *a = A_col_major;
    float *b = b1;

    printf("Input:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            printf("%f\t",a[i*n+j]);
        printf("\n");
    }

    printf("\n\n");
// 1. LU on A
    LU(a,L,U,n,pivot);
#ifdef DEBUG_PRINT
    printf("L:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            printf("%f\t",L[i*n+j]);
        printf("\n");
    }

    printf("\n\n");
    printf("U:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            printf("%f\t",U[i*n+j]);
        printf("\n");
    }

    printf("\n\n");

#endif
// 2. Rearrange b
    rearrange(b,pivot,n,DIR_FORWARD);
#ifdef DEBUG_PRINT
   for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
// 3. TRSM on Ly=b
    TRSM(L, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
// 4. TRSM on Ux=y
    TRSM(U, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);

    fprintf(stdout, "Solution:\n\n");
    for(int i=0; i<n; i++)
    {
            printf("%f\n",result_x[i]);
    }

}

int main()
{
    test_solve();

    return 0;
}

$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:

0.000000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000


Solution:

1.000000
2.000000
3.000000
$

Note that for this simple test case I have used cublas getrfBatched to do the matrix LU factorization, rather than LAPACK, but I think it should behave similarly to LAPACK.
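If the factorization really is done with LAPACK on the host (as in your question), the rest of the pipeline stays the same. Below is a minimal, untested double-precision sketch of that variant; it assumes a LAPACKE build is available (LAPACKE_dgetrf) and omits the error-checking macros from the sample above for brevity:

#include <lapacke.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Sketch: solve A*x = b for a single RHS. A (n x n, column-major) is
// factored in place on the host by dgetrf, b is permuted on the host
// according to ipiv, and the two triangular solves run on the GPU.
void solve_with_lapack_lu(double *A, double *b, double *x, int n)
{
    int *ipiv = new int[n];
    LAPACKE_dgetrf(LAPACK_COL_MAJOR, n, n, A, n, ipiv);   // A now holds L and U

    // apply the recorded row interchanges to b, in forward order
    // (same idea as rearrange() above)
    for (int i = 0; i < n; i++)
    { double t = b[i]; b[i] = b[ipiv[i]-1]; b[ipiv[i]-1] = t; }

    double *A_d, *b_d;
    cudaMalloc(&A_d, n*n*sizeof(double));
    cudaMalloc(&b_d, n*sizeof(double));
    cudaMemcpy(A_d, A, n*n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, n*sizeof(double),   cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const double one = 1.0;
    // L*y = P*b : L is the strictly lower part of A_d with a unit diagonal
    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                CUBLAS_DIAG_UNIT, n, 1, &one, A_d, n, b_d, n);
    // U*x = y : U is the upper part of A_d including the diagonal
    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                CUBLAS_DIAG_NON_UNIT, n, 1, &one, A_d, n, b_d, n);

    cudaMemcpy(x, b_d, n*sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(A_d); cudaFree(b_d); cublasDestroy(handle);
    delete [] ipiv;
}

Because trsm only reads the relevant triangle (and CUBLAS_DIAG_UNIT ignores the stored diagonal), the factored matrix can be passed directly to both solves; there is no strict need to split it into separate L and U arrays as the 3x3 sample above does.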

Also note that I am not trying to comment on the "best approaches for linear system solutions", but merely to explain how the approach you have outlined might be made to work.

Answer 1: (score: 0)

For the permutation on the GPU, one can build a permutation matrix out of the given pivot vector and multiply it with B on the GPU. The key observation is that the permutation vector from LAPACK describes a sequence of swap steps, so once the for-loop has processed a row, that row is never touched again. A small routine can therefore construct a permutation matrix from the pivot vector P returned by *<T>getrf*. With it, the linear system L * U * X = P * B can be solved, which leads to the correct result.

#include <cstring>   // for memset

void
permutationMatrix ( int const rows,      //number of rows of A
                    int const cols,      //number of cols of A
                    int* permArray,      //permutation vector from LAPACK
                    double* permMatrix)  //memory for the permutation matrix
{

    int tempPerm [rows];    //holds where the ones later shall be in the Matrix
    int swap;               //variable for swapping

    memset(permMatrix,0, rows * cols * sizeof(double)); //fill permutation Matrix with 0s
    memset(tempPerm,0, rows * sizeof(int)); //fill temporary memory with 0s

    for (int row = 0; row < rows; row ++)  
    {
        //start value for each temp field is the row-number
        if (tempPerm [row] == 0)
        {
            tempPerm [row] = row + 1;
        }

        /* rows need to be swapped if rownumber != number 
         * in permutation vector of LAPACK*/
        if (permArray[row] != row + 1)
        {
            //swap with a line which hasn't already swapped
            if (tempPerm[permArray[row]-1] == 0)
            {
                tempPerm[permArray[row]-1] = tempPerm[row];
                tempPerm[row] = permArray[row];
            }else{

                //swap with an already touched line
                swap = tempPerm[permArray[row]-1];
                tempPerm[permArray[row]-1] = tempPerm[row];
                tempPerm[row] = swap;
            }
        }

        //put the one in place in the permutation matrix
        permMatrix[row + (tempPerm[row]-1) * rows] = 1.0;
    }
}
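
With the matrix built this way (column-major, a 1 at position (row, tempPerm[row]-1)), P * B applies exactly the row interchanges recorded in the pivot vector. As a rough sketch of the remaining GPU steps for a single right-hand side (the helper name solve_permuted and its argument layout are only illustrative; all pointers are assumed to already be in device memory):

void solve_permuted(cublasHandle_t handle,
                    const double *permMatrix_d,  // n x n permutation matrix P
                    const double *LU_d,          // factored A from <T>getrf
                    const double *b_d,           // right-hand side
                    double *pb_d,                // workspace, receives x
                    int n)
{
    const double one = 1.0, zero = 0.0;

    // pb = P * b : apply the row permutation on the GPU
    cublasDgemv(handle, CUBLAS_OP_N, n, n, &one, permMatrix_d, n,
                b_d, 1, &zero, pb_d, 1);

    // L * y = P*b : L is the unit-diagonal lower part of the factored matrix
    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                CUBLAS_DIAG_UNIT, n, 1, &one, LU_d, n, pb_d, n);

    // U * x = y : U is the upper part including the diagonal
    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                CUBLAS_DIAG_NON_UNIT, n, 1, &one, LU_d, n, pb_d, n);

    // pb_d now holds the solution x
}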