Question

在LAPACK文档中，它声明DSGESV（或复数的ZCGESV）是：

dsgesv和zcgesv是混合精度迭代细化子程序，用于利用快速的单精度硬件。它们首先尝试以单精度（dsgesv）或单复数精度（zcgesv）对矩阵进行分解，并在迭代细化过程中使用该分解，以得到具有双精度（dsgesv）/双复数精度（zcgesv）范数意义下后向误差质量的解（见下文）。如果该方法失败，则分别切换到双精度或双复数精度分解并计算解。

如果单精度性能与双精度性能之比太小，迭代细化就不会是一个制胜的策略。合理的策略应当把右端项的数量和矩阵的大小考虑在内。将来可能会通过调用ilaenv来实现这一点。目前，总是先尝试迭代细化。

但我怎么知道单精度性能与双精度性能的比率是多少？有人建议考虑矩阵的大小，但我不知道矩阵的大小究竟如何导致估计这个性能比。

有人能澄清这些事吗？

Answer 1

我的猜测是，最好的方法是同时测试dgesv()和dsgesv() ......

查看Lapack函数dsgesv()的源代码，以下是dsgesv()尝试执行的内容：

将矩阵A转换为单精度浮点矩阵As
调用sgetrf()：单精度LU分解
通过调用sgetrs()求解方程组As.x=b

计算双精度残差r=b-Ax，再次使用sgetrs()求解As.x'=r，然后更新x=x+x'。

重复最后一步，直到达到双精度（最多30次迭代）。定义成功的标准是：

$||r||_\infty\leq||x||_\infty.||A||_\infty.\varepsilon.n$

其中 $\varepsilon$ 是双精度浮点数（约1e-13）的精度， $n$ 是矩阵的大小。如果失败，dsgesv()将恢复为dgesv()，因为它会调用dgetrf()（因子分解），然后调用dgetrs()。因此dsgesv()是混合精度算法。例如，请参阅this article。

最后，当右端项数量较少且矩阵较大时，预计dsgesv()的表现会优于dgesv()，也就是说，分解sgetrf() / dgetrf()的时间开销 $O(n^3)$ 远远高于回代求解sgetrs() / dgetrs()的时间开销 $O(n^2.n_{rhs})$ 。由于dsgesv()中设置的最大迭代次数为30，因此近似限制为

$n\geq30.n_{rhs}$

此外，sgetrf()必须明显快于dgetrf()。由于有限的可用内存带宽或向量处理，sgetrf()可能更快（寻找SIMD，例如来自SSE：指令ADDPS）。

可以检查dsgesv()的输出参数iter，以判断迭代细化是否有用。如果它是负数，则迭代细化失败，使用dsgesv()只是浪费时间！

以下是用于比较dgesv()、sgesv()和dsgesv()并对它们计时的C代码。它可以通过gcc main.c -o main -llapacke -llapack -lblas -lm编译。随意测试你自己的矩阵！

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>

#include <lapacke.h>

/* Allocate a zero-initialised array of `count` elements of `size` bytes each.
 * calloc() checks the count*size product for overflow and guarantees that no
 * buffer is ever read uninitialised. Exits the program on failure. */
static void *alloc_array(size_t count, size_t size){
    void *p=calloc(count,size);
    if(p==NULL){fprintf(stderr,"malloc failed\n");exit(1);}
    return p;
}

/* Report the `info` value returned by a LAPACKE routine.
 * info<0 : illegal argument, forwarded to LAPACKE_xerbla() as before.
 * info>0 : for the factorisation/solve drivers used here this means that a
 *          diagonal entry of U is exactly zero (singular matrix). */
static void report_info(const char *name, int info){
    if(info<0){
        LAPACKE_xerbla(name,info);
    }else if(info>0){
        fprintf(stderr,"%s: info=%d, U(%d,%d) is exactly zero (singular matrix)\n",name,info,info,info);
    }
}

/* Benchmark dgesv(), sgesv() and dsgesv() on the same random, diagonally
 * dominant n x n system with nb right-hand sides (row-major storage), print
 * the estimated condition number, the time of each solver and whether the
 * iterative refinement of dsgesv() succeeded.
 * Returns 0; exits with status 1 if an allocation fails. */
int main(void){

    srand((unsigned)time(NULL));

    //size of the matrix
    int n=2000;
    // number of right-hand sides
    int nb=3;

    // keep the total work roughly constant, but always run at least once so
    // that the timings and `iter` are meaningful for large n
    int nbrun=1000*100*100/n/n;
    if(nbrun<1){nbrun=1;}

    // element counts computed in size_t to avoid int overflow for large n
    const size_t nn=(size_t)n*(size_t)n;
    const size_t nrhs=(size_t)n*(size_t)nb;

    //memory initialization
    double *aaa=alloc_array(nn,sizeof *aaa);    // reference matrix
    double *aa=alloc_array(nn,sizeof *aa);      // working copy, overwritten by LU
    double *bbb=alloc_array(nrhs,sizeof *bbb);  // reference right-hand sides
    double *bb=alloc_array(nrhs,sizeof *bb);    // working copy of the right-hand sides
    double *x=alloc_array(nrhs,sizeof *x);      // solution returned by dsgesv()

    float *aaas=alloc_array(nn,sizeof *aaas);   // single precision reference matrix
    float *aas=alloc_array(nn,sizeof *aas);
    float *bbbs=alloc_array(nrhs,sizeof *bbbs); // single precision right-hand sides
    float *bbs=alloc_array(nrhs,sizeof *bbs);

    // one pivot index per row
    int *ipiv=alloc_array((size_t)n,sizeof *ipiv);

    int i,j;

    //matrix initialization: row i has diagonal cond^((i+1)/n) and random
    //off-diagonal entries small enough to keep the row diagonally dominant
    double cond=1e3;
    for(i=0;i<n;i++){
        const double scale=pow(cond,(i+1)/(double)n);
        for(j=0;j<n;j++){
            if(j==i){
                aaa[(size_t)i*n+j]=scale;
            }else{
                aaa[(size_t)i*n+j]=1.9*(rand()/(double)RAND_MAX-0.5)*scale/(double)n;
            }
        }
        // row-major n x nb right-hand side: fill every column of row i
        for(j=0;j<nb;j++){
            bbb[(size_t)i*nb+j]=i;
        }
    }

    // single precision copies of the matrix and of the right-hand sides
    size_t idx;
    for(idx=0;idx<nn;idx++){
        aaas[idx]=(float)aaa[idx];
    }
    for(idx=0;idx<nrhs;idx++){
        bbbs[idx]=(float)bbb[idx];
    }

    int k=0;
    int ierr;

    //estimating the condition number of the matrix (infinity norm)
    memcpy(aa,aaa,nn*sizeof *aa);
    double anorm;
    double rcond;
    double *work=alloc_array((size_t)n,sizeof *work); // heap instead of a VLA on the stack
    anorm=LAPACKE_dlange_work(LAPACK_ROW_MAJOR, 'i', n, n, aa, n, work );
    free(work);
    ierr=LAPACKE_dgetrf( LAPACK_ROW_MAJOR, n, n,aa, n,ipiv );
    report_info("LAPACKE_dgetrf",ierr);
    ierr=LAPACKE_dgecon(LAPACK_ROW_MAJOR, 'i', n,aa, n,anorm,&rcond );
    report_info("LAPACKE_dgecon",ierr);
    // dgecon() returns the reciprocal condition number
    printf("condition number is %g\n",1./rcond);

    //testing dgesv()
    // NOTE: clock() measures processor time, not wall-clock time
    clock_t t;
    t = clock();
    for(k=0;k<nbrun;k++){
        // the solver overwrites its inputs: restore them before each run
        memcpy(bb,bbb,nrhs*sizeof *bb);
        memcpy(aa,aaa,nn*sizeof *aa);

        ierr=LAPACKE_dgesv(LAPACK_ROW_MAJOR,n,nb,aa,n,ipiv,bb,nb);
        report_info("LAPACKE_dgesv",ierr);
    }
    t = clock() - t;
    printf ("dgesv()x%d took me %ld clicks (%f seconds).\n",nbrun,(long)t,((double)t)/CLOCKS_PER_SEC);

    //testing sgesv()
    t = clock();
    for(k=0;k<nbrun;k++){
        memcpy(bbs,bbbs,nrhs*sizeof *bbs);
        memcpy(aas,aaas,nn*sizeof *aas);

        ierr=LAPACKE_sgesv(LAPACK_ROW_MAJOR,n,nb,aas,n,ipiv,bbs,nb);
        report_info("LAPACKE_sgesv",ierr);
    }
    t = clock() - t;
    printf ("sgesv()x%d took me %ld clicks (%f seconds).\n",nbrun,(long)t,((double)t)/CLOCKS_PER_SEC);

    //testing dsgesv()
    int iter=0;
    t = clock();
    for(k=0;k<nbrun;k++){
        memcpy(bb,bbb,nrhs*sizeof *bb);
        memcpy(aa,aaa,nn*sizeof *aa);

        ierr=LAPACKE_dsgesv(LAPACK_ROW_MAJOR,n,nb,aa,n,ipiv,bb,nb,x,nb,&iter);
        report_info("LAPACKE_dsgesv",ierr);
    }
    t = clock() - t;
    printf ("dsgesv()x%d took me %ld clicks (%f seconds).\n",nbrun,(long)t,((double)t)/CLOCKS_PER_SEC);

    // iter>=0: refinement succeeded (0 means the first single precision
    // solution already met the stopping criterion); iter<0: dsgesv() fell
    // back to the double precision factorisation
    if(iter>=0){
        printf("iterative refinement has succeded, %d iterations\n",iter);
    }else{
        printf("iterative refinement has failed due to");
        switch(iter){
        case -1:
            printf(" implementation- or machine-specific reasons\n");
            break;
        case -2:
            printf(" overflow in iterations\n");
            break;
        case -3:
            printf(" failure of single precision factorization sgetrf() (ill-conditionned?)\n");
            break;
        case -31:
            printf(" max number of iterations\n");
            break;
        default:
            printf(" an unknown reason (iter=%d)\n",iter);
            break;
        }
    }

    free(aaa);
    free(aa);
    free(bbb);
    free(bb);
    free(x);

    free(aaas);
    free(aas);
    free(bbbs);
    free(bbs);

    free(ipiv);

    return 0;
}

n = 2000的输出：

条件号是1475.26

dgesv（）x2花了我5260000次点击（5.260000秒）。

sgesv（）x2花了我3560000次点击（3.560000秒）。

dsgesv（）x2花了我3790000次点击（3.790000秒）。

迭代细化已经成功，11次迭代

何时使用dsgesv与dgesv来求解线性方程组

1 个答案: