Question

我最近编写了一个基于“单面Jacobi旋转”算法的并行SVD分解程序。代码工作正常，但速度非常慢。实际上它应该利用内部for循环for(int g=0;g<n;g++)中的并行性，但是在评论#pragma omp paralell for指令时，我可以理解只是性能的轻微下降。换句话说，并行上没有明显的加速（代码与4个线程并行运行）。

注1：几乎所有工作都集中在以下三个涉及矩阵A和V的循环中，这些循环相对较大。

for(h=0;h<N;h++)
{
    p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
    qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
    qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
}

和

double Ahi,Vhi;
for(h=0;h<N;h++)//...rotate Ai & Aj (only columns i & j are changend)
{
    Ahi=A[h+N*i];
    A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
    A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
}
//store & update rotation matrix V (only columns i & j are updated)
for(h=0;h<N;h++)
{
    Vhi=V[h+N*i];
    V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
    V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
}

所有的并行性都应该在那里被利用，但事实并非如此。我无法理解为什么。

注2：在Windows（cygWin编译器）和Linux（GCC）平台上也会发生同样的情况。

注3：矩阵由列主阵列表示

所以我正在寻找一些帮助，找出为什么没有利用并行性。我错过了什么？并列中有一些隐藏的开销，我无法看到？

非常感谢您的任何建议

int sweep(double* A,double*V,int N,double tol)
{
static int*I=new int[(int)ceil(0.5*(N-1))];
static int*J=new int[(int)ceil(0.5*(N-1))];
int ntol=0;
for(int r=0;r<N;r++) //fill in i,j indexes of parallel rotations in vectors     I & J
{
    int k=r+1;
    if (k==N)
    {
    for(int i=2;i<=(int)ceil(0.5*N);i++){
        I[i-2]=i-1;
        J[i-2]=N+2-i-1;
        }
    }
    else
    {
        for(int i=1;i<=(int)ceil(0.5*(N-k));i++)I[i-1]=i-1;
        for(int i=1;i<=(int)ceil(0.5*(N-k));i++)J[i-1]=N-k+2-i-1;
        if(k>2)
        {
            int j=(int)ceil(0.5*(N-k));
            for(int i=N-k+2;i<=N-(int)floor(0.5*k);i++){
                I[j]=i-1;
                J[j]=2*N-k+2-i-1;
                j++;
            }
        }
    }

    int n=(k%2==0)?(int)floor(0.5*(N-1)):(int)floor(0.5*N);

    #pragma omp parallel for schedule(dynamic,5) reduction(+:ntol)  default(none)   shared(std::cout,I,J,A,V,N,n,tol)
    for(int g=0;g<n;g++)
    {
        int i=I[g];
        int j=J[g];
        double p=0;
        double qi=0;
        double qj=0;
        double cs,sn,q,c;
        int h;
        for(h=0;h<N;h++)
        {
            p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
            qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
            qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
        }
        q=qi-qj;
        if(p*p/(qi*qj)<tol) ntol++; //if Ai & Aj are orthogonal enough...
        else                        //if Ai & Aj are not orthogonal enough then... rotate them
        {
            c=sqrt(4*p*p+q*q);
            if(q>=0){
                cs=sqrt((c+q)/(2*c));
                sn=p/(c*cs);
            }
            else{
                sn=(p>=0)?sqrt((c-q)/2/c):-sqrt((c-q)/2/c);
                cs=p/(c*sn);
            }
            //...rotate Ai & Aj (only columns i & j are changend)
            double Ahi,Vhi;
            for(h=0;h<N;h++)
            {
                Ahi=A[h+N*i];
                A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
                A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
            }
            //store & update rotation matrix V (only columns i & j are updated)
            for(h=0;h<N;h++)
            {
                Vhi=V[h+N*i];
                V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
                V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
            }
        }
    }
}
if(2*ntol==(N*(N-1)))return(1);//if each columns of A is orthogonal enough     to each other stop sweep

return(0);
}

Answer 1

感谢Z boson的言论，我设法编写了一个性能更好的并行SVD分解。它比原来的运行速度快很多倍，我想通过使用SIMD指令仍可以改进它。我发布代码，如果有人应该发现它的使用。我执行的所有测试都给出了正确的结果，在任何情况下都不保证其使用。我真的很抱歉没有时间正确评论代码以便更好地理解。

int sweep(double* A,double*V,int N,double tol,int M, int n)
{
/********************************************************************************************
This routine performs a parallel "sweep" of the SVD factorization algorithm for the matrix A.
It implements a single sided Jacobi rotation algorithm, described by Michael W. Berry,
Dani Mezher, Bernard Philippe, and Ahmed Sameh in "Parallel Algorithms for the Singular Value
Decomposition".
At each sweep the A matrix becomes a little more orthogonal, until each column of A is orthogonal
to each other within a give tolerance. At this point the sweep() routine returns 1 and convergence
is attained.

Arguments:

A   : on input the square matrix to be orthogonalized, on exit a more "orthogonal" matrix
V   : on input the accumulated rotation matrix, on exit it is updated with the current rotations
N   : dimension of the matrices.
tol :tolerance for convergence of orthogonalization. See the explainations for SVD() routine
M   : number of blocks (it is calculated from the given block size n)
n   : block size (number of columns on each block). It should be an optimal value according to the hardware available

returns : 1 if in the last sweep convergence is attained, 0 if not and one more sweep is necessary

Author : Renato Talucci 2015
*************************************************************************************************/

#include <math.h>
int ntol=0;
//STEP 1 : INTERNAL BLOCK ORTHOGONALISATION
#pragma omp paralell for reduction(+:ntol) shared(A,V,n,tol,N) default(none)
for(int a=0;a<M;a++)
{
    for(int i=a*n;i<a*n+imin(n,N-a*n)-1;i++)
    {
        for(int j=i+1;j<a*n+imin(n,N-a*n);j++)
        {
            double p=0;
            double qi=0;
            double qj=0;
            double cs,sn,q,c;
            for(int h=0;h<N;h++)
            {
                p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
                qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
                qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
            }
            q=qi-qj;
            if((p*p/(qi*qj)<tol)||(qi<tol)||(qj<tol))ntol++; //if Ai & Aj are orthogonal enough...
            else                        //if Ai & Aj are not orthogonal enough then... rotate them
            {
                c=sqrt(4*p*p+q*q);
                if(q>=0){
                    cs=sqrt((c+q)/(2*c));
                    sn=p/(c*cs);
                }
                else{
                    sn=(p>=0)?sqrt((c-q)/2/c):-sqrt((c-q)/2/c);
                    cs=p/(c*sn);
                }
                //...rotate Ai & Aj
                double Ahi,Vhi;
                for(int h=0;h<N;h++)
                {
                    Ahi=A[h+N*i];
                    A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
                    A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
                }
                //store & update rotation matrix V (only columns i & j atre updated)
                for(int h=0;h<N;h++)
                {
                    Vhi=V[h+N*i];
                    V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
                    V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
                }
            }

        }
    }
}

//STEP 2 : PARALLEL BLOCK MUTUAL ORTHOGONALISATION
static int*I=new int[(int)ceil(0.5*(M-1))];
static int*J=new int[(int)ceil(0.5*(M-1))];
for(int h=0;h<M;h++)
{
    //fill in i,j indexes of blocks to be mutually orthogonalized at each turn
    int k=h+1;
    if (k==M)
    {
        for(int i=2;i<=(int)ceil(0.5*M);i++){
            I[i-2]=i-1;
            J[i-2]=M+2-i-1;
        }
    }
    else
    {
        for(int i=1;i<=(int)ceil(0.5*(M-k));i++)I[i-1]=i-1;
        for(int i=1;i<=(int)ceil(0.5*(M-k));i++)J[i-1]=M-k+2-i-1;
        if(k>2)
        {
            int j=(int)ceil(0.5*(M-k));
            for(int i=M-k+2;i<=M-(int)floor(0.5*k);i++){
                I[j]=i-1;
                J[j]=2*M-k+2-i-1;
                j++;
            }
        }

    }
    int ng=(k%2==0)?(int)floor(0.5*(M-1)):(int)floor(0.5*M);

    #pragma omp parallel for schedule(static,5) shared(A,V,I,J,n,tol,N,ng) reduction(+:ntol) default(none)
    for(int g=0;g<ng;g++)
    {
        int block_i=I[g];
        int block_j=J[g];
        for(int i=block_i*n;i<block_i*n+imin(n,N-block_i*n);i++)
        {
            for(int j=block_j*n;j<block_j*n+imin(n,N-block_j*n);j++)
            {
                double p=0;
                double qi=0;
                double qj=0;
                double cs,sn,q,c;
                int h;
                for(h=0;h<N;h++)
                {
                    p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
                    qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
                    qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
                }
                q=qi-qj;
                if((p*p/(qi*qj)<tol)||(qi<tol)||(qj<tol))ntol++; //if Ai & Aj are orthogonal enough...
                else                        //if Ai & Aj are not orthogonal enough then... rotate them
                {
                    c=sqrt(4*p*p+q*q);
                    if(q>=0){
                        cs=sqrt((c+q)/(2*c));
                        sn=p/(c*cs);
                    }
                    else{
                        sn=(p>=0)?sqrt((c-q)/2/c):-sqrt((c-q)/2/c);
                        cs=p/(c*sn);
                    }
                    //...rotate Ai & Aj
                    double Ahi,Vhi;
                    for(h=0;h<N;h++)
                    {
                        Ahi=A[h+N*i];
                        A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
                        A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
                    }
                    //store & update rotation matrix V (only columns i & j atre updated)
                    for(h=0;h<N;h++)
                    {
                        Vhi=V[h+N*i];
                        V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
                        V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
                    }
                }

            }
        }


    }
}
if(2*ntol==(N*(N-1)))return(1);//if each columns of A is orthogonal enough to each other stop sweep
return(0);
}

int SVD(double* A,double* U,double*V,int N,double tol,double* sigma)
{
/********************************************************************************************
This routine calculates the SVD decomposition of the square matrix A [NxN]

                                A = U * S * V'

Arguments :
A       :   on input NxN square matrix to be factorized, on exit contains the
            rotated matrix A*V=U*S.
V       :   on input an identity NxN matrix, on exit is the right orthogonal matrix
            of the decomposition A = U*S*V'
U       :   NxN matrix, on exit is the left orthogonal matrix of the decomposition A = U*S*V'.
sigma   :   array of dimension N. On exit contains the singular values, i.e. the diagonal
            elements of the matrix S.
N       :   Dimension of the A matrix.
tol     :   Tolerance for the convergence of the orthogonalisation of A. Said Ai and Aj any two
            columns of A, the convergence is attained when Ai*Aj / ( |Ai|*|Aj| ) < tol for each i,j=0,..,N-1 (i!=j)

The software returns the number of sweeps needed for convergence.

NOTE : ALL MATRICES ARE ASSUMED TO HAVE COLUMN MAJOR ORDERING I.E. M(i,j)=M[i+N*j]

Author: Renato Talucci 2015
*************************************************************************************************/

int n=24;//this is the dimension of block submatrices, you shall enter an optimal value for your hardware
int M=N/n+int(((N%n)!=0)?1:0);
int swp=0;//sweeps counter
int converged=0;
while(converged==0) {
    converged=sweep(A,V,N,tol,M,n);
    swp++;
}
#pragma omp parallel for default(none) shared(sigma,A,U,N)
for(int i=0;i<N;i++)
{
    double si=0;
    for(int j=0;j<N;j++) si+=A[j+N*i]*A[j+N*i];
    si=sqrt(si);
    for(int k=0;k<N;k++) U[k+N*i]=A[k+N*i]/si;
    sigma[i]=si;
}

return(swp);
}

注意：如果某些用户在退出时更喜欢保持A不变，则在进入while循环之前计算U = A * V就足够了，而不是将A矩阵传递给sweep（）例程，传递获得的U矩阵。在收敛sweep（）之后，U矩阵应该是正交归一化而不是A，而#pragma omp指令必须在共享变量中包含U而不是A.

注2：如果你（我有）将A（k）矩阵序列分解，其中A（k）可以被认为是先前A（k-1）的扰动，作为雅可比矩阵可以在多步牛顿求解器中考虑，可以很容易地修改驱动程序以从A（k-1）更新为A（k），而不是从开始计算A（k）。这是代码：

int updateSVD(double* DA,double* U,double*V,int N,double tol,double* sigma)
{
/********************************************************************************************
Given a previously factorization 

                        A(k-1) = U(k-1) * S(k-1) * V(k-1)'

and given a perturbation DA(k) of A(k-1), i.e.

                                    A(k) = A(k-1) + DA(k)

this routine calculates the SVD factorization of A(k), starting from the factorization of A(k-1)

Arguments:

DA      :   on input NxN perturbation matrix, unchanged on exit
U       :   on input NxN orthonormal left matrix of the previous (k-1) factorization, on exit 
            orthonormal right matrix of the current factorization
V       :   on input NxN orthonormal right matrix of the previous (k-1) factorization, on exit 
            orthonormal right matrix of the current factorization
N       :   dimension of the matrices
tol     :   Tolerance for the convergence of the orthogonalisation of A. Said Ai and Aj any two
            columns of A, the convergence is attained when Ai*Aj / ( |Ai|*|Aj| ) < tol for each i,j=0,..,N-1 (i!=j)
sigma   :   on input, array with the N singular values of the previuos factorization, on exit 
            array with the N singular values of the current factorization
NOTE : ALL MATRICES ARE ASSUMED TO HAVE COLUMN MAJOR ORDERING I.E. M(i,j)=M[i+N*j]   

Author: Renato Talucci 2015
*************************************************************************************************/


for(int i=0;i<N;i++) for(int j=0;j<N;j++) U[i+N*j]*=sigma[j];
int n=24; //this is the dimension of block submatrices, you shall enter an optimal value for your hardware
matmat_col_col(DA,V,U,N,n); //U =U(k-1)*S(k-1) + DA(k)*V(k-1) = A(k)*V(k-1) 
int M=N/n+int(((N%n)!=0)?1:0);  //number of blocks
int swp=0;//sweeps counter
int converged=0;
while(converged==0) {
    converged=sweep(U,V,N,tol,M,n);
    swp++;
}

#pragma omp parallel for default(none) shared(sigma,U,N)
for(int i=0;i<N;i++)
{
    double si=0;
    for(int j=0;j<N;j++) si+=U[j+N*i]*U[j+N*i];
    si=sqrt(si);
    for(int k=0;k<N;k++) U[k+N*i]=U[k+N*i]/si;
    sigma[i]=si;
}

return(swp);
}

最后，例程matmat_col_col(DA,V,U,N,n)是一个并行块矩阵乘积。这是代码：

inline int imin(int a,int b) {return((a<=b)?a:b);}

void matmat_col_col(double* A,double* B,double*C,int  N,int n)
/********************************************************************************************
square matrix block product NxN :

                            C = C + A * B

n is the optimal block dimension
N is the dimension of the matrices

NOTE : ALL MATRICES ARE ASSUMED TO HAVE COLUMN MAJOR ORDERING M(i,j) = M[i+N*j]

Author: Renato Talucci 2015
*************************************************************************************************/
{
 int M=N/n+int(((N%n)!=0)?1:0);
 #pragma omp parallel for shared(M,A,B,C)
 for(int a=0;a<M;a++)
 {
     for(int b=0;b<M;b++)
     {
         for(int c=0;c<M;c++)
         {
             for(int i=a*n;i<imin((a+1)*n,N);i++)
             {
                 for(int j=b*n;j<imin((b+1)*n,N);j++)
                 {
                     for(int k=c*n;k<imin((c+1)*n,N);k++)
                     {
                          C[i+N*j]+=A[i+N*k]*B[k+N*j];
                     }

                 }

             }
         }
     }
 }
return;
}

我希望不会因为这么多的复制和粘贴而创建错别字。

如果有人能改进代码那就太好了。

Answer 2

奇异值分解（SVD）的FLOPS应为O(N^3)，而读数为O(N^2)。这意味着可以并行化算法，并使其与核心数量很好地扩展。

但是，您对SVD的实现是内存带宽限制，这意味着它不能很好地适应单个套接字系统的内核数量。您当前的三个嵌套循环各自遍历N的整个范围，这导致下次需要重用时，大部分数据将从缓存中逐出。

为了计算绑定，您必须更改算法，以便不使用大小为N的长条/点产品，而是使用循环平铺来最大化{{ 1}}（其中n^3是块的大小）在大小为n的缓存中进行计算。

以下是来自谷歌搜索的paper for parallel SVD computation，摘要中说明了

我们提出的块JRS算法的关键点是通过对矩阵块（b行）执行计算而不是像JRS迭代算法中的矢量条那样，将加载的数据重新用于高速缓冲存储器。

I used loop tiling/blocks to achieve good scaling with the number of cores for cholesky decomposition

请注意，使用tiles / blocks也可以提高单线程代码的性能。

使用openMP的并行SVD分解无法按预期执行

2 个答案: