我最近编写了一个基于“单面Jacobi旋转”算法的并行SVD分解程序。代码工作正常,但速度非常慢。
实际上它应该利用内部for循环for(int g=0;g<n;g++)
中的并行性,但是在评论#pragma omp paralell for
指令时,我可以理解只是性能的轻微下降。换句话说,并行上没有明显的加速(代码与4个线程并行运行)。
注1:几乎所有工作都集中在以下三个涉及矩阵A和V的循环中,这些循环相对较大。
for(h=0;h<N;h++)
{
p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
}
和
double Ahi,Vhi;
for(h=0;h<N;h++)//...rotate Ai & Aj (only columns i & j are changend)
{
Ahi=A[h+N*i];
A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
}
//store & update rotation matrix V (only columns i & j are updated)
for(h=0;h<N;h++)
{
Vhi=V[h+N*i];
V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
}
所有的并行性都应该在那里被利用,但事实并非如此。我无法理解为什么。
注2:在Windows(cygWin编译器)和Linux(GCC)平台上也会发生同样的情况。
注3:矩阵由列主阵列表示
所以我正在寻找一些帮助,找出为什么没有利用并行性。我错过了什么?并列中有一些隐藏的开销,我无法看到?
非常感谢您的任何建议
int sweep(double* A,double*V,int N,double tol)
{
static int*I=new int[(int)ceil(0.5*(N-1))];
static int*J=new int[(int)ceil(0.5*(N-1))];
int ntol=0;
for(int r=0;r<N;r++) //fill in i,j indexes of parallel rotations in vectors I & J
{
int k=r+1;
if (k==N)
{
for(int i=2;i<=(int)ceil(0.5*N);i++){
I[i-2]=i-1;
J[i-2]=N+2-i-1;
}
}
else
{
for(int i=1;i<=(int)ceil(0.5*(N-k));i++)I[i-1]=i-1;
for(int i=1;i<=(int)ceil(0.5*(N-k));i++)J[i-1]=N-k+2-i-1;
if(k>2)
{
int j=(int)ceil(0.5*(N-k));
for(int i=N-k+2;i<=N-(int)floor(0.5*k);i++){
I[j]=i-1;
J[j]=2*N-k+2-i-1;
j++;
}
}
}
int n=(k%2==0)?(int)floor(0.5*(N-1)):(int)floor(0.5*N);
#pragma omp parallel for schedule(dynamic,5) reduction(+:ntol) default(none) shared(std::cout,I,J,A,V,N,n,tol)
for(int g=0;g<n;g++)
{
int i=I[g];
int j=J[g];
double p=0;
double qi=0;
double qj=0;
double cs,sn,q,c;
int h;
for(h=0;h<N;h++)
{
p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
}
q=qi-qj;
if(p*p/(qi*qj)<tol) ntol++; //if Ai & Aj are orthogonal enough...
else //if Ai & Aj are not orthogonal enough then... rotate them
{
c=sqrt(4*p*p+q*q);
if(q>=0){
cs=sqrt((c+q)/(2*c));
sn=p/(c*cs);
}
else{
sn=(p>=0)?sqrt((c-q)/2/c):-sqrt((c-q)/2/c);
cs=p/(c*sn);
}
//...rotate Ai & Aj (only columns i & j are changend)
double Ahi,Vhi;
for(h=0;h<N;h++)
{
Ahi=A[h+N*i];
A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
}
//store & update rotation matrix V (only columns i & j are updated)
for(h=0;h<N;h++)
{
Vhi=V[h+N*i];
V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
}
}
}
}
if(2*ntol==(N*(N-1)))return(1);//if each columns of A is orthogonal enough to each other stop sweep
return(0);
}
答案 0 :(得分:1)
感谢Z boson的言论,我设法编写了一个性能更好的并行SVD分解。它比原来的运行速度快很多倍,我想通过使用SIMD指令仍可以改进它。 我发布代码,如果有人应该发现它的使用。我执行的所有测试都给出了正确的结果,在任何情况下都不保证其使用。 我真的很抱歉没有时间正确评论代码以便更好地理解。
int sweep(double* A,double*V,int N,double tol,int M, int n)
{
/********************************************************************************************
This routine performs a parallel "sweep" of the SVD factorization algorithm for the matrix A.
It implements a single sided Jacobi rotation algorithm, described by Michael W. Berry,
Dani Mezher, Bernard Philippe, and Ahmed Sameh in "Parallel Algorithms for the Singular Value
Decomposition".
At each sweep the A matrix becomes a little more orthogonal, until each column of A is orthogonal
to each other within a give tolerance. At this point the sweep() routine returns 1 and convergence
is attained.
Arguments:
A : on input the square matrix to be orthogonalized, on exit a more "orthogonal" matrix
V : on input the accumulated rotation matrix, on exit it is updated with the current rotations
N : dimension of the matrices.
tol :tolerance for convergence of orthogonalization. See the explainations for SVD() routine
M : number of blocks (it is calculated from the given block size n)
n : block size (number of columns on each block). It should be an optimal value according to the hardware available
returns : 1 if in the last sweep convergence is attained, 0 if not and one more sweep is necessary
Author : Renato Talucci 2015
*************************************************************************************************/
#include <math.h>
int ntol=0;
//STEP 1 : INTERNAL BLOCK ORTHOGONALISATION
#pragma omp paralell for reduction(+:ntol) shared(A,V,n,tol,N) default(none)
for(int a=0;a<M;a++)
{
for(int i=a*n;i<a*n+imin(n,N-a*n)-1;i++)
{
for(int j=i+1;j<a*n+imin(n,N-a*n);j++)
{
double p=0;
double qi=0;
double qj=0;
double cs,sn,q,c;
for(int h=0;h<N;h++)
{
p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
}
q=qi-qj;
if((p*p/(qi*qj)<tol)||(qi<tol)||(qj<tol))ntol++; //if Ai & Aj are orthogonal enough...
else //if Ai & Aj are not orthogonal enough then... rotate them
{
c=sqrt(4*p*p+q*q);
if(q>=0){
cs=sqrt((c+q)/(2*c));
sn=p/(c*cs);
}
else{
sn=(p>=0)?sqrt((c-q)/2/c):-sqrt((c-q)/2/c);
cs=p/(c*sn);
}
//...rotate Ai & Aj
double Ahi,Vhi;
for(int h=0;h<N;h++)
{
Ahi=A[h+N*i];
A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
}
//store & update rotation matrix V (only columns i & j atre updated)
for(int h=0;h<N;h++)
{
Vhi=V[h+N*i];
V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
}
}
}
}
}
//STEP 2 : PARALLEL BLOCK MUTUAL ORTHOGONALISATION
static int*I=new int[(int)ceil(0.5*(M-1))];
static int*J=new int[(int)ceil(0.5*(M-1))];
for(int h=0;h<M;h++)
{
//fill in i,j indexes of blocks to be mutually orthogonalized at each turn
int k=h+1;
if (k==M)
{
for(int i=2;i<=(int)ceil(0.5*M);i++){
I[i-2]=i-1;
J[i-2]=M+2-i-1;
}
}
else
{
for(int i=1;i<=(int)ceil(0.5*(M-k));i++)I[i-1]=i-1;
for(int i=1;i<=(int)ceil(0.5*(M-k));i++)J[i-1]=M-k+2-i-1;
if(k>2)
{
int j=(int)ceil(0.5*(M-k));
for(int i=M-k+2;i<=M-(int)floor(0.5*k);i++){
I[j]=i-1;
J[j]=2*M-k+2-i-1;
j++;
}
}
}
int ng=(k%2==0)?(int)floor(0.5*(M-1)):(int)floor(0.5*M);
#pragma omp parallel for schedule(static,5) shared(A,V,I,J,n,tol,N,ng) reduction(+:ntol) default(none)
for(int g=0;g<ng;g++)
{
int block_i=I[g];
int block_j=J[g];
for(int i=block_i*n;i<block_i*n+imin(n,N-block_i*n);i++)
{
for(int j=block_j*n;j<block_j*n+imin(n,N-block_j*n);j++)
{
double p=0;
double qi=0;
double qj=0;
double cs,sn,q,c;
int h;
for(h=0;h<N;h++)
{
p+=A[h+N*i]*A[h+N*j];//columns dot product:Ai * Aj
qi+=A[h+N*i]*A[h+N*i];// ||Ai||^2
qj+=A[h+N*j]*A[h+N*j];// ||Aj||^2
}
q=qi-qj;
if((p*p/(qi*qj)<tol)||(qi<tol)||(qj<tol))ntol++; //if Ai & Aj are orthogonal enough...
else //if Ai & Aj are not orthogonal enough then... rotate them
{
c=sqrt(4*p*p+q*q);
if(q>=0){
cs=sqrt((c+q)/(2*c));
sn=p/(c*cs);
}
else{
sn=(p>=0)?sqrt((c-q)/2/c):-sqrt((c-q)/2/c);
cs=p/(c*sn);
}
//...rotate Ai & Aj
double Ahi,Vhi;
for(h=0;h<N;h++)
{
Ahi=A[h+N*i];
A[h+N*i]=cs*A[h+N*i]+sn*A[h+N*j];
A[h+N*j]=-sn*Ahi+cs*A[h+N*j];
}
//store & update rotation matrix V (only columns i & j atre updated)
for(h=0;h<N;h++)
{
Vhi=V[h+N*i];
V[h+N*i]=cs*V[h+N*i]+sn*V[h+N*j];
V[h+N*j]=-sn*Vhi+cs*V[h+N*j];
}
}
}
}
}
}
if(2*ntol==(N*(N-1)))return(1);//if each columns of A is orthogonal enough to each other stop sweep
return(0);
}
int SVD(double* A,double* U,double*V,int N,double tol,double* sigma)
{
/********************************************************************************************
This routine calculates the SVD decomposition of the square matrix A [NxN]
A = U * S * V'
Arguments :
A : on input NxN square matrix to be factorized, on exit contains the
rotated matrix A*V=U*S.
V : on input an identity NxN matrix, on exit is the right orthogonal matrix
of the decomposition A = U*S*V'
U : NxN matrix, on exit is the left orthogonal matrix of the decomposition A = U*S*V'.
sigma : array of dimension N. On exit contains the singular values, i.e. the diagonal
elements of the matrix S.
N : Dimension of the A matrix.
tol : Tolerance for the convergence of the orthogonalisation of A. Said Ai and Aj any two
columns of A, the convergence is attained when Ai*Aj / ( |Ai|*|Aj| ) < tol for each i,j=0,..,N-1 (i!=j)
The software returns the number of sweeps needed for convergence.
NOTE : ALL MATRICES ARE ASSUMED TO HAVE COLUMN MAJOR ORDERING I.E. M(i,j)=M[i+N*j]
Author: Renato Talucci 2015
*************************************************************************************************/
int n=24;//this is the dimension of block submatrices, you shall enter an optimal value for your hardware
int M=N/n+int(((N%n)!=0)?1:0);
int swp=0;//sweeps counter
int converged=0;
while(converged==0) {
converged=sweep(A,V,N,tol,M,n);
swp++;
}
#pragma omp parallel for default(none) shared(sigma,A,U,N)
for(int i=0;i<N;i++)
{
double si=0;
for(int j=0;j<N;j++) si+=A[j+N*i]*A[j+N*i];
si=sqrt(si);
for(int k=0;k<N;k++) U[k+N*i]=A[k+N*i]/si;
sigma[i]=si;
}
return(swp);
}
注意:如果某些用户在退出时更喜欢保持A不变,则在进入while循环之前计算U = A * V就足够了,而不是将A矩阵传递给sweep()例程,传递获得的U矩阵。在收敛sweep()之后,U矩阵应该是正交归一化而不是A,而#pragma omp指令必须在共享变量中包含U而不是A.
注2:如果你(我有)将A(k)矩阵序列分解,其中A(k)可以被认为是先前A(k-1)的扰动,作为雅可比矩阵可以在多步牛顿求解器中考虑,可以很容易地修改驱动程序以从A(k-1)更新为A(k),而不是从开始计算A(k)。这是代码:
int updateSVD(double* DA,double* U,double*V,int N,double tol,double* sigma)
{
/********************************************************************************************
Given a previously factorization
A(k-1) = U(k-1) * S(k-1) * V(k-1)'
and given a perturbation DA(k) of A(k-1), i.e.
A(k) = A(k-1) + DA(k)
this routine calculates the SVD factorization of A(k), starting from the factorization of A(k-1)
Arguments:
DA : on input NxN perturbation matrix, unchanged on exit
U : on input NxN orthonormal left matrix of the previous (k-1) factorization, on exit
orthonormal right matrix of the current factorization
V : on input NxN orthonormal right matrix of the previous (k-1) factorization, on exit
orthonormal right matrix of the current factorization
N : dimension of the matrices
tol : Tolerance for the convergence of the orthogonalisation of A. Said Ai and Aj any two
columns of A, the convergence is attained when Ai*Aj / ( |Ai|*|Aj| ) < tol for each i,j=0,..,N-1 (i!=j)
sigma : on input, array with the N singular values of the previuos factorization, on exit
array with the N singular values of the current factorization
NOTE : ALL MATRICES ARE ASSUMED TO HAVE COLUMN MAJOR ORDERING I.E. M(i,j)=M[i+N*j]
Author: Renato Talucci 2015
*************************************************************************************************/
for(int i=0;i<N;i++) for(int j=0;j<N;j++) U[i+N*j]*=sigma[j];
int n=24; //this is the dimension of block submatrices, you shall enter an optimal value for your hardware
matmat_col_col(DA,V,U,N,n); //U =U(k-1)*S(k-1) + DA(k)*V(k-1) = A(k)*V(k-1)
int M=N/n+int(((N%n)!=0)?1:0); //number of blocks
int swp=0;//sweeps counter
int converged=0;
while(converged==0) {
converged=sweep(U,V,N,tol,M,n);
swp++;
}
#pragma omp parallel for default(none) shared(sigma,U,N)
for(int i=0;i<N;i++)
{
double si=0;
for(int j=0;j<N;j++) si+=U[j+N*i]*U[j+N*i];
si=sqrt(si);
for(int k=0;k<N;k++) U[k+N*i]=U[k+N*i]/si;
sigma[i]=si;
}
return(swp);
}
最后,例程matmat_col_col(DA,V,U,N,n)
是一个并行块矩阵乘积。这是代码:
inline int imin(int a,int b) {return((a<=b)?a:b);}
void matmat_col_col(double* A,double* B,double*C,int N,int n)
/********************************************************************************************
square matrix block product NxN :
C = C + A * B
n is the optimal block dimension
N is the dimension of the matrices
NOTE : ALL MATRICES ARE ASSUMED TO HAVE COLUMN MAJOR ORDERING M(i,j) = M[i+N*j]
Author: Renato Talucci 2015
*************************************************************************************************/
{
int M=N/n+int(((N%n)!=0)?1:0);
#pragma omp parallel for shared(M,A,B,C)
for(int a=0;a<M;a++)
{
for(int b=0;b<M;b++)
{
for(int c=0;c<M;c++)
{
for(int i=a*n;i<imin((a+1)*n,N);i++)
{
for(int j=b*n;j<imin((b+1)*n,N);j++)
{
for(int k=c*n;k<imin((c+1)*n,N);k++)
{
C[i+N*j]+=A[i+N*k]*B[k+N*j];
}
}
}
}
}
}
return;
}
我希望不会因为这么多的复制和粘贴而创建错别字。
如果有人能改进代码那就太好了。
答案 1 :(得分:0)
奇异值分解(SVD)的FLOPS应为O(N^3)
,而读数为O(N^2)
。这意味着可以并行化算法,并使其与核心数量很好地扩展。
但是,您对SVD的实现是内存带宽限制,这意味着它不能很好地适应单个套接字系统的内核数量。您当前的三个嵌套循环各自遍历N
的整个范围,这导致下次需要重用时,大部分数据将从缓存中逐出。
为了计算绑定,您必须更改算法,以便不使用大小为N
的长条/点产品,而是使用循环平铺来最大化{{ 1}}(其中n^3
是块的大小)在大小为n
的缓存中进行计算。
以下是来自谷歌搜索的paper for parallel SVD computation,摘要中说明了
我们提出的块JRS算法的关键点是通过对矩阵块(b行)执行计算而不是像JRS迭代算法中的矢量条那样,将加载的数据重新用于高速缓冲存储器。
请注意,使用tiles / blocks也可以提高单线程代码的性能。