Speed difference between GSL and MKL

Time: 2017-01-31 02:27:16

Tags: c++ openmpi blas gsl intel-mkl

I have two codes that both work, but I cannot figure out why one is so much faster than the other. To my knowledge, BLAS backed by MKL (Intel) should be much faster than GSL (GNU), yet my code shows exactly the opposite. The codes themselves are below: I simply create two matrices on the master node and then send individual rows to different "slave" processes (using OpenMPI), which compute the corresponding rows of the final matrix and send them back to the master node.
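For context, the operation each worker performs for a dispatched row is a single row-times-matrix product, C[row][j] = sum over k of A[row][k]*B[k][j]. Below is a minimal plain-loop sketch of that product (no MPI, no BLAS; the names buff, b and CC mirror the listings that follow, and the function itself is only illustrative, not part of either program):

// Illustrative reference version of the per-row product each worker computes:
// buff holds one row of A (length nsame), b holds B row-major (nsame x bcols),
// CC receives the corresponding row of C (length bcols).
void row_times_matrix(const double* buff, const double* b, double* CC,
                      int nsame, int bcols) {
    for (int j = 0; j < bcols; j++) {
        double sum = 0.;
        for (int k = 0; k < nsame; k++)
            sum += buff[k] * b[k*bcols + j];   // row-major indexing of b
        CC[j] = sum;
    }
}

Both listings hand exactly this product to a BLAS call inside the worker loop, so in principle only the dgemm implementation differs between them.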

GSL example (fast code):

#include <iostream>
#include <stdio.h>
#include <cmath>
#include <mpi.h>
#include <gsl/gsl_blas.h> 
using namespace std; 

int main(int argc, char** argv){
    int noprocs, nid;
    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &nid);
    MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
    int master = 0;

    const int nsame = 1000; //must be same if matrices multiplied together = acols = brows
    const int arows = 1000;
    const int bcols = 1000; 
    int rowsent;
    double * buff;
    buff = new double [nsame];
    double * b;
    b = new double [nsame*bcols];

    double** c = new double*[arows];
    for(int i = 0; i < arows; ++i)
        c[i] = new double[bcols];

    double * CC;
    CC = new double [1*bcols]; //buffer for one computed row of C (length bcols)
    for (int i = 0; i < bcols; i++){
        CC[i] = 0.;
    } //simply a 1-d array of zeros which will be filled in by the workers and passed back
    // Master part
    if (nid == master ) {  

        double** a = new double*[arows];
        for(int i = 0; i < arows; ++i){
            a[i] = new double[nsame];}

        for (int i = 0; i < arows; i++){
            for (int j = 0; j < nsame; j++){
                if (i == j)
                    a[i][j] = 1.;
                else
                    a[i][j] = 0.;
            }
        }
        for (int i = 0; i < (nsame*bcols); i++){
            b[i] = (10.*i + 3.)/(3.*i - 2.) ;
        } 
        MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes b is stored as a contiguous block of memory
        // send one row to each slave tagged with row number, assume nprocs<nrows
        rowsent=0;
        for (int i=1; i < (noprocs); i++) { //upper bound must be noprocs, otherwise the last rank never receives a row
            MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
            rowsent++;
        }

        for (int i=0; i<arows; i++) { 
            MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
                     MPI_COMM_WORLD, &status); 
            int sender = status.MPI_SOURCE;
            int anstype = status.MPI_TAG;            //row number+1
            int IND_I = 0;
            while (IND_I < bcols){
                c[anstype - 1][IND_I] = CC[IND_I]; 
                IND_I++;
            }
            if (rowsent < arows) {
                MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
                rowsent++; 
            }
            else {       // tell sender no more work to do via a 0 TAG
                MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
            }
        }
    }

    // Slave part
    else { 
        MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); 
        MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status); 
        while(status.MPI_TAG != 0) {
            int crow = status.MPI_TAG; 
            gsl_matrix_view AAAA = gsl_matrix_view_array(buff, 1, nsame);
            gsl_matrix_view BBBB = gsl_matrix_view_array(b, nsame, bcols);
            gsl_matrix_view CCCC = gsl_matrix_view_array(CC, 1, bcols);

            /* Compute C = A B */
            gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, &AAAA.matrix, &BBBB.matrix,
                            0.0, &CCCC.matrix); 
            MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
            MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
            //            cout << ans << " OUTPUT \n";
        }
    }

    MPI_Finalize(); 
    return 0;
};

MKL example (slow code):

#include <iostream>
#include <stdio.h>
#include <cmath>
#include <mpi.h>
#include </opt/intel/compilers_and_libraries_2017.1.126/mac/mkl/include/mkl.h>
using namespace std;  

int main(int argc, char** argv){ //THE IDENTITY MATRIX ONLY WORKS IF arows = nsame!
    int noprocs, nid;
    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &nid);
    MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
    int master = 0;

    const int nsame = 1000;  
    const int arows = 1000;
    const int bcols = 1000;  
    int rowsent;
    double * buff;
    buff = new double [nsame];
    double * b;
    b = new double [nsame*bcols];

    double** c = new double*[arows];
    for(int i = 0; i < arows; ++i)
        c[i] = new double[bcols];

    double * CC;
    CC = new double [1*bcols];  
    for (int i = 0; i < bcols; i++){
        CC[i] = 0.;
    };  
    // Master part
    if (nid == master ) { 

        double** a = new double*[arows];
        for(int i = 0; i < arows; ++i){
            a[i] = new double[nsame];}

        for (int i = 0; i < arows; i++){
            for (int j = 0; j < nsame; j++){
                if (i == j)
                    a[i][j] = 1.;
                else
                    a[i][j] = 0.;
            }
        }
        for (int i = 0; i < (nsame*bcols); i++){
            b[i] = (10.*i + 3.)/(3.*i - 2.) ; // = 1.*i as test value
        } 
        MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes b is stored as a contiguous block of memory; nprocs<nrows
        delete[] b;
        rowsent=0;
        for (int i=1; i < (noprocs); i++) { //upper bound must be noprocs, otherwise the last rank never receives a row
            MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
            delete[] a[rowsent];
            rowsent++; 
        }

        for (int i=0; i<arows; i++) { 
            MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
                     MPI_COMM_WORLD, &status); 
            int sender = status.MPI_SOURCE;
            int anstype = status.MPI_TAG;            //row number+1
            int IND_I = 0;
            while (IND_I < bcols){
                c[anstype - 1][IND_I] = CC[IND_I]; 
                IND_I++;
            } 
            if (rowsent < arows) {
                MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
                delete[] a[rowsent];
                rowsent++; 
            }
            else {       // tell sender no more work to do via a 0 TAG
                MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
            }
        }
    }

    // Slave part
    else { 
        MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); 
        MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status); 
        while(status.MPI_TAG != 0) {
            int crow = status.MPI_TAG; 

            /* Compute C = A B */
            cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, bcols, nsame, 1.0, buff, nsame, b, bcols,
                         0.0, CC, bcols); 

            MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
            MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status); 
        }
    }

    MPI_Finalize(); 
    return 0;
};

I thought this might be because I am not deleting any of the memory I allocate with new, although I initialize the arrays in essentially the same way in both codes. I even tried deleting values in the MKL code (as shown above), but that does not seem to make much difference. When I increase the array sizes from nsame = arows = bcols = 1000 to nsame = arows = bcols = 10000, the time difference between the two codes is easy to observe (the GSL code takes roughly 45 seconds, while the MKL code takes several minutes). So I am wondering whether this is simply inherent to how GSL and MKL are designed and wired into my code, or whether something more subtle is going on.
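One thing that might help narrow this down is timing the dgemm call on its own, away from all of the MPI traffic. The following is only a rough standalone sketch of how one could do that, not part of the code above: it repeats the same 1 x nsame by nsame x bcols product the workers perform, using MKL's cblas_dgemm. The plain #include <mkl.h> and the mkl_set_num_threads(1) call (to rule out each MPI rank starting its own team of MKL threads on the same cores) are assumptions about the local setup.

#include <chrono>
#include <cstdio>
#include <vector>
#include <mkl.h>   // assumes the MKL include directory is on the compiler's search path

int main() {
    const int nsame = 1000, bcols = 1000, reps = 1000;

    std::vector<double> buff(nsame, 0.), b((size_t)nsame*bcols), CC(bcols, 0.);
    buff[0] = 1.;                                                          // one row of the identity matrix
    for (size_t i = 0; i < b.size(); i++) b[i] = (10.*i + 3.)/(3.*i - 2.); // same B as above

    mkl_set_num_threads(1); // assumption: force single-threaded MKL, mimicking one MPI rank per core

    auto t0 = std::chrono::steady_clock::now();
    for (int r = 0; r < reps; r++) {
        // same call as in the worker loop: CC (1 x bcols) = buff (1 x nsame) * b (nsame x bcols)
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    1, bcols, nsame, 1.0, buff.data(), nsame, b.data(), bcols,
                    0.0, CC.data(), bcols);
    }
    auto t1 = std::chrono::steady_clock::now();
    double secs = std::chrono::duration<double>(t1 - t0).count();
    printf("%d single-row dgemm calls: %.3f s (%.3f ms per call)\n", reps, secs, 1e3*secs/reps);
    return 0;
}

Building a second copy with the GSL version of the same call (gsl_matrix_view_array plus gsl_blas_dgemm, linked against -lgsl -lgslcblas) would then show whether the per-call cost of the two libraries really differs, or whether the extra time in the MKL run is being spent somewhere else.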

0 Answers:

No answers yet.