Question

我试图用OpenMP 3.1（gcc 4.7.2）实现并行朴素矩阵乘法算法。为了保护结果矩阵的更新单元格，我使用了#pragma omp atomic update，但结果运行时间大约是顺序朴素算法运行时间的6倍。

#include <omp.h>//#pragma
#include <cstddef>//size_t
#include <cstring>//memset
#include <ctime>//clock
#include <iostream>//cout

using namespace std;

template<class T>
class Matrix_const_size
{
    typedef std::size_t size_type;
    size_type h, w;
    T *matrix;
    Matrix_const_size(const Matrix_const_size&);
public:
    Matrix_const_size(size_type height, size_type width)
        : h(height), w(width), matrix(new T[h * w]){memset(matrix,0,sizeof(T)*(h*w));}
    ~Matrix_const_size() {delete []matrix;}
    size_type height() const {return h;}
    size_type width() const {return w;}
    void clear() {memset(matrix,0,sizeof(T)*(h*w));}
    T& operator()(size_type i,size_type j) {return matrix[i*w+j];}
    const T& operator()(size_type i,size_type j) const {return matrix[i*w+j];}
};

template<typename T>
void parallel_naive_product(const Matrix_const_size<T> &m1, const Matrix_const_size<T> &m2, Matrix_const_size<T> &m3)
{
    size_t h1,w1,w2;
    h1=m1.height();
    w1=m1.width();
    w2=m2.width();
#pragma omp parallel for default(none) shared(m1,m2,m3,h1,w1,w2) schedule(static)
    for (size_t row=0;row<h1;++row){
        for (size_t col=0;col<w2;++col){
            for (size_t element=0;element<w1;++element){
                T tmp=m1(row,element)*m2(element,col);
                #pragma omp atomic update//Achilles' heel
                m3(row,col)+=tmp;
            };
        };
    };
}

template<typename T>
void naive_product(const Matrix_const_size<T> &m1, const Matrix_const_size<T> &m2, Matrix_const_size<T> &m3)
{
    size_t h1,w1,w2;
    h1=m1.height();
    w1=m1.width();
    w2=m2.width();
    for (size_t row=0;row<h1;++row){
        for (size_t col=0;col<w2;++col){
            for (size_t element=0;element<w1;++element){
                T tmp=m1(row,element)*m2(element,col);
//                #pragma omp atomic update//It should not be here. It's just for an experiment.
                m3(row,col)+=tmp;
            };
        };
    };
}

int main()
{
    size_t height1=1000;
    size_t width1=800;
    Matrix_const_size<int> m1(height1,width1);
    int tmp=0;
    for (size_t i=0;i<m1.height();++i){
        for (size_t j=0;j<m1.width();++j){
            m1(i,j)=++tmp;
        };
    };
    size_t height2=800;
    size_t width2=1000;
    Matrix_const_size<int> m2(height2,width2);
    tmp=0;
    for (size_t i=0;i<m2.height();++i){
        for (size_t j=0;j<m2.width();++j){
            m2(i,j)=++tmp;
        };
    };
    Matrix_const_size<int> result(height1,width2);

    clock_t t1,t2;
    //Naive algorithm (inner product)
    t1=clock();
    naive_product(m1,m2,result);
    t1=clock()-t1;

    result.clear();
    //Parallel naive algorithm(inner product)
    t2=clock();
    parallel_naive_product(m1,m2,result);
    t2=clock()-t2;

    cout<<"Naive algorithm(inner product) running time="<<t1<<endl;
    cout<<"Parallel naive algorithm(inner product) running time="<<t2<<endl;
    return 0;
}

真正奇怪的是，当我将#pragma omp atomic update添加到顺序朴素算法时，它的运行时间增加了大约3倍（尽管事实上它根本不应该阻塞，因为它只在一个线程上运行）

我的问题是：我可以做些什么来提高效率，但确保结果总是正确的？

执行程序需要几秒钟。为了减少执行时间，可以减少矩阵大小。

如何使OpenMP中的并行矩阵乘法安全高效？

0 个答案: