我试图用 OpenMP 3.1(gcc 4.7.2)实现并行的朴素矩阵乘法算法。为了保护对结果矩阵单元格的更新,我使用了 `#pragma omp atomic update`,但并行版本的运行时间大约是顺序朴素算法运行时间的 6 倍。
#include <omp.h>//#pragma
#include <cstddef>//size_t
#include <cstring>//memset
#include <ctime>//clock
#include <iostream>//cout
using namespace std;
/**
 * Fixed-size, heap-allocated, row-major matrix of T.
 *
 * The dimensions are set at construction and never change. Every element
 * starts out value-initialised (T(), i.e. zero for arithmetic types).
 * The object owns its buffer and is non-copyable: both the copy constructor
 * and the copy assignment operator are declared private and left undefined,
 * so an accidental copy cannot end in a double delete[].
 *
 * Element access through operator() performs no bounds checking.
 */
template<class T>
class Matrix_const_size
{
public:
    typedef std::size_t size_type;

private:
    size_type h, w;
    T *matrix;

    // Non-copyable (C++03 idiom): declared, never defined.
    Matrix_const_size(const Matrix_const_size&);
    Matrix_const_size& operator=(const Matrix_const_size&);

public:
    /// Allocates a height x width matrix with value-initialised elements.
    /// `new T[n]()` is used instead of memset so that any T is handled
    /// correctly, not only types whose all-zero byte pattern means "zero".
    Matrix_const_size(size_type height, size_type width)
        : h(height), w(width), matrix(new T[h * w]()) {}

    ~Matrix_const_size() { delete[] matrix; }

    /// Number of rows.
    size_type height() const { return h; }

    /// Number of columns.
    size_type width() const { return w; }

    /// Resets every element to T().
    void clear()
    {
        const size_type count = h * w;
        for (size_type idx = 0; idx < count; ++idx) {
            matrix[idx] = T();
        }
    }

    /// Mutable access to the element in row i, column j (unchecked).
    T& operator()(size_type i, size_type j) { return matrix[i * w + j]; }

    /// Read-only access to the element in row i, column j (unchecked).
    const T& operator()(size_type i, size_type j) const { return matrix[i * w + j]; }
};
template<typename T>
void parallel_naive_product(const Matrix_const_size<T> &m1, const Matrix_const_size<T> &m2, Matrix_const_size<T> &m3)
{
size_t h1,w1,w2;
h1=m1.height();
w1=m1.width();
w2=m2.width();
#pragma omp parallel for default(none) shared(m1,m2,m3,h1,w1,w2) schedule(static)
for (size_t row=0;row<h1;++row){
for (size_t col=0;col<w2;++col){
for (size_t element=0;element<w1;++element){
T tmp=m1(row,element)*m2(element,col);
#pragma omp atomic update//Achilles' heel
m3(row,col)+=tmp;
};
};
};
}
template<typename T>
void naive_product(const Matrix_const_size<T> &m1, const Matrix_const_size<T> &m2, Matrix_const_size<T> &m3)
{
size_t h1,w1,w2;
h1=m1.height();
w1=m1.width();
w2=m2.width();
for (size_t row=0;row<h1;++row){
for (size_t col=0;col<w2;++col){
for (size_t element=0;element<w1;++element){
T tmp=m1(row,element)*m2(element,col);
// #pragma omp atomic update//It should not be here. It's just for an experiment.
m3(row,col)+=tmp;
};
};
};
}
int main()
{
size_t height1=1000;
size_t width1=800;
Matrix_const_size<int> m1(height1,width1);
int tmp=0;
for (size_t i=0;i<m1.height();++i){
for (size_t j=0;j<m1.width();++j){
m1(i,j)=++tmp;
};
};
size_t height2=800;
size_t width2=1000;
Matrix_const_size<int> m2(height2,width2);
tmp=0;
for (size_t i=0;i<m2.height();++i){
for (size_t j=0;j<m2.width();++j){
m2(i,j)=++tmp;
};
};
Matrix_const_size<int> result(height1,width2);
clock_t t1,t2;
//Naive algorithm (inner product)
t1=clock();
naive_product(m1,m2,result);
t1=clock()-t1;
result.clear();
//Parallel naive algorithm(inner product)
t2=clock();
parallel_naive_product(m1,m2,result);
t2=clock()-t2;
cout<<"Naive algorithm(inner product) running time="<<t1<<endl;
cout<<"Parallel naive algorithm(inner product) running time="<<t2<<endl;
return 0;
}
真正奇怪的是,当我把 `#pragma omp atomic update` 添加到顺序朴素算法中时,它的运行时间增加了大约 3 倍(尽管它实际上根本不应该发生阻塞,因为它只在一个线程上运行)。
我的问题是:我可以做些什么来提高效率,但确保结果总是正确的?
执行程序需要几秒钟。为了减少执行时间,可以减少矩阵大小。