我刚开始使用 OpenMP 在 C++ 中进行并行计算,但我的程序并行性能很差。由于我不太了解多线程性能分析工具(不像单线程时可以简单地使用 gprof),我编写了一个示例程序来测试性能。
我有一个2D矩阵(N * N),每个元素都是一个3d矢量(x,y,z)。我只是做一个双循环来设置矩阵中的每个值:
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
}
}
其中 VECTOR3D 是一个简单的类,具有 x、y、z 三个属性:
// Minimal 3D vector: three Cartesian components stored as doubles.
class VECTOR3D {
public:
    double x, y, z; // component along each axis
};
另一方面,我也可以使用(N * N * 3)3D数组来做到这一点:
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
arrayHeap[i][j][0] = (1.0*i*i);
arrayHeap[i][j][1] = (1.0*j*j);
arrayHeap[i][j][2] = (1.0*i*j);
}
}
从内存方面来看,还有几种不同的选择,例如使用手动分配和解除分配的原始指针:
double ***arrayHeap;
arrayHeap = new double** [N];
for(int i = 0; i < N; ++i) {
arrayHeap[i] = new double* [N];
for(int j = 0; j < N; ++j) {
arrayHeap[i][j] = new double[3];
}
}
或只是使用std::vector
:
vector< vector<VECTOR3D>> vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));
我还考虑过手动为数组分配连续内存,就像 LAMMPS(分子模拟软件包)的源代码那样。
N=10000 时的结果如下:
对于单线程:
OMP_NUM_THREADS=1 ./a.out
为堆上的数组分配内存...
=======堆上的数组结果=======
在时间内完成(总计): 0.720385 秒
在时间内完成(真实): 0.720463 秒
为堆上的数组释放内存...
为阵列连续分配内存......
=======数组连续结果=======
在时间内完成(总计): 0.819733 秒
在时间内完成(真实): 0.819774 秒
为阵列连续释放内存......
在堆上为矢量分配内存...
=======堆上的矢量结果=======
在时间内完成(总计): 3.08715 秒
在时间内完成(真实): 3.08725 秒
为堆上的向量释放内存...
在堆栈上为矢量分配内存...
=======栈上的矢量结果=======
在时间内完成(总计): 1.49888 秒
在时间内完成(真实): 1.49895 秒
对于多线程(threads = 4):
OMP_NUM_THREADS=4 ./a.out
为堆上的数组分配内存...
=======堆上的数组结果=======
在时间内完成(总计): 2.29184 秒
在时间内完成(真实): 0.577807 秒
为堆上的数组释放内存...
为阵列连续分配内存......
=======数组连续结果=======
在时间内完成(总计): 1.79501 秒
在时间内完成(真实): 0.454139 秒
为阵列连续释放内存......
在堆上为矢量分配内存...
=======堆上的矢量结果=======
在时间内完成(总计): 6.80917 秒
在时间内完成(真实): 1.92541 秒
为堆上的向量释放内存...
在堆栈上为矢量分配内存...
=======栈上的矢量结果=======
在时间内完成(总计): 1.64086 秒
在时间内完成(真实): 0.411 秒
整体并行效率不高。出乎意料的是,花哨的连续内存分配并没有带来好处?!为什么会这样?std::vector 对这种情况似乎已经足够好了?
有人可以为我解释结果吗?我还需要有关如何改进代码的建议。
非常感谢!!!
附上所有源代码。 (请直接转到main,开始时手动管理内存有几个功能。)
#include <stdlib.h>

#include <cinttypes>
#include <cstdio>
#include <ctime>
#include <iostream>
#include <vector>

#include <omp.h>

#include "vector3d.h"
// 64-bit signed integer used for byte counts of very large arrays.
typedef int64_t bigint;

/**
 * Allocate nbytes of uninitialized memory with malloc().
 *
 * @param nbytes number of bytes requested; zero or a negative value yields NULL.
 * @param name   label of the array being allocated, used in the failure message.
 * @return pointer to the new block (release it with free()), or NULL when
 *         nothing was requested or the allocation failed.
 */
void *smalloc(bigint nbytes, const char *name)
{
    // A negative count would wrap to a huge size_t inside malloc(); treat it
    // like an empty request instead.
    if (nbytes <= 0) return NULL;
    void *ptr = malloc(static_cast<size_t>(nbytes));
    if (ptr == NULL) {
        fprintf(stderr, "Failed to allocate %lld bytes for array %s\n",
                static_cast<long long>(nbytes), name ? name : "(unnamed)");
    }
    return ptr;
}
template <typename TYPE>
TYPE ***create(TYPE ***&array, int n1, int n2, int n3, const char *name)
{
bigint nbytes = ((bigint) sizeof(TYPE)) * n1*n2*n3;
TYPE *data = (TYPE *) smalloc(nbytes,name);
nbytes = ((bigint) sizeof(TYPE *)) * n1*n2;
TYPE **plane = (TYPE **) smalloc(nbytes,name);
nbytes = ((bigint) sizeof(TYPE **)) * n1;
array = (TYPE ***) smalloc(nbytes,name);
int i,j;
bigint m;
bigint n = 0;
for (i = 0; i < n1; i++) {
m = ((bigint) i) * n2;
array[i] = &plane[m];
for (j = 0; j < n2; j++) {
plane[m+j] = &data[n];
n += n3;
}
}
return array;
}
template <typename TYPE>
TYPE ***create3d_offset(TYPE ***&array, int n1lo, int n1hi,
int n2, int n3, const char *name)
{
int n1 = n1hi - n1lo + 1;
create(array,n1,n2,n3,name);
array -= n1lo;
return array;
}
// Release a block with free(); a NULL pointer is ignored.
void sfree(void *ptr) {
    if (ptr != NULL) {
        free(ptr);
    }
}
/**
 * Release a 3D array through its three levels: the block array[0][0], the
 * pointer table array[0] and the top-level table array, then reset array to
 * NULL. A NULL array is ignored.
 */
template <typename TYPE>
void destroy(TYPE ***&array)
{
    if (array != NULL) {
        TYPE **plane = array[0];
        TYPE *data = plane[0];
        sfree(data);
        sfree(plane);
        sfree(array);
        array = NULL;
    }
}
/**
 * Release a 3D array whose pointer was shifted by an index offset.
 *
 * The three blocks freed are located starting from array + offset; array is
 * reset to NULL afterwards. A NULL array is ignored.
 *
 * @param array  shifted top-level pointer.
 * @param offset index of the first allocated plane.
 */
template <typename TYPE>
void destroy3d_offset(TYPE ***&array, int offset)
{
    if (array != NULL) {
        TYPE ***base = array + offset;
        TYPE **plane = base[0];
        TYPE *data = plane[0];
        sfree(data);
        sfree(plane);
        sfree(base);
        array = NULL;
    }
}
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// Print the processor time (clock()) and the wall-clock time (omp_get_wtime())
// elapsed since the given start samples.
static void printElapsed(clock_t cpuStart, double wallStart)
{
    // Sample both clocks before any output so printing is not part of the
    // measured interval.
    const clock_t cpuTicks = clock() - cpuStart;
    const double wallSeconds = omp_get_wtime() - wallStart;
    std::cout << "Finished within time (total): " << ((double) cpuTicks) / CLOCKS_PER_SEC << " seconds" << std::endl;
    std::cout << "Finished within time (real): " << wallSeconds << " seconds" << std::endl;
}

/**
 * Benchmark: fill an N x N grid of 3-component values under four storage
 * layouts and report the time of each OpenMP-parallel fill.
 *
 * Every fill loop is distributed over its outer index with
 * "#pragma omp parallel for"; schedule(dynamic) or collapse(2) can be added
 * to that pragma to compare scheduling strategies.
 *
 * @return 0 on completion, 1 if the contiguous array could not be allocated.
 */
int main() {
    using namespace std;
    const int N = 10000;

    ///////////////////////////////////////
    // Layout 1: every cell is a separately allocated double[3].
    printf("\n\nAllocating memory for array on heap...\n");
    double ***arrayHeap = new double** [N];
    for (int i = 0; i < N; ++i) {
        arrayHeap[i] = new double* [N];
        for (int j = 0; j < N; ++j) {
            arrayHeap[i][j] = new double[3];
        }
    }
    printf("======= Array on heap Results =======\n");
    clock_t cpuStart = clock();
    double wallStart = omp_get_wtime();
    #pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            arrayHeap[i][j][0] = (1.0*i*i);
            arrayHeap[i][j][1] = (1.0*j*j);
            arrayHeap[i][j][2] = (1.0*i*j);
        }
    }
    printElapsed(cpuStart, wallStart);
    printf("Deallocating memory for array on heap...\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            delete [] arrayHeap[i][j];
        }
        delete [] arrayHeap[i];
    }
    delete [] arrayHeap;

    ///////////////////////////////////////
    // Layout 2: one contiguous element block reached through pointer tables.
    printf("\n\nAllocating memory for array continous...\n");
    double ***array_continuous = NULL;
    // create3d_offset takes an inclusive index range, so [0, N-1] yields
    // exactly N planes.
    if (create3d_offset(array_continuous, 0, N - 1, N, 3, "array") == NULL) {
        fprintf(stderr, "Allocation failed for contiguous array\n");
        return 1;
    }
    printf("======= Array continuous Results =======\n");
    cpuStart = clock();
    wallStart = omp_get_wtime();
    #pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            array_continuous[i][j][0] = (1.0*i*i);
            array_continuous[i][j][1] = (1.0*j*j);
            array_continuous[i][j][2] = (1.0*i*j);
        }
    }
    printElapsed(cpuStart, wallStart);
    printf("Deallocating memory for array continuous...\n");
    destroy3d_offset(array_continuous, 0);

    ///////////////////////////////////////
    // Layout 3: every cell is a separately allocated VECTOR3D object.
    printf("\n\nAllocating memory for vector on heap...\n");
    VECTOR3D ***vectorHeap = new VECTOR3D**[N];
    for (int i = 0; i < N; ++i) {
        vectorHeap[i] = new VECTOR3D* [N];
        for (int j = 0; j < N; ++j) {
            vectorHeap[i][j] = new VECTOR3D(0,0,0);
        }
    }
    printf("======= Vector on heap Results =======\n");
    cpuStart = clock();
    wallStart = omp_get_wtime();
    #pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            // Assign into the existing object. Allocating a new one here
            // would leak the object created above and would time the
            // allocator instead of the stores.
            *vectorHeap[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
        }
    }
    printElapsed(cpuStart, wallStart);
    printf("Deallocating memory for vector on heap...\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            // Each cell came from scalar new, so it needs scalar delete.
            delete vectorHeap[i][j];
        }
        delete [] vectorHeap[i];
    }
    delete [] vectorHeap;

    /////////////////////////////////////////////////
    // Layout 4: std::vector of std::vector of VECTOR3D values.
    printf("\n\nAllocating memory for vector on stack...\n");
    vector< vector<VECTOR3D> > vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));
    printf("======= Vector on stack Results =======\n");
    cpuStart = clock();
    wallStart = omp_get_wtime();
    #pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
        }
    }
    printElapsed(cpuStart, wallStart);

    /////////////////////////////////
    return 0;
}
VECTOR3D 类:
// Guard name avoids the leading underscore + capital form, which is reserved
// for the implementation.
#ifndef VECTOR3D_H
#define VECTOR3D_H
#include <iostream>
#include <cmath>
#include <iomanip>
#include <limits>

/// Simple 3D vector with public Cartesian components.
class VECTOR3D {
public:
    double x, y, z; // component along each axis (cartesian)

    /// Build a vector from its components; each omitted component is 0.0.
    VECTOR3D(double xx = 0.0, double yy = 0.0, double zz = 0.0) : x(xx), y(yy), z(zz) // make a 3d vector
    {
    }
};
#endif // VECTOR3D_H
答案 0 :(得分:1)
您的简单循环不是计算受限(compute-bound)的,而是完全内存受限(memory-bound)的:每个元素只被访问一次。没有数据重用意味着无法有效利用缓存。因此,不能指望加速比等于所用的线程/核心数。实际加速比取决于具体系统(内存带宽)。
您的所有数据结构(包括花哨的连续内存)在访问数据时都要经过多次间接寻址。这并不是必需的。为了充分利用连续内存,您应该直接把二维数组按一维(扁平)方式布局:
/**
 * Dense 2D array stored row by row in a single std::vector.
 *
 * Element (r, c) lives at index r * columns + c of the underlying vector.
 * Indices are not bounds-checked.
 */
template<class T>
class Array2d
{
public:
    /// Create a rows x columns array of value-initialized elements.
    Array2d(std::size_t rows, std::size_t columns)
        // Size the storage from the parameters so the initializer does not
        // depend on the declaration order of the members.
        : rows_(rows), columns_(columns), data_(rows * columns) {}

    /// Mutable access to element (r, c).
    T& operator()(std::size_t r, std::size_t c)
    {
        return data_[r * columns_ + c];
    }

    /// Read-only access to element (r, c).
    const T& operator()(std::size_t r, std::size_t c) const
    {
        return data_[r * columns_ + c];
    }

    /// Number of rows given at construction.
    std::size_t rows() const { return rows_; }

    /// Number of columns given at construction.
    std::size_t columns() const { return columns_; }

private:
    std::size_t rows_;
    std::size_t columns_;
    std::vector<T> data_;
};
注意:如果你确实必须保留 [i][j] 这种索引方式,也可以实现一个 operator[],让它返回一个代理对象,再由该代理对象提供第二个 operator[]。
如果您受内存带宽限制且N足够大,则间接或平面布局之间不会有明显的性能差异。