I'm working on some code that uses OpenMP to row-reduce matrices. I have two versions, and both hard-crash my Ubuntu and Fedora installs. By hard I mean that my mouse and keyboard stop responding, and even pressing the reset button on my tower won't restart it; I have to hold down the power button. The odd part is that the crash only happens after the code has been running for a few minutes. It doesn't use much memory, either: about 750 MB, which I'd call small given that I have 16 GB of RAM.
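(Quick sanity check on that number: a single 10000 x 10000 matrix of doubles is 10000 * 10000 * 8 bytes = 800,000,000 bytes, roughly 763 MiB, so the process is essentially holding just the one Matrix allocation.)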
#include <iostream>
#include <cstddef>
#include <cstring>
#include <iomanip>
#include <cstdlib>
#include <ctime>
#include <cmath>
using namespace std;
class Matrix
{
public:
Matrix(size_t rows, size_t cols):
data(0), w(rows), h(cols)
{
data = new double[w * h];
memset(data, 0, sizeof(double) * w * h);
}
~Matrix()
{
if(data)
{
delete[] data;
w = h = 0;
data = 0;
}
}
double* operator[](size_t row)
{
return data + row * w;
}
const double* operator[](size_t row) const
{
return data + row * w;
}
size_t width() const
{
return w;
}
size_t height() const
{
return h;
}
void scale_row(size_t row, double x)
{
double* prow = (*this)[row];
for(size_t i = 0; i < w; i++)
prow[i] *= x;
}
void add_row(size_t dest_row, size_t source_row, double scaling = 1.0)
{
if(dest_row == source_row)
{
scale_row(dest_row, 1.0 + scaling);
return;
}
double* __restrict__ drow = (*this)[dest_row];
double* __restrict__ srow = (*this)[source_row];
for(size_t i = 0; i < w; i++)
drow[i] += srow[i] * scaling;
}
void swap_rows(size_t r1, size_t r2)
{
if(r1 == r2)
return;
double* __restrict__ a = (*this)[r1];
double* __restrict__ b = (*this)[r2];
#pragma omp parallel for simd
for(size_t i = 0; i < w; i++)
{
double tmp = a[i];
a[i] = b[i];
b[i] = tmp;
}
}
double* find_leading(size_t row)
{
double* ptr = (*this)[row];
for(size_t i = 0; i < w; i++)
if(ptr[i])
return ptr + i;
return 0;
}
void clamp_zeros(double threshold = 1e-12)
{
#pragma omp parallel for simd
for(size_t i = 0; i < w * h; i++)
{
if(fabs(data[i]) < threshold)
data[i] = 0;
}
}
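// Gauss-Jordan style reduction in place; the same row operations are optionally replayed on *mirror (e.g. to build an inverse)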
void row_reduce(Matrix* mirror = 0)
{
for(size_t r1 = 0; r1 < h; r1++)
{
double* lead = find_leading(r1);
if(!lead)
continue;
size_t rank = lead - (*this)[r1];
if(mirror)
mirror->scale_row(r1, 1.0 / *lead);
scale_row(r1, 1.0 / *lead);
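// eliminate the pivot column from every other row; each iteration writes only its own row (r2), so the loop can run in parallel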
#pragma omp parallel for
for(size_t r2 = 0; r2 < h; r2++)
{
if(r2 == r1 || (*this)[r2][rank] == 0)
continue;
if(mirror)
mirror->add_row(r2, r1, -(*this)[r2][rank]);
add_row(r2, r1, -(*this)[r2][rank]);
}
clamp_zeros();
}
size_t zero_count = 0;
for(size_t r = 0; r < h; r++)
{
double* lead = find_leading(r);
if(lead)
{
size_t rank = lead - (*this)[r];
swap_rows(rank, r);
if(mirror)
mirror->swap_rows(rank, r);
}
else
{
size_t with = h - ++zero_count;
swap_rows(r, with);
if(mirror)
mirror->swap_rows(r, with);
}
}
}
private:
double* data;
size_t w, h;
};
ostream& operator<<(ostream& o, const Matrix& m)
{
o << setprecision(2);
for(size_t j = 0; j < m.width(); j++)
{
o << "----------";
}
o << "--\n";
for(size_t i = 0; i < m.height(); i++)
{
o << "|";
for(size_t j = 0; j < m.width(); j++)
{
o << setw(10) << m[i][j];
}
o << "|\n";
}
for(size_t j = 0; j < m.width(); j++)
{
o << "----------";
}
o << "--";
return o;
}
int main()
{
srand(time(0));
Matrix m (10000, 10000);
for(int i = 0; i < m.height(); i++)
{
for(int j = 0; j < m.width(); j++)
{
m[i][j] = rand() % 100;
}
}
time_t start = time(0);
m.row_reduce();
time_t end = time(0);
cout << m[0][2] << endl;
cout << "dt = " << (end - start) << endl;
return 0;
}
I also tried another dead-simple OpenMP program to see whether it would crash my system too, and that one did not:
#include <cstdio>
#include <omp.h>
int main()
{
double sum = 0.0;
double start = omp_get_wtime();
// sum 1/i^2 across threads with an OpenMP reduction
#pragma omp parallel for reduction(+:sum)
for(long long i = 1; i < 100000000000000LL; i++)
{
sum += 1.0 / ((double)i * i);
}
printf("%lf %lf\n", omp_get_wtime() - start, sum);
return 0;
}
This happens on both Ubuntu 15.04 (compiled with gcc 4.9) and Fedora 22 (compiled with gcc 5.1); I tried the first version on both and hit the same problem.
When I run it without OpenMP it works fine. Also, if I use smaller data, say a 2000x2000 matrix, it works fine (the crash happens when I try a 10,000x10,000 matrix).
It also seems to run fine on my laptop, which is likewise running Ubuntu 15.04.
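In case it matters, I build it with something like this (flags from memory, and matrix.cpp is just what I happen to call the file):
g++ -O2 -fopenmp matrix.cpp -o matrix
The "without OpenMP" runs are the same command with -fopenmp dropped, so the pragmas are simply ignored.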
Answer 0 (score: 1)
I made a few changes for compatibility with OpenMP 2.0: I included <omp.h>, dropped the simd clause and the __restrict__ qualifiers, switched the loop counters of the parallel loops to signed int (OpenMP 2.0 only accepts signed loop variables), and added schedule(dynamic). With those changes I can tell you that your code runs fine here (Windows 7, Visual Studio 2008). Memory consumption is around 800 MB.
Output:
0
dt = 2881
Here is your modified code:
////////////////////////////////////////////////////////////////
// OpenMP test function
#include <iostream>
#include <cstddef>
#include <cstring>
#include <iomanip>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <omp.h>
using namespace std;
class Matrix
{
public:
Matrix(size_t rows, size_t cols):
data(0), w(rows), h(cols)
{
data = new double[w * h];
memset(data, 0, sizeof(double) * w * h);
}
~Matrix()
{
if(data)
{
delete[] data;
w = h = 0;
data = 0;
}
}
double* operator[](size_t row)
{
return data + row * w;
}
const double* operator[](size_t row) const
{
return data + row * w;
}
size_t width() const
{
return w;
}
size_t height() const
{
return h;
}
void scale_row(size_t row, double x)
{
double* prow = (*this)[row];
for(size_t i = 0; i < w; i++)
prow[i] *= x;
}
void add_row(size_t dest_row, size_t source_row, double scaling = 1.0)
{
if(dest_row == source_row)
{
scale_row(dest_row, 1.0 + scaling);
return;
}
double* drow = (*this)[dest_row];
double* srow = (*this)[source_row];
for(size_t i = 0; i < w; i++)
drow[i] += srow[i] * scaling;
}
void swap_rows(size_t r1, size_t r2)
{
if(r1 == r2)
return;
double* a = (*this)[r1];
double* b = (*this)[r2];
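// OpenMP 2.0 (VS2008) changes, here and in the loops below: no simd clause, signed int loop index, schedule(dynamic)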
#pragma omp parallel for schedule(dynamic)
for(int i = 0; i < w; i++)
{
double tmp = a[i];
a[i] = b[i];
b[i] = tmp;
}
}
double* find_leading(size_t row)
{
double* ptr = (*this)[row];
for(int i = 0; i < w; i++)
if(ptr[i])
return ptr + i;
return 0;
}
void clamp_zeros(double threshold = 1e-12)
{
#pragma omp parallel for schedule(dynamic)
for(int i = 0; i < w * h; i++)
{
if(fabs(data[i]) < threshold)
data[i] = 0;
}
}
void row_reduce(Matrix* mirror = 0)
{
for(size_t r1 = 0; r1 < h; r1++)
{
double* lead = find_leading(r1);
if(!lead)
continue;
size_t rank = lead - (*this)[r1];
if(mirror)
mirror->scale_row(r1, 1.0 / *lead);
scale_row(r1, 1.0 / *lead);
#pragma omp parallel for schedule(dynamic)
for(int r2 = 0; r2 < h; r2++)
{
if(r2 == r1 || (*this)[r2][rank] == 0)
continue;
if(mirror)
mirror->add_row(r2, r1, -(*this)[r2][rank]);
add_row(r2, r1, -(*this)[r2][rank]);
}
clamp_zeros();
}
size_t zero_count = 0;
for(size_t r = 0; r < h; r++)
{
double* lead = find_leading(r);
if(lead)
{
size_t rank = lead - (*this)[r];
swap_rows(rank, r);
if(mirror)
mirror->swap_rows(rank, r);
}
else
{
size_t with = h - ++zero_count;
swap_rows(r, with);
if(mirror)
mirror->swap_rows(r, with);
}
}
}
private:
double* data;
size_t w, h;
};
ostream& operator<<(ostream& o, const Matrix& m)
{
o << setprecision(2);
for(size_t j = 0; j < m.width(); j++)
{
o << "----------";
}
o << "--\n";
for(size_t i = 0; i < m.height(); i++)
{
o << "|";
for(size_t j = 0; j < m.width(); j++)
{
o << setw(10) << m[i][j];
}
o << "|\n";
}
for(size_t j = 0; j < m.width(); j++)
{
o << "----------";
}
o << "--";
return o;
}
int main()
{
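// pin the thread count to the maximum, disable dynamic thread-count adjustment, and allow nested parallel regions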
int iMaxThreads = omp_get_max_threads();
omp_set_num_threads(iMaxThreads);
omp_set_dynamic(false);
omp_set_nested(true);
srand(time(0));
Matrix m (10000, 10000);
for(int i = 0; i < m.height(); i++)
{
for(int j = 0; j < m.width(); j++)
{
m[i][j] = rand() % 100;
}
}
time_t start = time(0);
m.row_reduce();
time_t end = time(0);
cout << m[0][2] << endl;
cout << "dt = " << (end - start) << endl;
return 0;
}
Answer 1 (score: 1)
I tested your code with GCC 4.9.2 on Linux 3.19.0-26-generic #28-Ubuntu, 64-bit.
It uses some RAM, but I have not had a crash: 11 minutes of CPU time on the clock so far and memory usage has stayed stable.
Answer 2 (score: 1)
It turns out this was not a programming problem at all. My program runs fine on my laptop and on other people's systems. I ran the y-cruncher 2.5-billion-digit pi benchmark, and it crashed my computer in exactly the same way. After that I tried the Windows version of y-cruncher, and it blue-screened after about 30 seconds. I figure it is a hardware problem that shows up after the memory or CPU has been pushed hard for a while. Now I have an excuse to upgrade to a Skylake CPU.
Update: I managed to fix it. A while back I flipped the EZ XMP switch on my ASUS motherboard, which automatically overclocks the memory. I had tried the board's CPU overclock settings before and they always made my system unstable, but the memory overclock seemed to work, so I left it on and forgot about it. Apparently it was not actually stable, and that is what was causing my crashes. Now that I have switched it off, both y-cruncher and my OpenMP program run to completion.