#include <iostream>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <omp.h>
std::vector<int> col_sums(const std::vector<std::vector<short>>& data) {
unsigned int height = data.size(), width = data[0].size();
std::vector<int> totalSums(width, 0), threadSums(width, 0);
#pragma omp parallel firstprivate(threadSums)
{
#pragma omp parallel for
for (unsigned int i = 0; i < height; i++) {
threadSums.data()[0:width] += data[i].data()[0:width];
}
#pragma omp critical
{
totalSums.data()[0:width] += threadSums.data()[0:width];
}
}
return totalSums;
}
int main(int argc, char** argv) {
if (argc < 3) {
std::cout << "Run program as \"executable <rows> <columns>\n";
} else {
std::stringstream args;
args << argv[1] << " " << argv[2];
int rows, columns;
args >> rows >> columns;
std::vector<std::vector<short>> data(rows, std::vector<short>(columns));
std::vector<int> columnSums = col_sums(data);
}
}
export OMP_NUM_THREADS=4
icpc -Ofast -fopenmp -g dummy.cpp -o dummy
/usr/bin/time -v ./dummy 115000 20000
我对OpenMP和CilkPlus有相当的经验,但这里缩放的障碍使我无法理解,这是一个相当简陋的计划。我知道它必须是明显的,但我觉得我已经消除了所有的数据危害和控制危险。我完全被难过了。