我正在尝试使用OpenMP以并行处理形式实现一个过程。它包含四个嵌套的for循环(依赖),并且在最里面的循环中有一个变量sum_p
。简而言之,我的问题是关于以下代码片段的并行实现:
for (int i = (test_map.size() - 1); i >= 1; --i) {
bin_i = test_map.at(i); //test_map is a "STL map of vectors"
len_rank_bin_i = bin_i.size(); // bin_i is a vector
for (int j = (i - 1); j >= 0; --j) {
bin_j = test_map.at(j);
len_rank_bin_j = bin_j.size();
for (int u_i = 0; u_i < len_rank_bin_i; u_i++) {
node_u = bin_i[u_i]; //node_u is a scalar
for (int v_i = 0; v_i < len_rank_bin_j; v_i++) {
node_v = bin_j[v_i];
if (node_u> node_v)
sum_p += 1;
}
}
}
}
完整的计划如下:
#include <iostream>
#include <vector>
#include <omp.h>
#include <random>
#include <unordered_map>
#include <algorithm>
#include <functional>
#include <time.h>
int main(int argc, char* argv[]){
double time_temp;
int test_map_size = 5000;
std::unordered_map<unsigned int, std::vector<unsigned int> > test_map(test_map_size);
// Fill the test map with random intergers ---------------------------------
std::random_device rd;
std::mt19937 gen1(rd());
std::uniform_int_distribution<int> dist(1, 5);
auto gen = std::bind(dist, gen1);
for(int i = 0; i < test_map_size; i++)
{
int vector_len = dist(gen1);
std::vector<unsigned int> tt(vector_len);
std::generate(begin(tt), end(tt), gen);
test_map.insert({i,tt});
}
// Sequential implementation -----------------------------------------------
time_temp = omp_get_wtime();
std::vector<unsigned int> bin_i, bin_j;
unsigned int node_v, node_u;
unsigned int len_rank_bin_i;
unsigned int len_rank_bin_j;
int sum_s = 0;
for (unsigned int i = (test_map_size - 1); i >= 1; --i) {
bin_i = test_map.at(i);
len_rank_bin_i = bin_i.size();
for (unsigned int j = i; j-- > 0; ) {
bin_j = test_map.at(j);
len_rank_bin_j = bin_j.size();
for (unsigned int u_i = 0; u_i < len_rank_bin_i; u_i++) {
node_u = bin_i[u_i];
for (unsigned int v_i = 0; v_i < len_rank_bin_j; v_i++) {
node_v = bin_j[v_i];
if (node_u> node_v)
sum_s += 1;
}
}
}
}
std::cout<<"Estimated sum (seq): "<<sum_s<<std::endl;
time_temp = omp_get_wtime() - time_temp;
printf("Time taken for sequential implementation: %.2fs\n", time_temp);
// Parallel implementation -----------------------------------------------
time_temp = omp_get_wtime();
int sum_p = 0;
omp_set_num_threads(4);
#pragma omp parallel
{
std::vector<unsigned int> bin_i, bin_j;
unsigned int node_v, node_u;
unsigned int len_rank_bin_i;
unsigned int len_rank_bin_j;
unsigned int i, u_i, v_i;
int j;
#pragma omp parallel for private(j,u_i,v_i) reduction(+:sum_p)
for (i = (test_map_size - 1); i >= 1; --i) {
bin_i = test_map.at(i);
len_rank_bin_i = bin_i.size();
#pragma omp parallel for private(u_i,v_i)
for (j = (i - 1); j >= 0; --j) {
bin_j = test_map.at(j);
len_rank_bin_j = bin_j.size();
#pragma omp parallel for private(v_i)
for (u_i = 0; u_i < len_rank_bin_i; u_i++) {
node_u = bin_i[u_i];
#pragma omp parallel for
for (v_i = 0; v_i < len_rank_bin_j; v_i++) {
node_v = bin_j[v_i];
if (node_u> node_v)
sum_p += 1;
}
}
}
}
}
std::cout<<"Estimated sum (parallel): "<<sum_p<<std::endl;
time_temp = omp_get_wtime() - time_temp;
printf("Time taken for parallel implementation: %.2fs\n", time_temp);
return 0;
}
在macOS 10.13.3(具有四个逻辑核心的i5处理器)中使用命令g++-7 -fopenmp -std=c++11 -O3 -Wall -o so_qn so_qn.cpp
运行代码,提供以下输出:
Estimated sum (seq): 38445750
Time taken for sequential implementation: 0.49s
Estimated sum (parallel): 38445750
Time taken for parallel implementation: 50.54s
并行实施所花费的时间比顺序实施高出数倍。您认为代码或逻辑可以推导为并行实现吗?我花了几天时间来改善代码的可怕性能,但无济于事。任何帮助是极大的赞赏。
更新
随着JimCownie建议的改变,即&#34;使用omp,而不是omp parallel for&#34;并且去除内环的平行,性能大大提高。
Estimated sum (seq): 42392944
Time taken for sequential implementation: 0.48s
Estimated sum (parallel): 42392944
Time taken for parallel implementation: 0.27s
我的CPU有四个逻辑核心(我正在使用四个线程),现在我想知道,无论如何都会有比连续实现好四倍的性能。
当我的矢量test_map
的地图很短但在每个级别都很胖时,我看到了一个不同的问题,即地图大小很小,但每个键的矢量大小非常大。在这种情况下,顺序和并行实现的性能是可比较的,没有太大差别。看起来我们也需要并行化内部循环。你知道如何在这种背景下实现它吗?