我是并行编程的新手。根据下面的示例代码,有人能解释一下为什么使用 OpenMP sections 并行运行反而比单线程运行慢吗?有什么改进建议吗?
#include<iostream>
#include <vector>
#include <chrono>
#include <numeric>
#include<omp.h>
using namespace std;
// Forward declarations of the four benchmark workloads. Their definitions
// follow main(); each one simply returns A + B.
int Calculation_1(int A, int B);
int Calculation_2(int A, int B);
int Calculation_3(int A, int B);
int Calculation_4(int A, int B);
/// Micro-benchmark comparing four trivial calculations executed as OpenMP
/// sections against the same four calculations executed sequentially.
///
/// For each variant the program prints the sum of all results followed by the
/// elapsed time in nanoseconds, then waits for a key press before exiting.
///
/// @return 0 on completion.
int main() {
    std::vector<int> W;
    std::vector<int> X;
    std::vector<int> Y;
    std::vector<int> Z;

    // ---- Parallel variant: one OpenMP section per calculation ----
    const auto begin1 = std::chrono::steady_clock::now();
    omp_set_num_threads(4);
#pragma omp parallel
    {
#pragma omp sections nowait
        {
            // Each section appends to its own vector, so the sections share no data.
#pragma omp section
            {
                W.push_back(Calculation_1(5, 5));
            }
#pragma omp section
            {
                X.push_back(Calculation_2(5, 5));
            }
#pragma omp section
            {
                Y.push_back(Calculation_3(5, 5));
            }
#pragma omp section
            {
                Z.push_back(Calculation_4(5, 5));
            }
        }
    }
    const int parallel_sum = std::accumulate(W.begin(), W.end(), 0)
                           + std::accumulate(X.begin(), X.end(), 0)
                           + std::accumulate(Y.begin(), Y.end(), 0)
                           + std::accumulate(Z.begin(), Z.end(), 0);
    // Stop the clock before printing so console I/O is not part of the measurement.
    const auto end1 = std::chrono::steady_clock::now();
    std::cout << "Parallel = " << parallel_sum << std::endl;
    std::cout << "Time difference = "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1).count()
              << std::endl;

    // Release the buffers instead of calling clear(): clear() keeps the
    // capacity, which would let the sequential run below skip the allocations
    // that the parallel run had to perform.
    std::vector<int>().swap(W);
    std::vector<int>().swap(X);
    std::vector<int>().swap(Y);
    std::vector<int>().swap(Z);

    // ---- Sequential variant ----
    const auto begin2 = std::chrono::steady_clock::now();
    W.push_back(Calculation_1(5, 5));
    X.push_back(Calculation_2(5, 5));
    Y.push_back(Calculation_3(5, 5));
    Z.push_back(Calculation_4(5, 5));
    const int single_sum = std::accumulate(W.begin(), W.end(), 0)
                         + std::accumulate(X.begin(), X.end(), 0)
                         + std::accumulate(Y.begin(), Y.end(), 0)
                         + std::accumulate(Z.begin(), Z.end(), 0);
    // Same rule as above: take the end timestamp before any output.
    const auto end2 = std::chrono::steady_clock::now();
    std::cout << "single = " << single_sum << std::endl;
    std::cout << "Time difference = "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(end2 - begin2).count()
              << std::endl;

    // Keep the console window open until the user presses Enter.
    std::cin.get();
    return 0;
}
// First benchmark workload: adds its two operands.
// @param A first addend
// @param B second addend
// @return the sum A + B
int Calculation_1(int A, int B) {
    const int sum = A + B;
    return sum;
}
// Second benchmark workload: adds its two operands.
// @param A first addend
// @param B second addend
// @return the sum A + B
int Calculation_2(int A, int B) {
    const int sum = A + B;
    return sum;
}
// Third benchmark workload: adds its two operands.
// @param A first addend
// @param B second addend
// @return the sum A + B
int Calculation_3(int A, int B) {
    const int sum = A + B;
    return sum;
}
// Fourth benchmark workload: adds its two operands.
// @param A first addend
// @param B second addend
// @return the sum A + B
int Calculation_4(int A, int B) {
    const int sum = A + B;
    return sum;
}
结果如下: 并行 = 40,时间 = 9168172
单线程 = 40,时间 = 225580
并行版本比单线程版本慢约40倍。
//我还按照建议(代码如下)向向量中推入了大量数字。结果是并行版本比单线程版本慢约9倍:
并行 时间 = 12907862
单线程 时间 = 1334519
chrono::steady_clock::time_point begin1 = std::chrono::steady_clock::now();
omp_set_num_threads(2);
#pragma omp parallel
{
#pragma omp sections nowait
{
#pragma omp section
{
for (int i = 0; i < 100000; i++) {
X.push_back(i);
}
}
#pragma omp section
{
for (int j = 0; j < 100000; j++) {
Y.push_back(j);
}
}
}
}
cout << "Parallel = " << accumulate(X.begin(), X.end(), 0) + accumulate(Y.begin(), Y.end(), 0) << endl;;
chrono::steady_clock::time_point end1 = std::chrono::steady_clock::now();
cout << "Time difference = " << std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1).count() << std::endl;
//Clear vector
X.clear();
Y.clear();
////Sigle
chrono::steady_clock::time_point begin2 = std::chrono::steady_clock::now();
for (int i = 0; i < 100000; i++) {
X.push_back(i);
}
for (int j = 0; j < 100000; j++) {
Y.push_back(j);
}
cout << "single = " << accumulate(X.begin(), X.end(), 0) + accumulate(Y.begin(), Y.end(), 0) << endl;
chrono::steady_clock::time_point end2 = std::chrono::steady_clock::now();
cout << "Time difference = " << std::chrono::duration_cast<std::chrono::nanoseconds>(end2 - begin2).count() << std::endl;
非常感谢,
答案 0 :(得分:1)
另请注意,对于如此简单的计算,派发线程所花的时间可能比直接在单个线程中完成计算还要长。另外,正如 user0042 所说,如果你创建的线程数多于计算机的核心数,这些线程就会开始争用并共享资源(核心),线程的换入换出同样会拖慢计算。
答案 1 :(得分:0)
要真正并行计算内容(运行线程彼此独立),您需要拥有与线程关联的专用CPU内核。
如果您拥有的线程多于可用的CPU内核,则只会引入有关线程创建和调度的开销。这很可能是你的代码放慢速度的原因。