我在分子动力学(MD)模拟中使用以下循环来计算力:
/// Plain record holding two integers.
/// In this file the work functions compare `a` between pairs of elements and
/// add to / subtract from `b` when a pair is close enough.
struct data_str {
    int a = 0;
    int b = 0;

    /// Initialises both fields from the given values.
    data_str(int a, int b) : a(a), b(b) {}
};
void work_func_both(std::vector<data_str> *data, int dist)
{
for(unsigned int i = 0; i < data->size()-1; i++)
{
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
这是我的应用程序中力计算循环的简化版本。我的主要问题是:计算耗时很长,尤其是在向量较大时。因此我想对它进行优化,并用下面的程序做了测试:
#include <omp.h>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>
// Benchmark parameters. They stay floating-point (as the former macros
// `1e3` / `1e5` were) so that expressions dividing by ROUNDS keep producing
// fractional per-round averages.
constexpr double VECTOR_SIZE = 1e3;  // number of elements in each test vector
constexpr double ROUNDS = 1e5;       // how many times each variant is invoked
/// Plain record holding two integers.
/// In this file the work functions compare `a` between pairs of elements and
/// add to / subtract from `b` when a pair is close enough.
struct data_str {
    int a = 0;
    int b = 0;

    /// Initialises both fields from the given values.
    data_str(int a, int b) : a(a), b(b) {}
};
void work_func_both(std::vector<data_str> *data, int dist)
{
#pragma omp parallel for
for(unsigned int i = 0; i < data->size()-1; i++)
{
#pragma omp parallel for
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
void work_func_first(std::vector<data_str> *data, int dist)
{
#pragma omp parallel for
for(unsigned int i = 0; i < data->size()-1; i++)
{
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
void work_func_second(std::vector<data_str> *data, int dist)
{
for(unsigned int i = 0; i < data->size()-1; i++)
{
#pragma omp parallel for
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
void work_func_none(std::vector<data_str> *data, int dist)
{
for(unsigned int i = 0; i < data->size()-1; i++)
{
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
int main(void)
{
std::vector<data_str> counter_vec1, counter_vec2, counter_vec3, counter_vec4;
for(unsigned int i = 0; i < VECTOR_SIZE; i++)
{
counter_vec1.push_back(data_str(i, i));
counter_vec2.push_back(data_str(i, i));
counter_vec3.push_back(data_str(i, i));
counter_vec4.push_back(data_str(i, i));
}
omp_set_num_threads(8);
const int length = 2;
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_both(&counter_vec1, length);
}
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
auto duration_both = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_first(&counter_vec2, length);
}
t2 = std::chrono::high_resolution_clock::now();
auto duration_first = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_second(&counter_vec3, length);
}
t2 = std::chrono::high_resolution_clock::now();
auto duration_second = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_none(&counter_vec4, length);
}
t2 = std::chrono::high_resolution_clock::now();
auto duration_none = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
std::cout << "Both took " << duration_both/ROUNDS << ", first took " << duration_first/ROUNDS << ", second took " << duration_second/ROUNDS << " and none took " << duration_none/ROUNDS << '\n';
return 0;
};
编译命令是
g++-5 -std=c++14 -O3 -fopenmp main.cpp -lgomp -o main
,输出结果为
Both took 593.704, first took 547.549, second took 3394.53 and none took 856.049
我现在的问题是:与原始(串行)版本相比,为什么只有 work_func_second() 函数返回了正确的结果?为什么它又比其他所有版本慢得多?我该如何优化这个循环,使其整体上更快?