优化MD计算的循环

时间:2016-10-26 11:06:36

标签: c++ optimization openmp

我有以下循环用于MD模拟中的力计算:

// Plain record with two integer fields; the pairwise update loops compare
// `a` between elements and modify `b`.
struct data_str{
    int a = 0;
    int b = 0;

    // Stores the two arguments in the members of the same name.
    data_str(int a_init, int b_init)
        : a(a_init), b(b_init)
    {
    }
};

void work_func_both(std::vector<data_str> *data, int dist)
{
    for(unsigned int i = 0; i < data->size()-1; i++)
    {
        for(unsigned int j = i+1; j < data->size(); j++)
        {
            if(abs((*data)[i].a - (*data)[j].a) < dist)
                {
                    (*data)[i].b += 3;
                    (*data)[j].b -= 1;
                }
        }
    }
}

这是我在应用程序中使用的力计算循环的简化版本。我的主要问题是:计算非常耗时,对于较大的向量尤其如此。因此,我想对其进行优化,并编写了以下测试程序:

#include <iostream>
#include <omp.h>
#include <vector>
#include <chrono>

#define VECTOR_SIZE 1e3
#define ROUNDS 1e5

// Plain record with two integer fields; the pairwise update loops compare
// `a` between elements and modify `b`.
struct data_str{
    int a = 0;
    int b = 0;

    // Stores the two arguments in the members of the same name.
    data_str(int a_init, int b_init)
        : a(a_init), b(b_init)
    {
    }
};

void work_func_both(std::vector<data_str> *data, int dist)
{
    #pragma omp parallel for
    for(unsigned int i = 0; i < data->size()-1; i++)
    {
        #pragma omp parallel for
        for(unsigned int j = i+1; j < data->size(); j++)
        {
            if(abs((*data)[i].a - (*data)[j].a) < dist)
                {
                    (*data)[i].b += 3;
                    (*data)[j].b -= 1;
                }
        }
    }
}

void work_func_first(std::vector<data_str> *data, int dist)
{
    #pragma omp parallel for
    for(unsigned int i = 0; i < data->size()-1; i++)
    {
        for(unsigned int j = i+1; j < data->size(); j++)
        {
            if(abs((*data)[i].a - (*data)[j].a) < dist)
                {
                    (*data)[i].b += 3;
                    (*data)[j].b -= 1;
                }
        }
    }
}

void work_func_second(std::vector<data_str> *data, int dist)
{
    for(unsigned int i = 0; i < data->size()-1; i++)
    {
        #pragma omp parallel for
        for(unsigned int j = i+1; j < data->size(); j++)
        {
            if(abs((*data)[i].a - (*data)[j].a) < dist)
                {
                    (*data)[i].b += 3;
                    (*data)[j].b -= 1;
                }
        }
    }
}

void work_func_none(std::vector<data_str> *data, int dist)
{
    for(unsigned int i = 0; i < data->size()-1; i++)
    {
        for(unsigned int j = i+1; j < data->size(); j++)
        {
            if(abs((*data)[i].a - (*data)[j].a) < dist)
                {
                    (*data)[i].b += 3;
                    (*data)[j].b -= 1;
                }
        }
    }
}


int main(void)
{
    std::vector<data_str> counter_vec1, counter_vec2, counter_vec3, counter_vec4;

    for(unsigned int i = 0; i < VECTOR_SIZE; i++)
    {
        counter_vec1.push_back(data_str(i, i));
        counter_vec2.push_back(data_str(i, i));
        counter_vec3.push_back(data_str(i, i));
        counter_vec4.push_back(data_str(i, i));
    }
    omp_set_num_threads(8);
    const int length = 2;
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        work_func_both(&counter_vec1, length);
    }
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    auto duration_both = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();

    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        work_func_first(&counter_vec2, length);
    }
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_first = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();

    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        work_func_second(&counter_vec3, length);
    }
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_second = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();

    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        work_func_none(&counter_vec4, length);
    }
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_none = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();

    std::cout << "Both took " << duration_both/ROUNDS << ", first took " << duration_first/ROUNDS << ", second took " << duration_second/ROUNDS << " and none took " << duration_none/ROUNDS << '\n';
    return 0;
};

编译命令是

 g++-5 -std=c++14 -O3 -fopenmp main.cpp -lgomp -o main

结果输出为

Both took 593.704, first took 547.549, second took 3394.53 and none took 856.049

我现在的问题是:为什么只有 work_func_second() 函数的结果与原始(串行)版本一致,而它又为什么比其他所有版本慢得多?我该如何优化这个循环,使其整体更快?

0 个答案:

没有答案