Mutexes in LLVM / OpenMP are very slow

Time: 2017-02-22 15:45:51

Tags: c++ llvm openmp mutex

I wrote code to test the performance of OpenMP on Windows (Win7 x64, Core i7 3.4 GHz) and on Mac (10.12.3, Core i7 2.7 GHz).

  1. In Xcode I created a console application with the default build settings. On macOS 10.12.3 (Core i7, 2.7 GHz) I use LLVM 3.7 and OpenMP 5 (searching omp.h shows #define KMP_VERSION_MAJOR 5, #define KMP_VERSION_MINOR 0 and #define KMP_VERSION_BUILD 20150701; the runtime is libiomp5 — the sketch after this list shows how those macros can be checked from code).
  2. For Windows I use VS2010 SP1. I also set C/C++ → Optimization → Optimization = Maximize Speed (/O2) and C/C++ → Optimization → Favor Size or Speed = Favor Fast Code (/Ot).
  3. If I run the application on a single thread, the time difference roughly matches the ratio of the processor clock speeds. But with 4 threads the difference becomes striking: the Windows program is 70 times faster than the Mac program.
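
For reference, a minimal sketch of how the runtime version information mentioned above can be checked from code; KMP_VERSION_MAJOR/MINOR/BUILD are runtime-specific macros that the libiomp5 omp.h defines, while _OPENMP is the standard version macro:

    #include <cstdio>
    #include <omp.h>
    
    int main()
    {
    #ifdef _OPENMP
        std::printf("_OPENMP = %d\n", _OPENMP);   // standard OpenMP version-date macro
    #endif
    #ifdef KMP_VERSION_MAJOR
        // macros specific to the LLVM/Intel OpenMP runtime (libiomp5) omp.h
        std::printf("KMP %d.%d build %d\n",
                    KMP_VERSION_MAJOR, KMP_VERSION_MINOR, KMP_VERSION_BUILD);
    #endif
        return 0;
    }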

    #include <cmath>
    #include <mutex>
    #include <cstdint>
    #include <cstdio>
    #include <iostream>
    #include <omp.h>
    #include <boost/chrono/chrono.hpp>
    
    static double ActionWithNumber(double number)
    {
        double sum = 0.0f;
        for (std::uint32_t i = 0; i < 50; i++)
        {
            double coeff = sqrt(pow(std::abs(number), 0.1));
            double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
            sum += sqrt(res);
        }
        return sum;
    }
    
    static double TestOpenMP(void)
    {
        const std::uint32_t len = 4000000;
        double *a;
        double *b;
        double *c;
        double sum = 0.0;
    
        std::mutex _mutex;
        a = new double[len];
        b = new double[len];
        c = new double[len];
    
        for (std::uint32_t i = 0; i < len; i++)
        {
            c[i] = 0.0;
            a[i] = sin((double)i);
            b[i] = cos((double)i);
        }
        boost::chrono::time_point<boost::chrono::system_clock> start, end;
        start = boost::chrono::system_clock::now();
        double k = 2.0;
        omp_set_num_threads(4);
    #pragma omp parallel for 
        for (int i = 0; i < len; i++)
        {
            c[i] = k*a[i] + b[i] + k;
            if (c[i] > 0.0)
            {
                c[i] += ActionWithNumber(c[i]);
            }
            else
            {
                c[i] -= ActionWithNumber(c[i]);
            }
            std::lock_guard<std::mutex> scoped(_mutex);
            sum += c[i];
        }
        end = boost::chrono::system_clock::now();
        boost::chrono::duration<double> elapsed_time = end - start;
        double sum2 = 0.0;
        for (std::uint32_t i = 0; i < len; i++)
        {
            sum2 += c[i];
            c[i] /= sum2;
        }
        if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");
        delete[] a;
        delete[] b;
        delete[] c;
        return elapsed_time.count();
    }
    
    int main()
    {
    
        double sum = 0.0;
        const std::uint32_t steps = 5;
        for (std::uint32_t i = 0; i < steps; i++)
        {
            sum += TestOpenMP();
        }
        sum /= (double)steps;
        std::cout << "Elapsed time = " <<  sum;
        return 0;
    }
    

I use a mutex here deliberately, in order to compare how OpenMP behaves on "Mac" and "Win". On "Win" the function returns a time of 0.39 seconds. On "Mac" the function returns a time of 25 seconds, i.e. 70 times slower.

What causes this difference?

First of all, thanks for editing my post (I used a translator to write the text). In the real application I update values in a huge matrix (20000×20000) in random order. Each thread computes a new value and writes it into a particular cell. I created a mutex for each row, because in most cases different threads write to different rows, but evidently there are cases where two threads write to the same row and a long lock occurs. At the moment I cannot partition the rows among the threads, because the order of writes is dictated by the FEM elements. And simply putting a critical section in there is no solution, because it would block writing to the whole matrix.

Here is code modelled on the real application:

    #include <cmath>
    #include <cstdlib>
    #include <iostream>
    #include <omp.h>
    #include <boost/chrono/chrono.hpp>
    #include <boost/thread/mutex.hpp>
    
    typedef unsigned int u32;  // integer alias used by the loops below
    
    static double ActionWithNumber(double number)
    {
        const unsigned int steps = 5000;
        double sum = 0.0;
        for (u32 i = 0; i < steps; i++)
        {
            double coeff = sqrt(pow(std::abs(number), 0.1));
            double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
            sum += sqrt(res);
        }
        sum /= (double)steps;
        return sum;
    }
    
    static double RealAppTest(void)
    {
        const unsigned int elementsNum = 10000;
        double* matrix;
        unsigned int* elements;
        boost::mutex* mutexes;
    
        elements = new unsigned int[elementsNum*3];
        matrix = new double[elementsNum*elementsNum];
        mutexes = new boost::mutex[elementsNum];
        for (unsigned int i = 0; i < elementsNum; i++)
            for (unsigned int j = 0; j < elementsNum; j++)
                matrix[i*elementsNum + j] = (double)(rand() % 100);
        for (unsigned int i = 0; i < elementsNum; i++) //build FEM element like Triangle
        {
            elements[3*i] = rand()%(elementsNum-1);
            elements[3*i+1] = rand()%(elementsNum-1);
            elements[3*i+2] = rand()%(elementsNum-1);
        }
        boost::chrono::time_point<boost::chrono::system_clock> start, end;
        start = boost::chrono::system_clock::now();
        omp_set_num_threads(4);
    #pragma omp parallel for
        for (int i = 0; i < elementsNum; i++)
        {
            unsigned int* elems = &elements[3*i];
            for (unsigned int j = 0; j < 3; j++)
            {
                //in here set mutex for  row with index = elems[j];
                boost::lock_guard<boost::mutex> lockup(mutexes[i]);
                double res = 0.0;
                for (unsigned int k = 0; k < 3; k++)
                {
                    res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
                }
                for (unsigned int k = 0; k < 3; k++)
                {
                    matrix[elems[j]*elementsNum + elems[k]] = res;
                }
            }
        }
        end = boost::chrono::system_clock::now();
        boost::chrono::duration<double> elapsed_time = end - start;
    
        delete[] elements;
        delete[] matrix;
        delete[] mutexes;
        return elapsed_time.count();
    }
    
    int main()
    {
        double sum = 0.0;
        const u32 steps = 5;
        for (u32 i = 0; i < steps; i++)
        {
            sum += RealAppTest();
        }
        sum /= (double)steps;
        std::cout<<"Elapsed time = " <<  sum;
        return 0; 
    }
    

1 Answer:

Answer 0 (score: 3)

You are combining two different sets of threading/synchronization primitives: OpenMP, which is built into the compiler and has its own runtime system, and manually created POSIX mutexes via std::mutex. It is probably not surprising that some compiler/OS combinations have interoperability problems between the two.

My guess is that in the slow case the OpenMP runtime is going to excessive lengths to make sure there is no interaction between the higher-level OpenMP tasking going on in the threads and the manual mutexes, and doing that inside a tight loop causes the dramatic slowdown.

For mutex-like behaviour within the OpenMP framework itself, we can use critical sections:

#pragma omp parallel for 
for (int i = 0; i < len; i++)
{
    //...
    // replacing this: std::lock_guard<std::mutex> scoped(_mutex);
    #pragma omp critical
    sum += c[i];
}

Or explicit locks:

omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for 
for (int i = 0; i < len; i++)
{
    //...
    // replacing this: std::lock_guard<std::mutex> scoped(_mutex);
    omp_set_lock(&sumlock);
    sum += c[i];
    omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);

and we get much more reasonable timings:

$ time ./openmp-original
real    1m41.119s
user    1m15.961s
sys 1m53.919s

$ time ./openmp-critical
real    0m16.470s
user    1m2.313s
sys 0m0.599s

$ time ./openmp-locks
real    0m15.819s
user    1m0.820s
sys 0m0.276s
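
For this particular micro-benchmark, the shared update sum += c[i] could also be expressed as an OpenMP reduction, which keeps a private partial sum per thread and avoids per-iteration synchronization altogether; a minimal sketch of the same loop:

double sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < len; i++)
{
    //...
    // each thread accumulates into its own private copy of sum;
    // the partial sums are combined when the loop finishes
    sum += c[i];
}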

Update: Using an array of OpenMP locks in exactly the same way as the array of mutexes works without any problem:

omp_lock_t sumlocks[elementsNum];
for (unsigned idx=0; idx<elementsNum; idx++) 
    omp_init_lock(&(sumlocks[idx]));

//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
    unsigned int* elems = &elements[3*i];
    for (unsigned int j = 0; j < 3; j++)
    {
        //in here set mutex for  row with index = elems[j];
        double res = 0.0;
        for (unsigned int k = 0; k < 3; k++)
        {
            res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
        }
        omp_set_lock(&(sumlocks[i]));
        for (unsigned int k = 0; k < 3; k++)
        {
            matrix[elems[j]*elementsNum + elems[k]] = res;
        }
        omp_unset_lock(&(sumlocks[i]));
    }
}
for (unsigned idx=0; idx<elementsNum; idx++) 
    omp_destroy_lock(&(sumlocks[idx]));