Question

直到最近，并行编程的前景才引起我的注意。从那以后，我使用了各种并行编程库，我的第一站大概是英特尔线程构建模块（Intel Threading Building Blocks，TBB）。然而，舍入误差（round-off）以及这些程序在不同处理器架构上不可预测的行为所导致的错误，往往成为瓶颈。下面是一段计算两组值的 Pearson 相关系数的代码，它采用了 TBB 的基本并行模式——*parallel_for* 和 *parallel_reduce*：

    // A programme to calculate Pearsons Correlation coefficient 

#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/tick_count.h>




using namespace std;
using namespace tbb;
// Number of samples in each of the two input arrays.
const size_t n = 100000;
// File-scope floating-point counter; starts at zero.
double global = 0.0;

// State used by the serial implementation (class serials).
// Objects declared here have static storage duration, so those without an
// explicit initializer still start out as zero / null.
namespace s
{
    // Sample arrays; allocated by serials::computemean_serial().
    double *a;
    double *b;

    // Loop index shared by the serial loops.
    int j;

    // Means of a[] and b[].
    double mean_a;
    double mean_b;

    // Standard-deviation accumulators / results for a[] and b[].
    double sd_a = 0;
    double sd_b = 0;

    // Pearson correlation coefficient accumulator / result.
    double pcc = 0;

    // Running sums of a[] and b[].
    double sum_a;
    double sum_b;

    // Floating-point counter used as the argument of sin()/cos().
    double i;
}

// State used by the parallel implementation (class parallel).
// Objects declared here have static storage duration, so they all start out
// as zero / null.
namespace p
{
    // Sample arrays; allocated by parallel::computemean_parallel().
    double *a;
    double *b;

    // Means of a[] and b[].
    double mean_a;
    double mean_b;

    // Pearson correlation coefficient.
    double pcc;

    // Running sums of a[] and b[], plus a floating-point counter.
    double sum_a;
    double sum_b;
    double i;

    // Standard deviations of a[] and b[].
    double sd_a;
    double sd_b;
}


// Serial reference implementation: fills two sample arrays and computes their
// means, population standard deviations and Pearson correlation coefficient.
//
// All intermediate state lives in namespace s, so the methods must be called
// in this order:
//   computemean_serial() -> computesd_serial()
//                        -> pearson_correlation_coefficient_serial()
class serials
{
public:
    // Allocates a[] and b[] (n doubles each), fills them with
    // a[j] = sin(j), b[j] = cos(j) for j in [0, n), then stores and prints
    // the mean of each array.
    void computemean_serial()
    {
        using namespace s;
        sum_a = 0;
        sum_b = 0;
        i = 0;

        // Release buffers left over from an earlier call so that calling this
        // method more than once does not leak (free(NULL) is a no-op).
        free(a);
        free(b);
        a = static_cast<double*>(malloc(n * sizeof(double)));
        b = static_cast<double*>(malloc(n * sizeof(double)));

        for (j = 0; j < n; j++, i++)
        {
            a[j] = sin(i);
            b[j] = cos(i);

            sum_a += a[j];
            sum_b += b[j];
        }
        mean_a = sum_a / n;
        mean_b = sum_b / n;
        cout << "\nMean of a :" << mean_a;
        cout << "\nMean of b :" << mean_b;
    }

    // Computes and prints the population standard deviation (divide by n)
    // of a[] and b[], using the means stored by computemean_serial().
    void computesd_serial()
    {
        using namespace s;

        // Start from zero on every call; previously the accumulators were
        // only zeroed once at program start, so a second call gave a wrong
        // result.
        sd_a = 0;
        sd_b = 0;

        for (j = 0; j < n; j++)
        {
            const double dev_a = a[j] - mean_a;
            const double dev_b = b[j] - mean_b;
            // Plain multiplication instead of pow(x, 2).
            sd_a += dev_a * dev_a;
            sd_b += dev_b * dev_b;
        }
        sd_a = sqrt(sd_a / n);
        sd_b = sqrt(sd_b / n);
        cout << "\nStandard deviation of a :" << sd_a;
        cout << "\nStandard deviation of b :" << sd_b;
    }

    // Computes and prints the Pearson correlation coefficient
    //   sum((a[j]-mean_a)*(b[j]-mean_b)) / (n * sd_a * sd_b)
    // using the means and standard deviations stored by the two methods above.
    void pearson_correlation_coefficient_serial()
    {
        using namespace s;
        pcc = 0;
        for (j = 0; j < n; j++)
        {
            pcc += (a[j] - mean_a) * (b[j] - mean_b);
        }
        pcc = pcc / (n * sd_a * sd_b);
        cout << "\nPearson Correlation Coefficient: " << pcc;
    }

};


class parallel
{
public:

class compute_mean 
{

double *store1,*store2;
public: 

double mean_a,mean_b;

    void operator()( const blocked_range<size_t>& r)
    {
    double *a= store1;
    double *b= store2;

    for(size_t i =r.begin();i!=r.end(); ++i)
    {    
         mean_a+=a[i];
         mean_b+=b[i];
    }
    }
    compute_mean( compute_mean& x, split) : store1(x.store1),store2(x.store2),mean_a(0),mean_b(0){}

    void join(const compute_mean& y) {mean_a+=y.mean_a;mean_b+=y.mean_b;}
    compute_mean(double* a,double* b): store1(a),store2(b),mean_a(0),mean_b(0){}
};

               class read_array
                {
               double *const a,*const b;

                 public:

             read_array(double* vec1, double* vec2) : a(vec1),b(vec2){}  // constructor copies the arguments into local store 
             void operator() (const blocked_range<size_t> &r) const {              // opration to be used in parallel_for 

                     for(size_t k = r.begin(); k!=r.end(); k++,global++)
                     {   
                         a[k]=sin(global);
                         b[k]=cos(global);
                     }

                 }};

            void computemean_parallel()
                        {
                        using namespace p;
                        i=0;
                        a=(double*) malloc(n*sizeof(double));
                        b=(double*) malloc(n*sizeof(double));

                parallel_for(blocked_range<size_t>(0,n,5000),read_array(a,b));
                compute_mean sf(a,b);
                parallel_reduce(blocked_range<size_t>(0,n,5000),sf);
                mean_a=sf.mean_a/n;
                mean_b=sf.mean_b/n;
                cout<<"\nMean of a :"<<mean_a;
                cout<<"\nMean of b :"<<mean_b;
               }

class compute_sd 
{
double *store1,*store2;
double store3,store4;
public: 
double sd_a,sd_b,dif_a,dif_b,temp_pcc;
void operator()( const blocked_range<size_t>& r)
{
    double *a= store1;
    double *b= store2;
    double mean_a=store3;
    double mean_b=store4;
    for(size_t i =r.begin();i!=r.end(); ++i)
    { 
     dif_a=a[i]-mean_a;
     dif_b=b[i]-mean_b;
     temp_pcc+=dif_a*dif_b;
     sd_a+=pow(dif_a,2);
     sd_b+=pow(dif_b,2);
    }}
    compute_sd( compute_sd& x, split) : store1(x.store1),store2(x.store2),store3(p::mean_a),store4(p::mean_b),sd_a(0),sd_b(0),temp_pcc(0){}
    void join(const compute_sd& y) {sd_a+=y.sd_a;sd_b+=y.sd_b;}
    compute_sd(double* a,double* b,double mean_a,double mean_b): store1(a),store2(b),store3(mean_a),store4(mean_b),sd_a(0),sd_b(0),temp_pcc(0){}
};


               void computesd_and_pearson_correlation_coefficient_parallel()
               {
               using namespace p;
               compute_sd obj2(a,b,mean_a,mean_b);
               parallel_reduce(blocked_range<size_t>(0,n,5000),obj2);
               sd_a=obj2.sd_a;
               sd_b=obj2.sd_b;
               sd_a=sd_a/n;
               sd_a=sqrt(sd_a);
               sd_b=sd_b/n;
               sd_b=sqrt(sd_b);
               cout<<"\nStandard deviation of a :"<<sd_a;
               cout<<"\nStandard deviation of b :"<<sd_b;
               pcc=obj2.temp_pcc;
               pcc=pcc/(n*sd_a*sd_b);
               cout<<"\nPearson Correlation Coefficient: "<<pcc;
               }
};

// Runs the serial and the parallel pipeline once each, prints their results
// and reports the wall-clock time of both.
//
// The return type is spelled out: a `main` without a return type is not valid
// standard C++.
int main()
{
        serials obj_s;
        parallel obj_p;

        // --- serial run, timed ---
        cout<<"\nSerial Part";
        cout<<"\n-----------";
        tick_count start_s=tick_count::now();
        obj_s.computemean_serial();
        obj_s.computesd_serial();
        obj_s.pearson_correlation_coefficient_serial();
        tick_count end_s=tick_count::now();
        cout<<"\n";

        // The TBB scheduler object is created after the serial section and
        // before the parallel timer starts, so it is included in neither
        // measurement.
        task_scheduler_init init;

        // --- parallel run, timed ---
        cout<<"\nParallel Part";
        cout<<"\n-------------";
        tick_count start_p=tick_count::now();
        obj_p.computemean_parallel();
        obj_p.computesd_and_pearson_correlation_coefficient_parallel();
        tick_count end_p=tick_count::now();
        cout<<"\n";

        // --- timing report ---
        cout<<"\nTime Estimates";
        cout<<"\n--------------";
        cout<<"\nSerial Time :"<<(end_s-start_s).seconds()<<" Seconds";
        cout<<"\nParallel time :"<<(end_p-start_p).seconds()<<" Seconds\n";

        return 0;
}

好吧！它在装有 Core i5 的 Windows 机器上运行良好：输出中的每个参数在串行部分和并行部分完全相同，而且并行代码比串行代码快得多。这是我的输出：

操作系统：Windows 7 旗舰版 64 位；处理器：Core i5

Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Parallel Part
-------------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Time Estimates
--------------
Serial Time : 0.0204829 Seconds
Parallel Time : 0.00939971 Seconds

那么其他机器呢？如果我说它也能正常工作，至少我的一些朋友会说：“等等，伙计！有点不对劲。”虽然并行代码始终比串行代码快，但在不同的机器上，并行代码与串行代码给出的结果之间存在细微差别。是什么造成了这些差异？我们对这种异常行为得出的结论是舍入误差，它是过度并行化以及处理器架构差异所带来的代价。

这引出了我的问题：

当我们在代码中使用并行处理库以利用多核处理器时，需要采取哪些预防措施？
在哪些情况下，即使有多个处理器可用，我们也不应该使用并行方法？
我们可以做些什么来避免舍入误差？（需要指出的是，我说的不是强制使用互斥锁和屏障——它们有时会限制并行性的扩展——而是一些有时很方便的简单编程技巧。）

我很高兴看到您对这些问题的建议。如果时间有限，请随意只回答最适合您的那一部分。

编辑 - 我在此处添加了更多结果

操作系统：Linux Ubuntu 64 位；处理器：Core i5

    Serial Part
    -----------
    Mean of a :1.81203e-05
    Mean of b :1.0324e-05
    Standard deviation of a :0.707107
    Standard deviation of b :0.707107
    Pearson Correlation Coefficient: 3.65091e-07

    Parallel Part
    -------------
    Mean of a :-0.000233041
    Mean of b :0.00414375
    Standard deviation of a :2.58428
    Standard deviation of b :54.6333
    Pearson Correlation Coefficient: -0.000538456

    Time Estimates
    --------------
    Serial Time :0.0161237 Seconds
    Parallel Time :0.0103125 Seconds

操作系统：Linux Fedora 64 位；处理器：Core i3

Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Parallel Part
-------------
Mean of a :-0.00197118
Mean of b :0.00124329
Standard deviation of a :0.707783
Standard deviation of b :0.703951
Pearson Correlation Coefficient: -0.129055

Time Estimates
--------------
Serial Time :0.02257 Seconds
Parallel Time :0.0107966 Seconds

编辑：按照提示修改之后的结果

操作系统：Linux Ubuntu 64 位；处理器：Core i5

Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Parallel Part
-------------
Mean of a :-0.000304446
Mean of b :0.00172593
Standard deviation of a :0.708465
Standard deviation of b :0.7039
Pearson Correlation Coefficient: -0.140716

Time Estimates
--------------
Serial Time :0.0235391 Seconds
Parallel time :0.00810775 Seconds

最诚挚的问候。

注1：我不保证上面的代码完全正确，但我相信它是正确的。

注2：这段代码也在 Linux 机器上进行了测试。

注3：尝试了不同的粒度组合和自动分区选项。

Answer 1

我对 compute_mean( compute_mean& x, split) 构造函数中被注释掉的 /*,mean_a(0),mean_b(0)*/ 深感怀疑。看起来这些差异可能来自未初始化的数据污染了结果。我猜在结果一致的那些机器上，要么没有发生任务拆分，要么这些成员所在的内存恰好为零。

同样，您的 compute_sd( compute_sd& x, split) 构造函数没有初始化 store3 和 store4。

Answer 2

这引出了我的问题：

当我们在代码中使用并行处理库以利用多核处理器时，我们需要采取哪些预防措施？

除了另一个回答中提到的要点之外，你的问题似乎并不是并行性所特有的。设计数值稳定的浮点算法本来就很难；而要有效利用并行性，就不可避免地会降低确定性，这会把算法本身的不足暴露出来。我的意思请参见下文。在判断数值不稳定究竟是由并行性还是由算法引起之前，你应该先测试串行代码对输入数据顺序的稳健性。

在哪些情况下，即使有多个处理器可用，我们也不应该使用并行方法？

当循环中的操作不足以支付开销时。这取决于算法，硬件和问题大小。

我们可以做些什么来避免舍入错误？（让我指出我不是在谈论强制执行互斥和障碍，这可能有时会限制并行性的扩展但是关于简单的编程技巧这有时候很方便）

无论是编写串行代码还是并行代码，都应该使用专为数值稳定性而设计的算法。你在高中学到的那些算法是为了便于理解而设计的！ :-) 例如，请参阅 http://en.m.wikipedia.org/wiki/Algorithms_for_calculating_variance。

采用TBB的并行性 - 我们的检查清单应该是什么？

2 个答案: