Question

我正在为uni进行一些蒙特卡罗模拟，我注意到我的结果略有不同，这取决于我是以串行方式还是并行方式运行代码。我从Weibull分布生成数据点（x [i]），计算x [i]处的（理论）CDF，然后通过KS检验将其与经验CDF（1 / n）进行比较。我做了10,000次，然后将95％的值打印到csv文件。整个过程重复100次，因此得到的csv是一列100行。我采用这些的平均值（手动在Libre Math中）并获得不同场景的以下结果

串行：平均值= 1.340 +/- 0.001这是正确的值（我有一个比较的基准）

并行：平均值= 1.345 +/- 0.002

我用g ++ 5.4.0和以下标志

进行编译

    -fopenmp -fno-inline -std=c++17 -g -Wfatal-errors -Wextra -Wall

错误边距正是我所观察到的，并且在统计上并不严谨。差异足以影响我的结论。这是真实代码的一个极其简化的版本，我的所有初步想法（例如共享的对象）都没有解决问题。

此外，我注意到如果我包含以下优化标记，我会得到相同（并且正确）的串行和并行情况。

 -Ofast

我的问题是

1）为什么我会得到不同的结果？

2）在一种情况下我怎么做才能扰乱编译器而不是另一种情况呢？

为了感兴趣，我在双核机器上运行它，它是超线程的，有4个逻辑核心。非常感谢你。

#include <iostream>
#include <cmath>
#include <thread>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h> 
#include <random>
#include <algorithm>
#include <fstream>


// INITIAL
const float scale = 400.0; const float shape = 0.577; 
const int n=100;    
const int numOfCV=100;const int numOfSims=10000;
std::random_device rd;
std::mt19937_64 e2(rd());
std::uniform_real_distribution<double> uniform_dist(0, 1);

int main(){

   // DETERMINE NUMBER OF CORES
    const unsigned Cores = std::thread::hardware_concurrency();
    std::cout<<"Cores="<<Cores<<std::endl;
    omp_set_num_threads(Cores);
    //OPEN OUTPUT FILE
    std::ofstream csvfile; csvfile.open("KS_D95.csv");

    #pragma omp parallel for // I comment this line out to run serially
    for(int CV_inc = 0; CV_inc < numOfCV; ++CV_inc){

      double x[n];
      double CDF_emp[n];
      double CDF_theory[n];
      for(int i=0; i<n ;++i){
          CDF_emp[i]=(1.0+i)/n;
      }
        double DnP[numOfSims];
        double D95;
        double dt;
        double dt0;

        int simNum = -1;
        while(simNum < numOfSims-1 ){simNum++;
            // GENERATE DATA POINTS
            for(int i=0; i<n ;++i){
                x[i] = scale*pow(-log(uniform_dist(e2)),1/shape);
            }
            // CALCULATE CDF        
            for(int i=0; i<n ;++i){
                CDF_theory[i]=1-exp(-pow(x[i]/scale,shape));
                if ( std::isinf(CDF_theory[i])) {
                    printf("CDF_theory[i]=%f \n",CDF_theory[i]);
                }
            }
            std::sort(CDF_theory,CDF_theory+n);

            // KS TEST 
            dt=0.0;dt0=0.0;
            for(int i=1; i<n ;++i){
            if (dt<std::abs(CDF_theory[i]-CDF_emp[i])){
            dt=std::abs(CDF_theory[i]-CDF_emp[i]);
                }
            if (dt0<std::abs(CDF_theory[i]-CDF_emp[i-1])){
                dt0=std::abs(CDF_theory[i]-CDF_emp[i-1]);
            }
        }
            DnP[simNum]= sqrt(n)*std::max(dt,dt0);
        }
        // PRINT RESULT
        std::sort(DnP,DnP+numOfSims);
        D95=DnP[(int)(numOfSims*0.95)-1];
        std::cout<<"line complete, KS_D95="<< D95 <<std::endl;
        csvfile<<D95<<std::endl;

    }
    return 0;
}

Answer 1

让每个线程构建自己的随机生成器版本。一种超级天真的方法是在parallel loop

中移动以下三行

    #pragma omp parallel for // I comment this line out to run serially
    for(int CV_inc = 0; CV_inc < numOfCV; ++CV_inc){
       std::random_device rd;
       std::mt19937_64 e2(rd());
       std::uniform_real_distribution<double> uniform_dist(0, 1);

这样每个线程都有一个独立的随机数生成器。

使用这个我得到以下结果（在8个核心上并行）：

 ------------------------------
|  RUN |  SERIAL    | PARALLEL |
|   1  | 1.3407766  | 1.3405413|    
|   2  | 1.3398708  | 1.3400575|
|   3  | 1.3404915  | 1.3414703|
|   4  | 1.3406889  | 1.3402073|
--------------------------------

初始化这些内容的更好方法是分别使用parallel区域和for，如下所示：

#pragma omp parallel
  {
    std::random_device rd;
    std::mt19937_64 e2(rd());
    std::uniform_real_distribution<double> uniform_dist(0, 1);
#pragma omp for  // I comment this line out to run serially
    for (int CV_inc = 0; CV_inc < numOfCV; ++CV_inc) {

这里发生的事情我想是随机生成器使用每个线程的相同种子进行初始化，因此每个线程的每个独立调用号i生成相同的数字。这意味着您在所有线程上使用相同的数字序列。每个线程都需要不同的种子。

并行和串行实现（和编译器设置）之间的仿真结果不同

1 个答案: