使用Rcpp和openMP从截断正态分布快速采样

时间:2013-07-29 02:33:25

标签: r openmp rcpp

更新:

我试图实施Dirk的建议。评论? 我现在正忙于JSM,但我想在为画廊编织Rmd之前得到一些反馈。 我从犰狳换回正常的Rcpp,因为它没有增加任何价值。 带有R ::的标量版本非常好。 如果将mean / sd作为标量输入,而不是作为所需输出长度的向量,我应该在参数n中输入绘制数量。


有许多MCMC应用程序需要从截断的Normal分布中绘制样本。我建立了TN的现有实现,并添加了并行计算。

的问题:

  1. 有没有人看到进一步的速度提升? 在基准测试的最后一种情况下,rtruncnorm有时会更快。 Rcpp实现总是比现有的包更快,但是可以进一步改进吗?
  2. 我在一个我无法分享的复杂模型中运行它,我的R会话崩溃了。但是,我不能系统地重现它,所以它可能是代码的另一部分。如果有人在TN工作,请测试并告诉我。更新:我没有更新代码的问题,但请告诉我。
  3. 我如何把事情放在一起: 据我所知,最快的实现不在CRAN上,但源代码可以下载OSU stat。在我的基准测试中, msm trunco​​rm 中的竞争实现速度较慢。诀窍是有效地调整提案分布,其中指数很好地适用于截断的Normal的尾部。 所以我拿了Chris的代码,“Rcpp'ed”它并添加了一些openMP香料。动态调度在这里是最佳的,因为取样可以根据边界花费更多或更少的时间。 我发现一件令人讨厌的事情:当我想使用双打时,许多统计分布基于NumericVector类型。我只是编写了我的方式。

    继承人Rcpp代码:

    #include <Rcpp.h>
    #include <omp.h>
    
    
    // norm_rs(a, b)
    // generates a sample from a N(0,1) RV restricted to be in the interval
    // (a,b) via rejection sampling.
    // ======================================================================
    
    // [[Rcpp::export]]
    
    double norm_rs(double a, double b)
    {
       double  x;
       x = Rf_rnorm(0.0, 1.0);
       while( (x < a) || (x > b) ) x = norm_rand();
       return x;
    }
    
    // half_norm_rs(a, b)
    // generates a sample from a N(0,1) RV restricted to the interval
    // (a,b) (with a > 0) using half normal rejection sampling.
    // ======================================================================
    
    // [[Rcpp::export]]
    
    double half_norm_rs(double a, double b)
    {
       double   x;
       x = fabs(norm_rand());
       while( (x<a) || (x>b) ) x = fabs(norm_rand());
       return x;
    }
    
    // unif_rs(a, b)
    // generates a sample from a N(0,1) RV restricted to the interval
    // (a,b) using uniform rejection sampling. 
    // ======================================================================
    
    // [[Rcpp::export]]
    
    double unif_rs(double a, double b)
    {
       double xstar, logphixstar, x, logu;
    
       // Find the argmax (b is always >= 0)
       // This works because we want to sample from N(0,1)
       if(a <= 0.0) xstar = 0.0;
       else xstar = a;
       logphixstar = R::dnorm(xstar, 0.0, 1.0, 1.0);
    
       x = R::runif(a, b);
       logu = log(R::runif(0.0, 1.0));
       while( logu > (R::dnorm(x, 0.0, 1.0,1.0) - logphixstar))
       {
          x = R::runif(a, b);
          logu = log(R::runif(0.0, 1.0));
       }
       return x;
    }
    
    // exp_rs(a, b)
    // generates a sample from a N(0,1) RV restricted to the interval
    // (a,b) using exponential rejection sampling.
    // ======================================================================
    
    // [[Rcpp::export]]
    
    double exp_rs(double a, double b)
    {
      double  z, u, rate;
    
    //  Rprintf("in exp_rs");
      rate = 1/a;
    //1/a
    
       // Generate a proposal on (0, b-a)
       z = R::rexp(rate);
       while(z > (b-a)) z = R::rexp(rate);
       u = R::runif(0.0, 1.0);
    
       while( log(u) > (-0.5*z*z))
       {
          z = R::rexp(rate);
          while(z > (b-a)) z = R::rexp(rate);
          u = R::runif(0.0,1.0);
       }
       return(z+a);
    }
    
    
    
    
    // rnorm_trunc( mu, sigma, lower, upper)
    //
    // generates one random normal RVs with mean 'mu' and standard
    // deviation 'sigma', truncated to the interval (lower,upper), where
    // lower can be -Inf and upper can be Inf.
    //======================================================================
    
    // [[Rcpp::export]]
    double rnorm_trunc (double mu, double sigma, double lower, double upper)
    {
    int change;
     double a, b;
     double logt1 = log(0.150), logt2 = log(2.18), t3 = 0.725;
     double z, tmp, lograt;
    
     change = 0;
     a = (lower - mu)/sigma;
     b = (upper - mu)/sigma;
    
     // First scenario
     if( (a == R_NegInf) || (b == R_PosInf))
       {
         if(a == R_NegInf)
           {
         change = 1;
         a = -b;
         b = R_PosInf;
           }
    
         // The two possibilities for this scenario
         if(a <= 0.45) z = norm_rs(a, b);
         else z = exp_rs(a, b);
         if(change) z = -z;
       }
     // Second scenario
     else if((a * b) <= 0.0)
       {
         // The two possibilities for this scenario
         if((R::dnorm(a, 0.0, 1.0,1.0) <= logt1) || (R::dnorm(b, 0.0, 1.0, 1.0) <= logt1))
           {
         z = norm_rs(a, b);
           }
         else z = unif_rs(a,b);
       }
     // Third scenario
     else
       {
         if(b < 0)
           {
         tmp = b; b = -a; a = -tmp; change = 1;
           }
    
         lograt = R::dnorm(a, 0.0, 1.0, 1.0) - R::dnorm(b, 0.0, 1.0, 1.0);
         if(lograt <= logt2) z = unif_rs(a,b);
         else if((lograt > logt1) && (a < t3)) z = half_norm_rs(a,b);
         else z = exp_rs(a,b);
         if(change) z = -z;
       }
       double output;
       output = sigma*z + mu;
     return (output);
    }
    
    
    // rtnm( mu, sigma, lower, upper, cores)
    //
    // generates one random normal RVs with mean 'mu' and standard
    // deviation 'sigma', truncated to the interval (lower,upper), where
    // lower can be -Inf and upper can be Inf.
    // mu, sigma, lower, upper are vectors, and vectorized calls of this function
    // speed up computation
    // cores is an intege, representing the number of cores to be used in parallel
    //======================================================================
    
    
    // [[Rcpp::export]]
    
    Rcpp::NumericVector rtnm(Rcpp::NumericVector mus, Rcpp::NumericVector sigmas, Rcpp::NumericVector lower, Rcpp::NumericVector upper, int cores){
      omp_set_num_threads(cores);
      int nobs = mus.size();
      Rcpp::NumericVector out(nobs);
      double logt1 = log(0.150), logt2 = log(2.18), t3 = 0.725;
        double a,b, z, tmp, lograt;
    
         int  change;
    
      #pragma omp parallel for schedule(dynamic)   
      for(int i=0;i<nobs;i++) {  
    
         a = (lower(i) - mus(i))/sigmas(i);
         b = (upper(i) - mus(i))/sigmas(i);
         change=0;
         // First scenario
         if( (a == R_NegInf) || (b == R_PosInf))
           {
             if(a == R_NegInf)
               {
                  change = 1;
                  a = -b;
                  b = R_PosInf;
               }
    
             // The two possibilities for this scenario
             if(a <= 0.45) z = norm_rs(a, b);
             else z = exp_rs(a, b);
             if(change) z = -z;
           }
         // Second scenario
         else if((a * b) <= 0.0)
           {
             // The two possibilities for this scenario
             if((R::dnorm(a, 0.0, 1.0,1.0) <= logt1) || (R::dnorm(b, 0.0, 1.0, 1.0) <= logt1))
               {
                    z = norm_rs(a, b);
               }
             else z = unif_rs(a,b);
           }
    
         // Third scenario
         else
           {
             if(b < 0)
               {
                    tmp = b; b = -a; a = -tmp; change = 1;
               }
    
             lograt = R::dnorm(a, 0.0, 1.0, 1.0) - R::dnorm(b, 0.0, 1.0, 1.0);
             if(lograt <= logt2) z = unif_rs(a,b);
             else if((lograt > logt1) && (a < t3)) z = half_norm_rs(a,b);
             else z = exp_rs(a,b);
             if(change) z = -z;
           }
        out(i)=sigmas(i)*z + mus(i);          
      }
    
    return(out);
    }
    

    以下是基准:

    libs=c("truncnorm","msm","inline","Rcpp","RcppArmadillo","rbenchmark")
    if( sum(!(libs %in% .packages(all.available = TRUE)))>0){ install.packages(libs[!(libs %in% .packages(all.available = TRUE))])}
    for(i in 1:length(libs)) {library(libs[i],character.only = TRUE,quietly=TRUE)}
    
    
    #needed for openMP parallel
    Sys.setenv("PKG_CXXFLAGS"="-fopenmp")
    Sys.setenv("PKG_LIBS"="-fopenmp")
    
    #no of cores for openMP version
    cores = 4
    
    #surce code from same dir
    Rcpp::sourceCpp('truncnorm.cpp')
    
    
    #sample size
    nn=1000000
    
    
    bb= 100
    aa=-100
    benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]
    
    aa=0 
    benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]
    
    aa=2
    benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]
    
    aa=50
    benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]
    

    由于速度取决于上/下边界,因此需要进行多次基准测试。对于不同的情况,算法的不同部分都会出现。

1 个答案:

答案 0 :(得分:3)

非常快速的评论:

  1. 如果你包含RcppArmadillo.h,则不需要包含Rcpp.h - 事实上,你不应该,我们甚至会测试

  2. rep(oneDraw, n)进行n次通话。我会编写一个函数来调用一次返回你的绘制 - 它会更快,因为你节省了自己的n-1函数调用开销

  3. 您对许多统计分布的评论基于NumericVector类型,当我想使用双打时可能会发现一些误解:NumericVector是内部R类型的便捷代理类:无副本。您可以自由使用std::vector<double>或您喜欢的任何形式。

  4. 我对截断的法线知之甚少,所以我无法评论算法的具体细节。

  5. 一旦完成,请考虑Rcpp Gallery的帖子。