Question

我正在尝试优化从R调用的C子程序，它占用了我正在尝试解决的问题的约60％的计算时间。当纯粹用R编码时，这个值从86％下降。我的C代码中的绝大部分执行时间都是在嵌套的for循环中进行的，所以这似乎是尝试使用OpenMP进行并行化的明显候选者。我尝试过使用可变结果这样做 - 最好是经过的时间比不使用OMP要差一些，最坏的情况是性能与线程数成反比。最快版本的代码如下：

#include <R.h>
#include <Rmath.h>
#include <omp.h>
void gradNegLogLik_c(double *param, double *delta, double *X, double *M, int *nBeta, int *nEpsilon, int *nObs, double *gradient){
  // ========================================================================================
  // param:     double[nBeta + nEpsilon]        values of parameters at which to evaluate gradient
  // delta:     double[nObs]                    satellite - buoy differences
  // X:         double[nObs * (nBeta + nEpsilon)]  design matrix for mean components (i.e. beta terms)
  // M:         double[nObs * (nBeta + nEpsilon)]  design matrix for variance components (i.e. epsilon terms)
  // nBeta:     int                             number of mean terms
  // nEpsilon:  int                             number of variance terms
  // nObs:      int                             number of observations
  // gradient:  double[nBeta + nEpsilon]        output array of gradients
  // ========================================================================================

  // ========================================================================================
  // local variables
  size_t i, j, ind;
  size_t nterms = *nBeta + *nEpsilon;  
  size_t nbeta  = *nBeta;
  size_t nepsilon  = *nEpsilon;
  size_t nobs = *nObs; 
  // allocate local memory and set to zero
  double *sigma2        = calloc( nobs     , sizeof(double) );
  double *fittedValues  = calloc( nobs     , sizeof(double) );
  double *residuals     = calloc( nobs     , sizeof(double) );
  double *beta          = calloc( nbeta    , sizeof(double) );
  double *epsilon2      = calloc( nepsilon , sizeof(double) );  
  double *residuals2    = calloc( nobs     , sizeof(double) );
  double gradBeta, gradEpsilon;

  // extract beta and epsilon terms from param
  // =========================================
  for(i = 0 ; i < nbeta ; i++){
    beta[i]    = param[ i ];
    epsilon2[i] = param[ nbeta + i ];
  }

  // Initialise gradient to zero for return value
  // =========================================  
  for( i = 0 ; i < nterms ; i++){
    gradient[i] = 0;
  }

  // calculate sigma, fitted values and residuals
  // ============================================  
  for( i = 0 ; i < nbeta ; i++){
    for( j = 0 ; j < nobs ; j++){
      ind = i * nobs + j;
      sigma2[j] += M[ind] * epsilon2[i];
      fittedValues[j] += X[ind] * beta[i];
    }
  }

  for( j = 0 ; j < nobs ; j++){
    // calculate reciprocal as this is what we actually use and 
    // we only want to do it once.        
    sigma2[j] = 1 / sigma2[j]; 
    residuals[j] = delta[j] - fittedValues[j];    
    residuals2[j] = residuals[j]*residuals[j];
  } 

  // Loop over all observations and calculate value of (negative) derivative  
  // =======================================================================    
#pragma omp parallel for private(i, j, ind, gradBeta, gradEpsilon)\
  shared(gradient, nbeta, nobs, X, M, sigma2, fittedValues, delta, residuals2) \
  default(none) 
  for( i = 0 ; i < nbeta ; i++){
    gradBeta = 0.0;
    gradEpsilon = 0.0;
    for(j = 0 ; j < nobs ; j++){
      ind = i * nobs + j;
      gradBeta    -= -1.0*X[ind] * sigma2[j]*(fittedValues[j] - delta[j]);
      gradEpsilon -= 0.5*M[ind] * sigma2[j]*(residuals2[j] * sigma2[j] - 1);
    }
    gradient[i] = gradBeta;
    gradient[nbeta + i] = gradEpsilon;
  }

  // End of function
  // free local memory
  free(sigma2);
  free(fittedValues);
  free(residuals);
  free(beta);
  free(epsilon2);
  free(residuals2);
}

nObs是订单10000。

nBeta的范围是20 - 几百。

nEpsilon = nBeta，目前尚未使用。

搜索了这个网站，下午谷歌搜索和尝试不同的东西后，我似乎无法进一步改进。我的第一个想法是错误的共享 - 我尝试了各种各样的事情，例如展开外部循环，一次设置8个渐变[]元素，以创建一个临时填充数组来存储结果。我也尝试了不同的组合共享，私人和第一私人。这似乎没有改善的东西，我的最快执行时间在并行上比在串行中稍差。在我花费更多时间之前，这会导致两个问题：

我的问题（重复约9000次相同的计算20 - 900次）太小而不值得使用OMP吗？
我有什么遗漏或做错了吗？

我怀疑它是后者，因为我在使用C和OMP时相对缺乏经验。任何帮助/想法将不胜感激。

（有关信息，我在SLED11服务器上运行，具有16个内核和192GB内存，并使用GCC 4.7.2编译我的C代码）。其他用户正在使用服务器，但OMP与串行代码的相对性能似乎独立于其他用户。

提前致谢，

戴夫。

编辑：有关信息，我使用的编译命令是

gcc -I/RHOME/R/3.0.1/lib64/R/include -DNDEBUG -I/usr/local/include -fpic \
    -std=c99 -Wall -pedantic –O3 -fopenmp -c src/gradNegLogLik_call.c \
     -o src/gradNegLogLik_call.o

大多数标志都是由R CMD SHLIB命令设置的 - 我手动添加了-O3 -fopenmp。

Answer 1

在给出我为加快代码所做的工作的答案之前，给我上面的问题提供一些背景可能是有用的（尽管这已经在不使用OMP的情况下实现了）。

我编写了原始的C函数来计算与R optim（）命令和L-BFGS-B方法一起使用的对数似然函数的梯度。对于每次调用优化，我的对数似然和梯度函数都被调用~100次，因为优化找到最佳解决方案。因此，正如Rprof所预期和报告的那样，这两个函数占用了我的大部分执行时间，因此转换为C以提高代码效率的两个目标也是如此。

将我的两个函数转换为C并优化该代码导致我的调用优化从平均每次调用1.88秒减少到每次调用0.25秒。这使我的处理时间从大约1个月减少到几天。影响最大的变化（除了调用C）改变了嵌套循环的顺序。选择原始顺序是由于R存储矩阵的方式，并选择避免在我的C函数的每次调用时必须转置我的矩阵。认识到每次调用optim（）只需要进行一次转置，而不是像我最初编码的每次C调用一样，与改变C函数中的顺序的影响/好处相比，这是一个很小的开销。。

鉴于速度的提高，必须花费更多时间来证明这一点。我的渐变函数的最终版本（根据我的原始帖子）如下所示。

请注意，虽然我已经从使用.C更改为.Call（因此更改为函数参数等），但这本身并不能解释速度增加。

#include <R.h>
#include <Rmath.h>
#include <Rinternals.h>
#include <omp.h>
SEXP gradNegLogLik_call(SEXP param ,SEXP delta, SEXP X, SEXP M, SEXP nBeta, SEXP nEpsilon){

  // local variables
  double *par, *d;
  double *sigma2, *fittedValues, *residuals, *grad, *Xuse, *Muse;
  double val, sig2, gradBeta, gradEpsilon;
  int n, m, ind, nterms, i, j;

  SEXP gradient;

  // get / associate parameters with local pointer
  par = REAL(param);
  Xuse = REAL(X);
  Muse = REAL(M);
  d = REAL(delta);
  n = LENGTH(delta);
  m = INTEGER(nBeta)[0];
  nterms = m + m;


  // allocate memory
  PROTECT( gradient = allocVector(REALSXP, nterms ));
  // set pointer to real portion of gradient
  grad = REAL(gradient);

  // set all gradient terms to zero
  for(i = 0 ; i < nterms ; i++){
      grad[i] = 0.0;
  }


  sigma2 = Calloc(n, double );
  fittedValues = Calloc(n, double );
  residuals = Calloc(n, double );

  // calculate sigma, fitted values and residuals
  for(i = 0 ; i < n ; i++){
    val = 0.0;
    sig2 = 0.0;
    for(j = 0 ; j < m ; j++){
      ind = i*m + j;
      val +=  Xuse[ind]*par[j];
      sig2 += Muse[ind]*par[j+m];
    }
    // calculate reciprocal of sigma as this is what we actually use
    // and we only want to do it once
    sigma2[i] = 1.0 / sig2;
    fittedValues[i] = val;
    residuals[i] = d[i] - val;
  }


  // now loop over each paramter and calculate derivative
  for(i = 0 ; i < n ; i++){
    gradBeta = -1.0*sigma2[i]*(fittedValues[i] - d[i]);
    gradEpsilon = 0.5*sigma2[i]*(residuals[i]*residuals[i]*sigma2[i] - 1);     
    for(j = 0 ; j < m ; j++){
        ind = i*m + j;
        grad[j] -= Xuse[ind]*gradBeta;
        grad[j+m] -= Muse[ind]*gradEpsilon;
    }            
  }

  UNPROTECT(1);
  Free(sigma2);
  Free(residuals);
  Free(fittedValues);
  // return array of gradients  
  return gradient;
}

慢速OMP与串行

1 个答案: