Question

@user3759195 发了一篇关于 RStudio 在使用 Rcpp 时崩溃的帖子（https://stackoverflow.com/questions/24322356/rstudio-crashes-and-it-does-not-reproduce ），但没有给出任何可重现的示例。@KevinUshey 在评论中提到，必须对代码中的 wrap 调用进行 PROTECT。

我冒昧地发布了用RCpp编写的split.data.frame函数的两种替代方法：

*不会导致 RStudio 崩溃的版本*

// Splits the data frame `x` into a list of data frames, one per distinct
// value of `y`, where y[j] names the group that row j belongs to.
// Every column of `x` is converted to std::vector<double> while reading.
// In this variant tmp1 and tmp2 are constructed afresh on each iteration.
//[[Rcpp::export]]
List splitDataFrameCpp(DataFrame x,NumericVector y) {
  const int rowCount = x.nrows();
  const int colCount = x.size();

  // Pass 1: bucket the cell values by group key. Columns form the outer
  // loop, so each bucket holds all of its values for the first column,
  // then all of its values for the second column, and so on.
  std::map<double, std::vector<double> > buckets;
  for (int col = 0; col < colCount; ++col) {
    const std::vector<double> column = Rcpp::as<std::vector<double> >(x[col]);
    for (int row = 0; row < rowCount; ++row) {
      const double key = y[row];
      buckets[key].push_back(column[row]);
    }
  }

  // Pass 2: turn each bucket into a data frame carrying x's column names.
  const std::vector<double> keys = Rcpp::as<std::vector<double> >(sort_unique(y));
  std::map<double, DataFrame> pieces;
  for (std::vector<double>::const_iterator key = keys.begin(); key != keys.end(); ++key) {
    NumericVector tmp1 = wrap(buckets[*key]);   // *** DEFINING INSIDE LOOP ***
    // Give the flat vector a (rows-in-group x colCount) shape.
    const int groupRows = int(tmp1.size()) / colCount;
    tmp1.attr("dim") = Dimension(groupRows, colCount);
    DataFrame tmp2(wrap(tmp1));   // *** DEFINING INSIDE LOOP ***
    tmp2.attr("names") = x.attr("names");
    pieces[*key] = tmp2;
  }
  return wrap(pieces);
}

*导致 RStudio 崩溃的版本*

// Splits the data frame `x` into a list of data frames, one per distinct
// value of `y`, where y[j] names the group that row j belongs to.
// Every column of `x` is converted to std::vector<double> while reading.
// In this variant tmp1 and tmp2 are declared once before the loop and
// re-assigned on every iteration.
//[[Rcpp::export]]
List splitDataFrameCpp(DataFrame x,NumericVector y) {
  int nRows=x.nrows();
  int nCols=x.size();

  // Bucket the cell values by group key. Columns form the outer loop, so
  // each bucket holds all of its values for the first column, then all of
  // its values for the second column, and so on.
  // NOTE(review): y[j] is read for every j in [0, nRows) with no length
  // check -- assumes y has at least nRows elements.
  std::map<double,vector<double> > z;
  for (int i=0;i<nCols;i++) {
    std::vector<double> tmp=Rcpp::as<std::vector<double> > (x[i]);
    for (int j=0;j<nRows;j++) {
      z[y[j]].push_back(tmp[j]);      
    }
  }

  // Sorted distinct group keys; one output data frame is built per key.
  std::vector<double> yunq=Rcpp::as<std::vector<double> > (sort_unique(y));
  std::map<double, DataFrame> z1;

  NumericVector tmp1;    // *** DEFINING OUTSIDE LOOP ***
  DataFrame tmp2;    // *** DEFINING OUTSIDE LOOP ***

  for (int i=0;i<int(yunq.size());i++) {
    // Assignment into the existing tmp1 (not a fresh construction).
    tmp1=wrap(z[yunq[i]]);
    // Shape the flat vector as (values / nCols) rows by nCols columns.
    // NOTE(review): nCols == 0 would make this an integer division by zero.
    tmp1.attr("dim")=Dimension(int(tmp1.size())/nCols,nCols);
    // Assignment into the existing tmp2 (not a fresh construction).
    tmp2=wrap(tmp1);
    // Carry over the column names of the input data frame.
    tmp2.attr("names")=x.attr("names");
    z1[yunq[i]]=tmp2;
  }    
  return wrap(z1);      
}

两个代码之间的主要区别在于，在一种情况下tmp1和tmp2在循环内定义，而在另一种情况下在循环外定义。

有人能解释一下为什么第二个版本会崩溃吗（以及应该如何修改才能不让 RStudio 崩溃）？我还是 C++ 新手，主要是参考 SO 或 Rcpp Gallery 网站上的示例来编写 Rcpp 代码，所以想更深入地理解这种行为。
此外，作为附带好处，如果任何人都可以推荐更改以使代码更快，那将是很好的。根据我使用的一些测试用例，不崩溃的代码目前比R split.data.frame函数快2到3倍。

测试用例示例：

> testDF
   V1 V2 V3 V4 V5 V6
1   1  5  4  1  3  2
2   2  1  5  4  1  3
3   2  2  1  5  4  1
4   3  2  2  1  5  4
5   1  3  2  2  1  5
6   4  1  3  2  2  1
7   1  5  4  1  3  2
8   2  1  5  4  1  3
9   2  2  1  5  4  1
10  3  2  2  1  5  4
11  1  3  2  2  1  5
12  4  1  3  2  2  1

> testSp<-c(1,1,1,2,2,2,3,4,4,3,3,5)

> split(testDF,testSp)     OR  > splitDataFrameCpp(testDF,testSp)     
$`1`
  V1 V2 V3 V4 V5 V6
1  1  5  4  1  3  2
2  2  1  5  4  1  3
3  2  2  1  5  4  1

$`2`
  V1 V2 V3 V4 V5 V6
4  3  2  2  1  5  4
5  1  3  2  2  1  5
6  4  1  3  2  2  1

$`3`
   V1 V2 V3 V4 V5 V6
7   1  5  4  1  3  2
10  3  2  2  1  5  4
11  1  3  2  2  1  5

$`4`
  V1 V2 V3 V4 V5 V6
8  2  1  5  4  1  3
9  2  2  1  5  4  1

$`5`
   V1 V2 V3 V4 V5 V6
12  4  1  3  2  2  1

此测试用例的microbenchmark结果：

> microbenchmark(t1<-split(testDF,testSp),t2<-splitDataFrameCpp(testDF,testSp))
Unit: microseconds
                                   expr     min      lq   median       uq      max neval
             t1 <- split(testDF, test2) 343.181 365.562 372.8760 387.9430 1027.786   100
 t2 <- splitDataFrameCpp(testDF, test2) 177.881 190.315 200.5545 208.4545  870.093   100

*编辑*

添加了sessionInfo：

> sessionInfo()
R version 3.1.0 (2014-04-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] microbenchmark_1.3-0

loaded via a namespace (and not attached):
[1] Rcpp_0.11.1 tools_3.1.0

此外，testDF在R中创建为numeric，而不是integer。

Answer 1

仅供参考，下面是一个可以直接用 sourceCpp() 运行的完整示例。与 Kevin 和 Romain 所说的一样，它在我这里也没有崩溃。

#include <Rcpp.h>

using namespace Rcpp;
using namespace std;

// Splits the data frame `x` into a list of data frames, one per distinct
// value of `y`, where y[j] names the group that row j belongs to.
//
// x: data frame whose columns are all convertible to std::vector<double>.
// y: grouping vector; must have exactly one element per row of `x`.
// Returns the result of wrapping a std::map<double, DataFrame> keyed by the
// group value; each piece carries the column names of `x`.
// Raises an R error (Rcpp::stop) when `y` and `x` disagree in length, or
// when `x` has rows but no columns.
//[[Rcpp::export]]
List splitDataFrameCppA(DataFrame x,NumericVector y) {
  const int nRows=x.nrows();
  const int nCols=x.size();

  // y[j] is read for every j in [0, nRows) below; a shorter y would be an
  // out-of-bounds read, and a longer y would silently ignore its tail.
  if (y.size()!=nRows) {
    Rcpp::stop("splitDataFrameCppA: y must have one element per row of x");
  }
  // The per-group row count is computed as size / nCols, so a zero-column
  // input with rows would divide by zero.
  if (nCols==0 && nRows>0) {
    Rcpp::stop("splitDataFrameCppA: x must have at least one column");
  }

  // Bucket the cell values by group key. Columns form the outer loop, so
  // each bucket holds all of its values for the first column, then all of
  // its values for the second column, and so on.
  std::map<double,std::vector<double> > z;
  for (int i=0;i<nCols;i++) {
    const std::vector<double> tmp=Rcpp::as<std::vector<double> > (x[i]);
    for (int j=0;j<nRows;j++) {
      z[y[j]].push_back(tmp[j]);
    }
  }

  // Sorted distinct group keys; one output data frame is built per key.
  const std::vector<double> yunq=Rcpp::as<std::vector<double> > (sort_unique(y));
  std::map<double, DataFrame> z1;
  for (int i=0;i<int(yunq.size());i++) {
    NumericVector tmp1=wrap(z[yunq[i]]);   // *** DEFINING INSIDE LOOP ***
    // Shape the flat vector as (rows in this group) x nCols.
    tmp1.attr("dim")=Dimension(int(tmp1.size())/nCols,nCols);
    DataFrame tmp2(wrap(tmp1));   // *** DEFINING INSIDE LOOP ***
    // Carry over the column names of the input data frame.
    tmp2.attr("names")=x.attr("names");
    z1[yunq[i]]=tmp2;
  }
  return wrap(z1);
}


// Splits the data frame `x` into a list of data frames, one per distinct
// value of `y`, where y[j] names the group that row j belongs to.
// Every column of `x` is converted to std::vector<double> while reading.
// In this variant tmp1 and tmp2 are declared once before the loop and
// re-assigned on every iteration.
//[[Rcpp::export]]
List splitDataFrameCppB(DataFrame x,NumericVector y) {
  int nRows=x.nrows();
  int nCols=x.size();

  // Bucket the cell values by group key. Columns form the outer loop, so
  // each bucket holds all of its values for the first column, then all of
  // its values for the second column, and so on.
  // NOTE(review): y[j] is read for every j in [0, nRows) with no length
  // check -- assumes y has at least nRows elements.
  std::map<double,vector<double> > z;
  for (int i=0;i<nCols;i++) {
    std::vector<double> tmp=Rcpp::as<std::vector<double> > (x[i]);
    for (int j=0;j<nRows;j++) {
      z[y[j]].push_back(tmp[j]);      
    }
  }

  // Sorted distinct group keys; one output data frame is built per key.
  std::vector<double> yunq=Rcpp::as<std::vector<double> > (sort_unique(y));
  std::map<double, DataFrame> z1;

  NumericVector tmp1;    // *** DEFINING OUTSIDE LOOP ***
  DataFrame tmp2;    // *** DEFINING OUTSIDE LOOP ***

  for (int i=0;i<int(yunq.size());i++) {
    // Assignment into the existing tmp1 (not a fresh construction).
    tmp1=wrap(z[yunq[i]]);
    // Shape the flat vector as (values / nCols) rows by nCols columns.
    // NOTE(review): nCols == 0 would make this an integer division by zero.
    tmp1.attr("dim")=Dimension(int(tmp1.size())/nCols,nCols);
    // Assignment into the existing tmp2 (not a fresh construction).
    tmp2=wrap(tmp1);
    // Carry over the column names of the input data frame.
    tmp2.attr("names")=x.attr("names");
    z1[yunq[i]]=tmp2;
  }    
  return wrap(z1);      
}


/*** R

# Build the 12-row, 6-column test data frame from inline text.
testDF <- read.table(textConnection("
1  5  4  1  3  2
2  1  5  4  1  3
2  2  1  5  4  1
3  2  2  1  5  4
1  3  2  2  1  5
4  1  3  2  2  1
1  5  4  1  3  2
2  1  5  4  1  3
2  2  1  5  4  1
3  2  2  1  5  4
1  3  2  2  1  5
4  1  3  2  2  1
"))

# Grouping vector: one group label per row of testDF.
testSp <- c(1,1,1,2,2,2,3,4,4,3,3,5)


# Print the structure of the result from each C++ variant.
str(splitDataFrameCppA(testDF, testSp))
str(splitDataFrameCppB(testDF, testSp))

# Time base R split() against both C++ variants.
library(microbenchmark)
microbenchmark(split(testDF,testSp),
               splitDataFrameCppA(testDF,testSp),
               splitDataFrameCppB(testDF,testSp))

*/

基准测试显示两个版本的速度不相上下：

R> library(microbenchmark)

R> microbenchmark(split(testDF,testSp),
+                splitDataFrameCppA(testDF,testSp),
+                splitDataFrameCppB(testDF,testSp))
Unit: microseconds
                               expr     min      lq  median      uq      max neval
              split(testDF, testSp) 687.271 724.748 745.287 791.574 2373.283   100
 splitDataFrameCppA(testDF, testSp) 380.781 393.161 406.686 421.469  491.803   100
 splitDataFrameCppB(testDF, testSp) 377.959 393.391 405.476 429.947 2052.193   100
R> 
R>

RStudio与RCpp崩溃并具有可重现的代码

1 个答案: