我尝试使用Rcpp和Openmp来加速我的代码。这是我的cpp代码。我想知道为什么。通过openmp加速此代码的最佳方法是什么。
// #include <Rcpp.h>
#include <vector>
#include <string.h>
#include <RcppArmadillo.h>
#include "omp.h"
using namespace Rcpp;
using namespace std;
// Function subset("[.data.frame");
// [[Rcpp::plugins(openmp) ]]
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
DataFrame reformdata(DataFrame rawfile, DataFrame genefile){
vector<string> rawchr = rawfile["chr"];
NumericVector rawpos = rawfile["start"];
vector<string> genechr = genefile["X.1"];
NumericVector genestart = genefile["TSS.start"];
NumericVector geneend = genefile["TSS.end"];
vector<string> geneID = genefile["X"];
NumericVector rawnumCs = rawfile["numCs"];
NumericVector rawnumTs = rawfile["numTs"];
NumericVector rawmethyl = rawfile["methyl"];
int n_raw = rawchr.size();
int n_gene = genechr.size();
int i = 0,j = 0;
vector<string> outputgeneID;
vector<string> outputchr;
NumericVector outputstart;
NumericVector outputend;
NumericVector outputmethyl;
NumericVector outputnumCs;
NumericVector outputnumTs;
#pragma omp parallel for num_threads(8)
for(i = 0; i < n_gene; i++){
string loc_gene_name = genechr[i];
int gene_start = genestart[i];
int gene_end = geneend[i];
for(j = 0;j < n_raw; j++){
string raw_name = rawchr[j];
int raw_pos = rawpos[j];
if(raw_name.compare(loc_gene_name)==0&&raw_pos >= gene_start&&raw_pos <= gene_end){
#pragma omp critical
{
outputgeneID.push_back(geneID[i]);
outputchr.push_back(rawchr[j]);
outputstart.push_back(rawpos[j]);
outputend.push_back(rawpos[j]);
outputmethyl.push_back(rawmethyl[j]);
outputnumCs.push_back(rawnumCs[j]);
outputnumTs.push_back(rawnumTs[j]);
}
}
}
}
return DataFrame::create(Named("geneID")=outputgeneID,Named("chr")=outputchr,
Named("start")=outputstart,Named("end")=outputend,
Named("methyl")=outputmethyl,
Named("numCs")=outputnumCs,Named("numTs")=outputnumTs);
}
我只想在R中输入两个Dataframe,然后在这两个数据帧之间进行匹配。也许push_back就是问题所在。有没有一种简单的方法可以避免它?我正在处理大数据,速度很重要。