更快地拆分数据框

时间:2017-07-26 12:23:11

标签: r dataframe split

我创建了一个函数,我使用了split函数,但需要很长时间才能得到结果:

st=c(0 ,0, 9,39,44 ,100, 0, 0, 8,26 ,100, 0, 0, 6, 9,16,20,24,29,35,37,47,54,73 ,100, 0, 0, 6,35,44 ,100, 0, 0,10,16,27,40,51,91, 100, 0, 0,3, 7,28,69,71,75, 100, 0, 0,19 ,100, 0, 0, 7,24,29,35 ,100, 0, 0, 8,11,14,15,18,31,32,33,50,53,56,62,79,80,82,87,88,89, 100, 0, 0, 2,7,31,34,40 ,100, 0, 0,10,41,51,76 ,100, 0, 0, 4,32,41,46 ,100, 0, 0,19,26,59,76,83,88,92 ,100, 0, 0,11,27,51, 100, 0, 0, 5, 7,45,56,78,3 ,100, 0, 0, 3,12,23,46,53,72 ,100)

int=c(0.00,3.52 ,11.94,1.78 ,22.00,0.00,0.00,5.85 ,14.26 ,56.65,0.00,0.00,4.52,2.76,4.89,3.17,3.36,3.67,4.49,1.97,7.47,5.55, 14.79 ,20.78,0.00,0.00,4.51 ,20.71,6.60 ,40.08,0.00,0.00 ,11.28,7.30 , 12.14 ,14.01, 12.82 ,45.65,9.97,0.00,0.00,2.33,3.72 ,19.55, 37.61,1.72,3.56 ,23.05,0.00,0.00 ,13.51 ,57.64,0.00,0.00,4.74 ,11.42,3.51,4.38 ,43.83,0.00,0.00,5.66,2.35,1.62,1.09,2.05,8.76,0.63,1.05, 11.65,2.34,1.82,4.78, 11.41,1.10,1.52,3.41,0.61,1.01,7.41,0.00,0.00,2.09,3.72,21.57,2.69,5.65 ,53.43,0.00,0.00,3.77 ,12.05,3.85,9.88,9.13,0.00,0.00,3.32 ,20.97,6.61,3.47 ,40.62,0.00,0.00,3.26,1.27,5.71,2.94,1.13,0.89,0.78,1.31,0.00,0.00,4.91,7.03 ,10.14 ,21.36,0.00,0.00,4.16,2.22 ,33.84 ,10.72, 19.17 ,13.68,6.49,0.00,0.00,1.83,5.22,6.95, 13.92,4.04, 11.66 ,17.04,0.00)

id=c(1:length(st))

Attr=c("sta","a",  "cr","a",  "hf", "sp", "sta","hf", "cr",
       "a",  "sp", "sta","a",  "ac","a",  "hf" ,"cr","a",
       "ac","a",  "sl", "cr","a",  "pq","sp", "sta","a",
       "sl", "cr","hf" ,"sp", "sta","a",  "cr","sl", "hf",
       "a",  "pq","hf", "sp", "sta","cr","a",  "hf", "sl",
       "cr","hf" ,"a",  "sp", "sta","hf" ,"cr","sp", "sta",
       "hc","cr","hf", "sl", "a",  "sp", "sta","hf", "a",
       "cr","hf" ,"a",  "cr","hf", "a",  "cr","hf", "a",
       "hf", "cr","hf" ,"a",  "cr","hf", "a",  "cr","sp",
       "sta","cr","a",  "hf", "a",  "cr","hf" ,"sp", "sta",
       "sl", "a",  "hf" ,"cr","a",  "sp", "sta","a",  "ac",
       "sl", "hf" ,"cr","sp", "sta","hc","pv","a",  "hf",
       "a",  "pv","hc","sl", "sp", "sta","hf", "a",  "cr",
       "sl", "sp", "sta","hf", "a",  "cr","a",  "a",  "sl",
       "a",  "sp", "sta","cr","hf" ,"a",  "sl", "cr","a","hf" ,"sp")
p=replicate(length(Attr),sample(1:3,1,replace=T))
data=cbind.data.frame(id,st,int,Attr,p) 


 si<-function(data,...){
    ff<-list()
    library(MASS)
    library(Hmisc)
    att<-function(data,...){
      d=data
      f=list()
      z=list()

      f=split(data, data$Attr,drop=T)
      z=lapply(f,function(x){if(nrow(x)> 1){fitdistr(as.integer(x$int),"Negative Binomial")}})
      z=z[!sapply(z, is.null)]
      return(z)
    }
    data$p=as.factor(data$p)

    datap=list()
    d=list()
    s=list()
      for(i in 1:3){
        datap[[i]]=data[data$p==i,]
        d[[i]]=subset(datap[[i]],int != 0)
      }

    s=lapply(d,att)

    return(s)
  }

我必须使用这个功能4000次:

system.time(a<-replicate(4000,si(data)))
utilisateur     système      écoulé 
     110.02        1.01      111.33

所以我的问题是,是否有其他替代方法可以更快地拆分数据并加快功能执行时间

2 个答案:

答案 0 :(得分:3)

你的功能太复杂了,我把它简化了一点但是没有多少时间。像其他人所说的那样,花费的时间最多的是fitdistr,然后是densfun,然后是.Call。这些都在fitdistr的调用中,因此无法进行优化。 (我使用了Federico Manigrasso的分析代码。) 首先,我将library调用放在代码的开头,而不是函数内部。我还改变了你创建data.frame的方式。

library(MASS)
library(Hmisc)

data <- data.frame(id,st,int,Attr,p) 

si2 <- function(data,...){
    att<-function(data,...){
      z=lapply(split(data, data$Attr,drop=T), function(x){
          if(nrow(x)> 1) fitdistr(as.integer(x$int),"Negative Binomial")
      })
      z[!sapply(z, is.null)]
    }

    inx <- data$int != 0
    lapply(lapply(1:3, function(i) data[data$p==i & inx,]), att)
}

system.time(a<-replicate(4000,si(data)))
   user  system elapsed 
  89.40    0.00   89.73 
system.time(b<-replicate(4000,si2(data)))
   user  system elapsed 
  84.21    0.03   84.33 
identical(a, b)
[1] TRUE

答案 1 :(得分:2)

嗨,试试这个来分析你的代码

Rprof(interval = 0.0001)
si(data)
Rprof(NULL)
siprof <- summaryRprof()$by.self
siprof$Fun <- rownames(siprof)
head(siprof)
head(siprof[order(siprof$self.time, decreasing = TRUE),])
sum(siprof$self.time) 

在我的情况下,花费时间的是实际上第一次运行代码时加载库

sum(siprof$self.time)  was equal to  29.276

the second time 
sum(siprof$self.time)  was equal to  3.368

你也可以在head(siprof)中看到不同的功能,需要更长时间的功能与库的附件有关

第一次

    self.time self.pct total.time total.pct               Fun
"lazyLoadDBfetch"     5.333    18.21      5.562     18.99 "lazyLoadDBfetch"
"gzfile"              2.108     7.20      2.121      7.24          "gzfile"
".getGeneric"         1.451     4.96      1.994      6.81     ".getGeneric"
"getClassDef"         1.133     3.87      2.024      6.91     "getClassDef"
"file.exists"         1.126     3.85      1.176      4.02     "file.exists"
"FUN"                 0.863     2.95      5.513     18.83             "FUN"

第二次

  self.time self.pct total.time total.pct             Fun
"parse"             0.326     9.67      0.486     14.41         "parse"
"fitdistr"          0.250     7.41      1.905     56.54      "fitdistr"
"print.default"     0.173     5.13      0.173      5.13 "print.default"
"paste"             0.160     4.75      0.193      5.74         "paste"
"densfun"           0.135     4.00      0.227      6.73       "densfun"
"match"             0.110     3.26      0.126      3.73         "match"

PS使用

microbenchmark::microbenchmark(a<-si(data),times=4000)

更好地评估你的表达

所以在一天结束的时候,你的代码可能会从风格的角度来改进,但是在计算时间方面没有多少收获