R data.table:跨自定义运行函数 - 非不同子集

时间:2018-04-15 17:25:32

标签: r data.table subset benchmarking adjacency-matrix

我正在寻找一种更有效的面向数据的方法来实现我目前正在使用的for循环。

我有一个data.table,其中包含一个由发送者,接收者,领带指示符和感兴趣的变量组成的边缘列表:

library(data.table)
#Create data
  set.seed(1)
  dt<-data.table(
    id1=rep(letters,each=length(letters)),
    id2=rep(letters,length(letters)),
    tie=rbinom(length(letters)^2,1,.1),
    interest=abs(rnorm(n=length(letters)^2))
  )
  dt$tie[dt$id1==dt$id2]<-1

我想根据每个id1的变化(id2 where tie == 1)以及这些变更之间的关系得出一些汇总统计数据。也就是说,我根据每个id改变之间的邻接矩阵推导出摘要统计。我将这些统计数据放在以下向量中。

#Initialize summary statistics
  sum.interest.lessthan1<-vector()
  sum.interest.lessthan1.weighted<-vector()
  rank.sum.interest.lessthan1<-vector()
  rank.mean<-vector()

要获取这些摘要统计信息,我目前运行以下循环:

for(i in 1:length(unique(dt$id1))){
#1) Produce vector of alters
  alters<-dt$id2[dt$id1==dt$id2[i] & dt$tie==1]
#2) Create a datatable containing all info among alters
  tempdt<-dt[dt$id1%in%alters & dt$id2%in%alters,]
#3) Skip if no ties other than to self
   if(nrow(tempdt)==1){
     next()
   }
#4) Get summary statistics
  #Number of alters with interest <1
      sum.interest.lessthan1[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i] ]$interest<1)
  #Number of alters with interest <1, weighted by mean interest
      sum.interest.lessthan1.weighted[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i]]$interest/mean(tempdt$interest)<1)
  #Ego rank number of alters with interest <1 among all alters
      tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(suminterest=sum(interest<1)),by="id1"]
      rank.sum.interest.lessthan1[i]<-((rank(tempstat$suminterest)-1)/(length(tempstat$suminterest)-1))[which(tempstat$id1==dt$id2[i])]
  #Ego rank mean interest among all alters
      tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(meaninterest=mean(interest)),by="id1"]
      rank.mean[i]<-((rank(tempstat$meaninterest)-1)/(length(tempstat$meaninterest)-1))[which(tempstat$id1==dt$id2[i])]
}

有没有办法在不依赖循环的情况下更有效地推导出这些统计数据?我的实际数据集由数千个不同的ID和多种类型的关系组成,因此通常需要数小时才能完成。提前感谢任何建议!我的直觉是使用data.table&#34; by&#34;语法,但我想不出如何构建一个&#34; by group&#34;这代表我的子集。

布赖恩

1 个答案:

答案 0 :(得分:0)

Per chinsoon12的建议,下面是如何回答上述问题:

#create data table of all edges
    altersDT <- dt[tie==1, .(alter=id2), by=.(grp=id1)]
#create data table listing edges among all ties among ego and ego's alters
    altersDT<-altersDT[, CJ(alter, alter), by=.(grp)]
#Merge the tie data with the edge attribute data 
    altersDT <- dt[altersDT, on=.(id1=V1, id2=V2)]
#produce summary statistics
    #sumint and sumw
        altersDT[, c("sumint", "sumw") := {list(
            sum(interest[id1==grp & id1!=id2]<1),
            sum(interest[id1==grp & id1!=id2]/mean(interest)<1)
        )}, by=.(grp)]
    #ranksum and rankmean
        #create tempstats
            altersDT[, c("ranksum", "rankmean") := {list(
              as.numeric(sum(interest[id1!=id2]<1)),
              as.numeric(mean(interest[id1!=id2]))
            )}, by=.(grp,id1)]
        #derive stats of interest
            altersDT[, c("ranksum", "rankmean") := {list(
              (rank(ranksum)-1)/(length(ranksum)-1),
              (rank(rankmean)-1)/(length(rankmean)-1)
            )}, by=.(grp,id2)]
#Put results into key variables
    subcrit<-altersDT$id1==altersDT$id2 & altersDT$id1==altersDT$grp
    new.sum.interest.lessthan1<-altersDT$sumint[subcrit]
    new.sum.interest.lessthan1.weighted<-altersDT$sumw[subcrit]
    new.rank.sum.interest.lessthan1<-altersDT$ranksum[subcrit]
    new.rank.mean<-altersDT$rankmean[subcrit]

#check that results are the same as old
    table(new.sum.interest.lessthan1==sum.interest.lessthan1)
    table(new.sum.interest.lessthan1.weighted==sum.interest.lessthan1.weighted)
    table(new.rank.sum.interest.lessthan1==rank.sum.interest.lessthan1)
    table(new.rank.mean==rank.mean)