我正在寻找一种更有效的面向数据的方法来实现我目前正在使用的for循环。
我有一个data.table,其中包含一个由发送者,接收者,领带指示符和感兴趣的变量组成的边缘列表:
library(data.table)
#Create data
set.seed(1)
dt<-data.table(
id1=rep(letters,each=length(letters)),
id2=rep(letters,length(letters)),
tie=rbinom(length(letters)^2,1,.1),
interest=abs(rnorm(n=length(letters)^2))
)
dt$tie[dt$id1==dt$id2]<-1
我想根据每个id1的变化(id2 where tie == 1)以及这些变更之间的关系得出一些汇总统计数据。也就是说,我根据每个id改变之间的邻接矩阵推导出摘要统计。我将这些统计数据放在以下向量中。
#Initialize summary statistics
sum.interest.lessthan1<-vector()
sum.interest.lessthan1.weighted<-vector()
rank.sum.interest.lessthan1<-vector()
rank.mean<-vector()
要获取这些摘要统计信息,我目前运行以下循环:
for(i in 1:length(unique(dt$id1))){
#1) Produce vector of alters
alters<-dt$id2[dt$id1==dt$id2[i] & dt$tie==1]
#2) Create a datatable containing all info among alters
tempdt<-dt[dt$id1%in%alters & dt$id2%in%alters,]
#3) Skip if no ties other than to self
if(nrow(tempdt)==1){
next()
}
#4) Get summary statistics
#Number of alters with interest <1
sum.interest.lessthan1[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i] ]$interest<1)
#Number of alters with interest <1, weighted by mean interest
sum.interest.lessthan1.weighted[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i]]$interest/mean(tempdt$interest)<1)
#Ego rank number of alters with interest <1 among all alters
tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(suminterest=sum(interest<1)),by="id1"]
rank.sum.interest.lessthan1[i]<-((rank(tempstat$suminterest)-1)/(length(tempstat$suminterest)-1))[which(tempstat$id1==dt$id2[i])]
#Ego rank mean interest among all alters
tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(meaninterest=mean(interest)),by="id1"]
rank.mean[i]<-((rank(tempstat$meaninterest)-1)/(length(tempstat$meaninterest)-1))[which(tempstat$id1==dt$id2[i])]
}
有没有办法在不依赖循环的情况下更有效地推导出这些统计数据?我的实际数据集由数千个不同的ID和多种类型的关系组成,因此通常需要数小时才能完成。提前感谢任何建议!我的直觉是使用data.table&#34; by&#34;语法,但我想不出如何构建一个&#34; by group&#34;这代表我的子集。
布赖恩
答案 0 :(得分:0)
Per chinsoon12的建议,下面是如何回答上述问题:
#create data table of all edges
altersDT <- dt[tie==1, .(alter=id2), by=.(grp=id1)]
#create data table listing edges among all ties among ego and ego's alters
altersDT<-altersDT[, CJ(alter, alter), by=.(grp)]
#Merge the tie data with the edge attribute data
altersDT <- dt[altersDT, on=.(id1=V1, id2=V2)]
#produce summary statistics
#sumint and sumw
altersDT[, c("sumint", "sumw") := {list(
sum(interest[id1==grp & id1!=id2]<1),
sum(interest[id1==grp & id1!=id2]/mean(interest)<1)
)}, by=.(grp)]
#ranksum and rankmean
#create tempstats
altersDT[, c("ranksum", "rankmean") := {list(
as.numeric(sum(interest[id1!=id2]<1)),
as.numeric(mean(interest[id1!=id2]))
)}, by=.(grp,id1)]
#derive stats of interest
altersDT[, c("ranksum", "rankmean") := {list(
(rank(ranksum)-1)/(length(ranksum)-1),
(rank(rankmean)-1)/(length(rankmean)-1)
)}, by=.(grp,id2)]
#Put results into key variables
subcrit<-altersDT$id1==altersDT$id2 & altersDT$id1==altersDT$grp
new.sum.interest.lessthan1<-altersDT$sumint[subcrit]
new.sum.interest.lessthan1.weighted<-altersDT$sumw[subcrit]
new.rank.sum.interest.lessthan1<-altersDT$ranksum[subcrit]
new.rank.mean<-altersDT$rankmean[subcrit]
#check that results are the same as old
table(new.sum.interest.lessthan1==sum.interest.lessthan1)
table(new.sum.interest.lessthan1.weighted==sum.interest.lessthan1.weighted)
table(new.rank.sum.interest.lessthan1==rank.sum.interest.lessthan1)
table(new.rank.mean==rank.mean)