我想把相互衔接的交易记录合并成事务:从 FIRST 为 TRUE 的行开始,到 LAST 为 TRUE 的行结束,这些行被视为同一笔事务;同时需要判断该事务中的 tpt_mode 是混合的还是单一的。然后,把得到的新数据写入新增的列。目前的 for 循环在处理少量数据时没有问题,但数据量变大后运行得非常慢。如何优化这个 for 循环以提升性能?
# Collapse every transaction onto its first row.
#
# A transaction is the run of rows starting at a row with FIRST == TRUE and
# ending at the next row with LAST == TRUE. For each transaction the end
# location/date/time of the closing row, the combined mode ("both" when the
# rows of the transaction do not all share one mode) and the number of rows
# are stored on the transaction's first row.
#
# Input : `tnx` with columns FIRST, LAST, mode, end_loc, end_date, end_time
#         (plus id, start_date, start_time, start_loc for the final subset).
# Output: `tnx` gains final_end_loc, final_end_date, final_end_time,
#         final_mode and final_count (NA on rows that do not start a
#         transaction); `final_tnx` holds one row per transaction.
#
# The loop runs over plain vectors and the data frame is written once at the
# end. Assigning `tnx$col[i] <- value` inside the loop calls the data-frame
# replacement method on every closing row, which is slow on large inputs.
n <- nrow(tnx)

# Pull the columns out once; conversions are done here, not per iteration.
is_first <- tnx$FIRST
is_last <- tnx$LAST
mode_chr <- as.character(tnx$mode)
src_end_loc <- tnx$end_loc
src_end_date <- as.character(tnx$end_date)
src_end_time <- as.character(tnx$end_time)

# Preallocate the results so nothing grows (or gets recycled) in the loop.
# final_end_loc starts as logical NA and is coerced by the first assignment.
final_end_loc <- rep(NA, n)
final_end_date <- rep(NA_character_, n)
final_end_time <- rep(NA_character_, n)
final_mode <- rep(NA_character_, n)
final_count <- rep(NA_real_, n)

firstid <- 1
currTpt <- "NA"
count <- 0

# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:n would iterate over c(1, 0).
for (i in seq_len(n)) {
  if (is_first[i]) {
    # A new transaction starts here: remember its row and reset the state.
    firstid <- i
    currTpt <- mode_chr[i]
    count <- 1
  } else {
    count <- count + 1
  }
  # Any row whose mode differs from the running mode makes it mixed.
  if (mode_chr[i] != currTpt) {
    currTpt <- "both"
  }
  if (is_last[i]) {
    # Transaction closes: record the summary on its first row.
    final_end_loc[firstid] <- src_end_loc[i]
    final_end_date[firstid] <- src_end_date[i]
    final_end_time[firstid] <- src_end_time[i]
    final_mode[firstid] <- currTpt
    final_count[firstid] <- count
  }
}

# Attach the result columns in one step each.
tnx$final_end_loc <- final_end_loc
tnx$final_end_date <- final_end_date
tnx$final_end_time <- final_end_time
tnx$final_mode <- final_mode
tnx$final_count <- final_count

final_tnx <- subset(
  tnx,
  FIRST,
  select = c(
    "id", "start_date", "start_time", "final_end_date", "final_end_time",
    "start_loc", "final_end_loc", "final_mode", "final_count"
  )
)
示例数据:编辑
# Sample data: one row per trip leg. FIRST marks the row that opens a
# transaction and LAST marks the row that closes it.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
tnx <- data.frame(
  id = c("A", "A", "A", "A", "C", "C", "D", "D", "E"),
  mode = c("on", "on", "off", "on", "on", "off", "off", "off", "on"),
  start_time = c(
    "8:20:22", "17:20:22", "17:45:22", "18:20:22", "16:35:22",
    "17:20:22", "15:20:22", "16:00:22", "12:20:22"
  ),
  # NOTE(review): "27:50:22" is not a valid clock time; kept verbatim from
  # the original sample -- confirm the intended value.
  end_time = c(
    "8:45:22", "17:30:22", "18:00:22", "18:30:22", "17:00:22",
    "17:50:22", "15:45:22", "16:14:22", "27:50:22"
  ),
  start_loc = c("12", "12", "207", "12", "11", "65", "222", "32", "12"),
  end_loc = c(31, 31, 29, 11, 22, 12, 45, 31, 11),
  start_date = c(
    "6/3/2012", "6/3/2012", "6/3/2012", "6/3/2012", "6/3/2012",
    "6/3/2012", "6/3/2012", "6/3/2012", "6/3/2012"
  ),
  end_date = c(
    "6/3/2012", "6/3/2012", "6/3/2012", "6/3/2012", "6/3/2012",
    "6/3/2012", "6/3/2012", "6/3/2012", "6/3/2012"
  ),
  FIRST = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE),
  LAST = c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE)
)
图片形式的样本数据集:
预期结果:
提前致谢。
答案 0 :(得分:1)
要获得结果,您不需要循环。如果您检查事务的开始和结束位置并相应地编制索引,则代码将简化为
# Loop-free summary of the transactions in `tnx`.
#
# nFIRST / nLAST hold the row numbers where transactions open and close; the
# k-th opening row is paired with the k-th closing row.
nLAST <- which(tnx$LAST)
nFIRST <- which(tnx$FIRST)
# Every opening row needs a matching closing row for the pairing to work.
stopifnot(length(nFIRST) == length(nLAST))

# One vector of row indices per transaction. lapply() always returns a list;
# sapply() would collapse the result to a matrix whenever all transactions
# happen to have the same length, which breaks the per-transaction step
# below. seq_along() also copes with zero transactions, unlike 1:length().
count <- lapply(seq_along(nFIRST), function(i) nFIRST[i]:nLAST[i])

# Mode of each transaction: the single mode shared by all of its rows, or
# "both" when more than one distinct mode occurs.
mode <- vapply(
  count,
  function(rows) {
    modes <- unique(as.character(tnx$mode[rows]))
    if (length(modes) == 1L) modes else "both"
  },
  character(1)
)

# Start fields come from the opening row, end fields from the closing row.
final_tnx <- data.frame(
  id = tnx$id[nFIRST],
  start_date = tnx$start_date[nFIRST],
  start_time = tnx$start_time[nFIRST],
  final_end_date = tnx$end_date[nLAST],
  final_end_time = tnx$end_time[nLAST],
  start_loc = tnx$start_loc[nFIRST],
  final_end_loc = tnx$end_loc[nLAST],
  final_mode = mode,
  final_count = nLAST - nFIRST + 1
)
这应该能明显加快速度,并且在较大的数据集上也有良好的表现。
编辑:当模式允许在一笔事务内多次变化时,必须检查每个子集中模式取值的唯一性。在 count 中,我为每笔事务构建了一个行索引序列,并把它们放进一个列表;
然后对这个索引列表逐个应用一个函数,检查对应的子集中只出现了一种模式还是多种模式。
答案 1 :(得分:0)
我确信还有更多可以改进的地方,但如果在循环中尽量少对数据框做索引,并先把各列提取成向量,就能看到一定的提升。
# Benchmark of the transaction loop when it runs on plain vectors instead of
# indexing the data frame on every iteration.
#
# library() stops with an error when rbenchmark is not installed; require()
# would only return FALSE and let the script fail later at benchmark().
library("rbenchmark")

### Specify data as vectors
# as.character() is applied once here so the timed loop does no conversions.
FIRST <- tnx$FIRST
mode <- as.character(tnx$mode)
LAST <- tnx$LAST
end_date <- as.character(tnx$end_date)
end_time <- as.character(tnx$end_time)
end_loc <- tnx$end_loc

# Loop state is initialised here so the block does not depend on variables
# left behind by an earlier run of the data-frame version of the loop.
n <- length(FIRST)
firstid <- 1
currTpt <- "NA"
count <- 0

# Result vectors are preallocated at full length so they never grow inside
# the loop. final_end_loc starts as logical NA and is coerced on first write.
final_end_loc <- rep(NA, n)
final_end_date <- rep(NA_character_, n)
final_end_time <- rep(NA_character_, n)
final_mode <- rep(NA_character_, n)
final_count <- rep(NA_real_, n)

# seq_len(n) is empty for n == 0, whereas 1:n would iterate over c(1, 0).
benchmark(for (i in seq_len(n)) {
  if (FIRST[i]) {
    # A new transaction starts: remember its row and reset the state.
    firstid <- i
    currTpt <- mode[i]
    count <- 1
  } else {
    count <- count + 1
  }
  # A row whose mode differs from the running mode makes it mixed.
  if (mode[i] != currTpt) {
    currTpt <- "both"
  }
  if (LAST[i]) {
    # Transaction closes: store its summary at the first row's position.
    final_end_loc[firstid] <- end_loc[i]
    final_end_date[firstid] <- end_date[i]
    final_end_time[firstid] <- end_time[i]
    final_mode[firstid] <- currTpt
    final_count[firstid] <- count
  }
})
replications elapsed relative user.self sys.self user.child sys.child
1 100 0.11 1 0.11 0 NA NA
现在你的循环
replications elapsed relative user.self sys.self user.child sys.child
1 100 0.18 1 0.19 0 NA NA
我不确定这在大型数据集上是否同样有效,但就我过去的经验而言,把循环内的索引操作减到最少是有用的。如果这对您来说仍然不够快,或者在大数据上不起作用,可以参考 "Speed up the loop operation in R" 这篇帖子,其中有很好的讨论。