我有一个数据集，其中一些记录被拆分成了两行（Bolus Type 以 "Dual" 开头的记录）。如果这两行的时间间隔不超过 10 分钟，我想把它们合并成一条记录。下面是一个数据示例：
<!-- https://mvnrepository.com/artifact/xalan/serializer -->
<dependency>
<groupId>xalan</groupId>
<artifactId>serializer</artifactId>
<version>2.7.2</version>
</dependency>
我目前使用的代码可以得到正确结果，但运行速度太慢，无法处理我的某些文件（> 30MB）。我考虑过先对数据取子集，使其只保留
  Time Bolus Type Bolus Volume
1 0.0 Dual (normal part only) 1
2 0.2 Dual (square part only) 2
3 0.4 Normal 3
4 0.6 Dual (normal part only) 2
5 0.8 Dual (square part only) 1
6 1.0 Normal 3
`Bolus Type` 包含“Dual *”的记录，但之后我不知道如何把结果对应回原始记录，以便把这些推注类型合并起来。
编辑: 期望的输出(根据当前代码输出)
# GENERATING SAMPLE DATA ----
# check.names = FALSE keeps the column names that contain spaces.
pt <- data.frame(
  `Time` = seq(0, 1, length.out = 6),
  `Bolus Type` = c(
    "Dual (normal part only)", "Dual (square part only)", "Normal",
    "Dual (normal part only)", "Dual (square part only)", "Normal"
  ),
  `Bolus Volume` = c(1, 2, 3, 2, 1, 3),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Search window, in units of `Time`: a square part is linked to a normal part
# only when its time is strictly less than the normal part's time + min10.
min10 <- 0.3

# PAIR EACH "DUAL NORMAL" PART WITH THE NEAREST FOLLOWING SQUARE PART ----
# Work on plain vectors instead of indexing the data frame inside the loop;
# repeated `pt$col[i]` access and row-by-row growth is what made the original
# version slow on large inputs.
bolus_type <- pt[["Bolus Type"]]
bolus_time <- pt[["Time"]]
n_rows <- nrow(pt)

# which() drops NA types, so no explicit is.na() guard is needed here.
normal_rows <- which(bolus_type == "Dual (normal part only)")
# matched_square[k] is the row of the square part linked to normal_rows[k],
# or NA when no square part was found inside the window.
matched_square <- rep(NA_integer_, length(normal_rows))

for (k in seq_along(normal_rows)) {
  i <- normal_rows[k]
  time_lim <- bolus_time[i] + min10
  if (is.na(time_lim)) {
    next # a normal part without a time cannot define a search window
  }
  # Scan forward from the row after i. The scan is bounded by the number of
  # rows and stops at the first missing time, so the loop condition can never
  # evaluate to NA. The forward scan assumes rows are ordered by Time.
  j <- i + 1L
  while (j <= n_rows && !is.na(bolus_time[j]) && bolus_time[j] < time_lim) {
    if (!is.na(bolus_type[j]) && bolus_type[j] == "Dual (square part only)") {
      matched_square[k] <- j
      # MARK THE "LINKED" RECORDS; a relabelled square part can no longer be
      # claimed by a later normal part.
      bolus_type[j] <- "Dual (square part)"
      bolus_type[i] <- "Dual (normal part)"
      break # stop at the first square part inside the window
    }
    j <- j + 1L
  }
}

paired <- !is.na(matched_square)
countDual <- sum(paired)
pt[["Bolus Type"]] <- bolus_type

# ADD ALL "DUAL TOTAL" ROWS AT THE BOTTOM IN ONE STEP ----
if (countDual > 0) {
  normal_idx <- normal_rows[paired]
  square_idx <- matched_square[paired]
  bolus_volume <- pt[["Bolus Volume"]]
  # Start from all-NA rows so any column other than the three set below stays
  # NA in the total rows.
  totals <- pt[rep(NA_integer_, countDual), , drop = FALSE]
  totals[["Time"]] <- bolus_time[normal_idx]
  totals[["Bolus Type"]] <- "Dual (total)"
  totals[["Bolus Volume"]] <- bolus_volume[normal_idx] + bolus_volume[square_idx]
  rownames(totals) <- NULL
  pt <- rbind(pt, totals)
}

# MESSAGE OUT RESULTS ----
# Parts still labelled "... only" are the ones that found no partner.
known_type <- bolus_type[!is.na(bolus_type)]
countNormalOnly <- sum(known_type == "Dual (normal part only)")
countSquareOnly <- sum(known_type == "Dual (square part only)")
message(
  "Dual:", countDual,
  " Square only:", countSquareOnly,
  " Normal only:", countNormalOnly
)
答案 0 :(得分:0)
library(dplyr)
library(stringr)
library(data.table)
# Largest allowed gap (in units of `Time`) between the parts of one dual bolus.
max_gap <- 0.2

# Step 1: keep only the "Dual" rows and group every normal part with the
# square part(s) that follow it. Index increases by one at each normal part.
pt_joined <-
  pt %>%
  filter(`Bolus Type` %like% "Dual") %>%
  mutate(Index = cumsum(`Bolus Type` %like% "normal")) %>%
  # Index 0 holds square parts seen before any normal part; they have nothing
  # to pair with and must not be merged with each other.
  filter(Index > 0) %>%
  group_by(Index) %>%
  # A lone normal part has no time gap at all. Without this filter, max() over
  # an empty set returns -Inf (with a warning) and the row would wrongly pass
  # the gap test below and be reported as a dual bolus.
  filter(n() > 1) %>%
  mutate(time_diff = max(Time - lag(Time), na.rm = TRUE)) %>%
  filter(time_diff <= max_gap) %>%
  # "Dual (normal part only)" -> "Dual (normal part)", same for square parts.
  mutate(`Bolus Type` = str_remove_all(`Bolus Type`, " only")) %>%
  ungroup() %>%
  select(-time_diff)

# Step 2: one summary row per linked group, timed at its first part and
# carrying the combined volume.
pt_dual <-
  pt_joined %>%
  group_by(Index) %>%
  summarize(
    Time = min(Time),
    `Bolus Type` = "Dual",
    `Bolus Volume` = sum(`Bolus Volume`)
  )

# Step 3: write the relabelled types back onto the original rows, then append
# the summary rows.
# NOTE(review): the join key (Time, Bolus Volume) is assumed to identify a row
# uniquely; duplicated pairs would fan out the join - confirm against real data.
pt_new <-
  pt %>%
  left_join(pt_joined, by = c("Time", "Bolus Volume")) %>%
  # Prefer the relabelled type when the row took part in a linked group.
  mutate(`Bolus Type` = coalesce(`Bolus Type.y`, `Bolus Type.x`)) %>%
  select(-contains(".")) %>%
  bind_rows(pt_dual) %>%
  select(-Index)