我有两个非常大的相关数据框,其中包含来自深度记录器的输出数据。
编辑:MWE的数据下载更轻松:
library(RCurl)
dives_log <- read.csv(text = getURL("https://raw.githubusercontent.com/sebpardo/dive-data-mwe/master/dives_log.csv"),
stringsAsFactors = FALSE)
dives_summary <- read.csv(text = getURL("https://raw.githubusercontent.com/sebpardo/dive-data-mwe/master/dives_summary.csv"),
stringsAsFactors = FALSE)
dives_log$Date <- as.POSIXct(dives_log$Date, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Start <- as.POSIXct(dives_summary$Start, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Bottom.Start <- as.POSIXct(dives_summary$Bottom.Start, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Ascent.Start <- as.POSIXct(dives_summary$Ascent.Start, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Ascent.End <- as.POSIXct(dives_summary$Ascent.End, "%Y-%m-%d %H:%M:%S", tz="GMT")
第一个(dives_log
)每10秒包含一次记录器数据:
> head(dives_log)
Date Depth
1 2010-09-11 23:03:20 6.5
2 2010-09-11 23:03:30 6.5
3 2010-09-11 23:03:40 7.0
4 2010-09-11 23:03:50 7.0
5 2010-09-11 23:04:00 7.0
6 2010-09-11 23:04:10 0.0
> nrow(dives_log)
[1] 816036
而第二个(dives_summary
)包含潜水配置文件的摘要,包括记录的每个潜水周期的开始,最低和上升时间。
> head(dives_summary)
Dive Start Bottom.Start Ascent.Start Ascent.End
1 1 2010-09-11 22:59:20 2010-09-11 23:03:20 2010-09-11 23:04:00 2010-09-11 23:04:10
2 2 2010-09-11 23:04:40 2010-09-11 23:04:50 2010-09-11 23:07:20 2010-09-11 23:08:30
3 3 2010-09-11 23:09:00 2010-09-11 23:13:00 2010-09-11 23:17:30 2010-09-11 23:18:00
4 4 2010-09-11 23:18:40 2010-09-11 23:19:00 2010-09-11 23:26:50 2010-09-11 23:27:20
5 5 2010-09-11 23:28:10 2010-09-11 23:28:50 2010-09-11 23:35:40 2010-09-11 23:36:20
6 6 2010-09-11 23:37:10 2010-09-11 23:37:30 2010-09-11 23:44:40 2010-09-11 23:45:30
> nrow(dives_summary)
[1] 12697
我想要做的是将潜水号码(dives_summary$Dive
)和潜水阶段(底部或上升)分配到dives_log
中的每个个别数据点(即行) 。到目前为止,我这样做的方法是将dives_log
中与dives_summary
每行中的时间范围匹配的行编入索引:
for(i in 1:nrow(dives_summary)) {
pos.bottom <- which(dives_log$Date >= dives_summary$Bottom.Start[i] &
dives_log$Date <= dives_summary$Ascent.Start[i])
pos.ascent <- which(dives_log$Date > dives_summary$Ascent.Start[i] &
dives_log$Date <= dives_summary$Ascent.End[i])
dives_log[pos.bottom, "Phase"] <- "bottom"
dives_log[pos.ascent, "Phase"] <- "ascent"
dives_log[pos.bottom, "Number"] <- dives_summary[i, "Dive"]
dives_log[pos.ascent, "Number"] <- dives_summary[i, "Dive"]
}
这就是诀窍,但是对于dives_summary
的12697行中的每一行,它都非常慢,for循环必须检查多个向量的逻辑语句比较800k +行长:
> head(dives_log)
Date Depth Phase Number
1 2010-09-11 23:03:20 6.5 bottom 1
2 2010-09-11 23:03:30 6.5 bottom 1
3 2010-09-11 23:03:40 7.0 bottom 1
4 2010-09-11 23:03:50 7.0 bottom 1
5 2010-09-11 23:04:00 7.0 bottom 1
6 2010-09-11 23:04:10 0.0 ascent 1
以更快的方式做到这一点的方法是什么?我想这可以使用data.table
来实现,但是我无法理解如何在两个独立的数据帧中向量化逻辑语句。任何指导将不胜感激!
答案 0 :(得分:4)
不完全确定melt
对@Frank的意义。也许他可以更好地说明。
这是使用非equi连接的版本。
#get to the bottom
dives_log[dives_summary, ':=' (
Phase = 'bottom',
Number = Dive
), on=.(Date >= Bottom_Start, Date <= Ascent_Start)]
#rise to the top
dives_log[dives_summary, ':=' (
Phase = 'ascent',
Number = Dive
), on=.(Date > Ascent_Start, Date <= Ascent_End)]
数据:
dives_log <- fread('Date,Depth
"2010-09-11 23:03:20",6.5
"2010-09-11 23:03:30",6.5
"2010-09-11 23:03:40",7.0
"2010-09-11 23:03:50",7.0
"2010-09-11 23:04:00",7.0
"2010-09-11 23:04:10",0.0')[,
Date := as.POSIXct(Date)]
cols <- c("Start","Bottom_Start","Ascent_Start","Ascent_End")
dives_summary <- fread('Dive,Start,Bottom_Start,Ascent_Start,Ascent_End
1,"2010-09-11 22:59:20","2010-09-11 23:03:20","2010-09-11 23:04:00","2010-09-11 23:04:10"
2,"2010-09-11 23:04:40","2010-09-11 23:04:50","2010-09-11 23:07:20","2010-09-11 23:08:30"
3,"2010-09-11 23:09:00","2010-09-11 23:13:00","2010-09-11 23:17:30","2010-09-11 23:18:00"
4,"2010-09-11 23:18:40","2010-09-11 23:19:00","2010-09-11 23:26:50","2010-09-11 23:27:20"
5,"2010-09-11 23:28:10","2010-09-11 23:28:50","2010-09-11 23:35:40","2010-09-11 23:36:20"
6,"2010-09-11 23:37:10","2010-09-11 23:37:30","2010-09-11 23:44:40","2010-09-11 23:45:30"')[,
(cols) := lapply(.SD, as.POSIXct), .SDcols=cols]
答案 1 :(得分:0)
在这种情况下,可以使用基于sqldf
的解决方案来避免for-loop
。方法是首先连接2个数据帧,首先匹配bottom
阶段的条件,然后UNION
输出,并加入以匹配ascent
阶段的条件。
# Data
dives_log <- read.table(text = "Date Depth
'2010-09-11 23:03:20' 6.5
'2010-09-11 23:03:30' 6.5
'2010-09-11 23:03:40' 7.0
'2010-09-11 23:03:50' 7.0
'2010-09-11 23:04:00' 7.0
'2010-09-11 23:04:10' 0.0", header = T, stringsAsFactors = F)
#Convert to POSIXct format
dives_log$Date <- as.POSIXct(dives_log$Date, format = "%Y-%m-%d %H:%M:%S")
dives_summary <- read.table(text = "Dive Start Bottom_Start Ascent_Start Ascent_End
1 '2010-09-11 22:59:20' '2010-09-11 23:03:20' '2010-09-11 23:04:00' '2010-09-11 23:04:10'
2 '2010-09-11 23:04:40' '2010-09-11 23:04:50' '2010-09-11 23:07:20' '2010-09-11 23:08:30'
3 '2010-09-11 23:09:00' '2010-09-11 23:13:00' '2010-09-11 23:17:30' '2010-09-11 23:18:00'
4 '2010-09-11 23:18:40' '2010-09-11 23:19:00' '2010-09-11 23:26:50' '2010-09-11 23:27:20'
5 '2010-09-11 23:28:10' '2010-09-11 23:28:50' '2010-09-11 23:35:40' '2010-09-11 23:36:20'
6 '2010-09-11 23:37:10' '2010-09-11 23:37:30' '2010-09-11 23:44:40' '2010-09-11 23:45:30'",
header = T, stringsAsFactor = F)
#convert columns to POSIXct format
dives_summary$Start <- as.POSIXct(dives_summary$Start, format = "%Y-%m-%d %H:%M:%S")
dives_summary$Bottom_Start <- as.POSIXct(dives_summary$Bottom_Start, format = "%Y-%m-%d %H:%M:%S")
dives_summary$Ascent_Start <- as.POSIXct(dives_summary$Ascent_Start, format = "%Y-%m-%d %H:%M:%S")
dives_summary$Ascent_End <- as.POSIXct(dives_summary$Ascent_End, format = "%Y-%m-%d %H:%M:%S")
library(sqldf)
result <- sqldf("SELECT dives_log.*, 'bottom' as Phase, dives_summary.Dive as Number
FROM dives_log, dives_summary
WHERE dives_log.Date BETWEEN dives_summary.Bottom_Start AND dives_summary.Ascent_Start
UNION
SELECT dives_log.*, 'ascent' as Phase, dives_summary.Dive as Number
FROM dives_log, dives_summary
WHERE dives_log.Date > dives_summary.Ascent_Start AND
dives_log.Date <= dives_summary.Ascent_End")
result
# Date Depth Phase Number
#1 2010-09-11 23:03:20 6.5 bottom 1
#2 2010-09-11 23:03:30 6.5 bottom 1
#3 2010-09-11 23:03:40 7.0 bottom 1
#4 2010-09-11 23:03:50 7.0 bottom 1
#5 2010-09-11 23:04:00 7.0 bottom 1
#6 2010-09-11 23:04:10 0.0 ascent 1
答案 2 :(得分:0)
使用for循环通常非常慢。我总是尝试用lapply
替换它们:
new_cols <- lapply(1:nrow(dives_summary), function(i) {
pos.bottom <- which(dives_log$Date >= dives_summary$Bottom.Start[i] &
dives_log$Date <= dives_summary$Ascent.Start[i])
pos.ascent <- which(dives_log$Date > dives_summary$Ascent.Start[i] &
dives_log$Date <= dives_summary$Ascent.End[i])
phase <- c(rep('bottom', length(pos.bottom)), rep('ascent', length(pos.ascent)))
number <- rep(i, length(c(pos.bottom, pos.ascent)))
list(Phase=phase, Number=number)
})
dives_log$Phase <- unlist(sapply(new_cols, `[[`, 1)) # Get Phase
dives_log$Number <- unlist(sapply(new_cols, `[[`, 2)) # Get Number
在我的电脑中,需要2.5分钟。如果你并行执行(我使用包mclapply
中的parallel
),则需要52秒,包含4个内核。