在非常大的数据集中有效地索引日期范围

时间:2018-02-14 22:04:37

标签: r data.table

我有两个非常大的相关数据框,其中包含来自深度记录器的输出数据。

编辑:MWE的数据下载更轻松:

library(RCurl)
dives_log <- read.csv(text = getURL("https://raw.githubusercontent.com/sebpardo/dive-data-mwe/master/dives_log.csv"), 
                      stringsAsFactors = FALSE)
dives_summary <- read.csv(text = getURL("https://raw.githubusercontent.com/sebpardo/dive-data-mwe/master/dives_summary.csv"), 
                      stringsAsFactors = FALSE)

dives_log$Date <- as.POSIXct(dives_log$Date, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Start <- as.POSIXct(dives_summary$Start, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Bottom.Start <- as.POSIXct(dives_summary$Bottom.Start, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Ascent.Start <- as.POSIXct(dives_summary$Ascent.Start, "%Y-%m-%d %H:%M:%S", tz="GMT")
dives_summary$Ascent.End <- as.POSIXct(dives_summary$Ascent.End, "%Y-%m-%d %H:%M:%S", tz="GMT")

第一个(dives_log)每10秒包含一次记录器数据:

> head(dives_log)
                 Date Depth
1 2010-09-11 23:03:20   6.5
2 2010-09-11 23:03:30   6.5
3 2010-09-11 23:03:40   7.0
4 2010-09-11 23:03:50   7.0
5 2010-09-11 23:04:00   7.0
6 2010-09-11 23:04:10   0.0
> nrow(dives_log)
[1] 816036

而第二个(dives_summary)包含潜水配置文件的摘要,包括记录的每个潜水周期的开始,最低和上升时间。

> head(dives_summary)
  Dive               Start        Bottom.Start        Ascent.Start          Ascent.End
1    1 2010-09-11 22:59:20 2010-09-11 23:03:20 2010-09-11 23:04:00 2010-09-11 23:04:10
2    2 2010-09-11 23:04:40 2010-09-11 23:04:50 2010-09-11 23:07:20 2010-09-11 23:08:30
3    3 2010-09-11 23:09:00 2010-09-11 23:13:00 2010-09-11 23:17:30 2010-09-11 23:18:00
4    4 2010-09-11 23:18:40 2010-09-11 23:19:00 2010-09-11 23:26:50 2010-09-11 23:27:20
5    5 2010-09-11 23:28:10 2010-09-11 23:28:50 2010-09-11 23:35:40 2010-09-11 23:36:20
6    6 2010-09-11 23:37:10 2010-09-11 23:37:30 2010-09-11 23:44:40 2010-09-11 23:45:30
> nrow(dives_summary)
[1] 12697

我想要做的是将潜水号码(dives_summary$Dive)和潜水阶段(底部或上升)分配到dives_log中的每个个别数据点(即行) 。到目前为止,我这样做的方法是将dives_log中与dives_summary每行中的时间范围匹配的行编入索引:

for(i in 1:nrow(dives_summary)) {  
  pos.bottom <- which(dives_log$Date >= dives_summary$Bottom.Start[i] & 
                        dives_log$Date <= dives_summary$Ascent.Start[i])
  pos.ascent <- which(dives_log$Date > dives_summary$Ascent.Start[i] & 
                        dives_log$Date <= dives_summary$Ascent.End[i])

  dives_log[pos.bottom, "Phase"] <- "bottom"
  dives_log[pos.ascent, "Phase"] <- "ascent"
  dives_log[pos.bottom, "Number"] <- dives_summary[i, "Dive"]
  dives_log[pos.ascent, "Number"] <- dives_summary[i, "Dive"]
}

这就是诀窍,但是对于dives_summary的12697行中的每一行,它都非常慢,for循环必须检查多个向量的逻辑语句比较800k +行长:

> head(dives_log)
                 Date Depth  Phase Number
1 2010-09-11 23:03:20   6.5 bottom      1
2 2010-09-11 23:03:30   6.5 bottom      1
3 2010-09-11 23:03:40   7.0 bottom      1
4 2010-09-11 23:03:50   7.0 bottom      1
5 2010-09-11 23:04:00   7.0 bottom      1
6 2010-09-11 23:04:10   0.0 ascent      1

以更快的方式做到这一点的方法是什么?我想这可以使用data.table来实现,但是我无法理解如何在两个独立的数据帧中向量化逻辑语句。任何指导将不胜感激!

3 个答案:

答案 0 :(得分:4)

不完全确定melt对@Frank的意义。也许他可以更好地说明。

这是使用非equi连接的版本。

#get to the bottom
dives_log[dives_summary, ':=' (
    Phase = 'bottom',
    Number = Dive
    ), on=.(Date >= Bottom_Start, Date <= Ascent_Start)]

#rise to the top
dives_log[dives_summary, ':=' (
    Phase = 'ascent',
    Number = Dive
), on=.(Date > Ascent_Start, Date <= Ascent_End)]

数据:

dives_log <- fread('Date,Depth
"2010-09-11 23:03:20",6.5
"2010-09-11 23:03:30",6.5
"2010-09-11 23:03:40",7.0
"2010-09-11 23:03:50",7.0
"2010-09-11 23:04:00",7.0
"2010-09-11 23:04:10",0.0')[, 
    Date := as.POSIXct(Date)]

cols <- c("Start","Bottom_Start","Ascent_Start","Ascent_End")
dives_summary <- fread('Dive,Start,Bottom_Start,Ascent_Start,Ascent_End
1,"2010-09-11 22:59:20","2010-09-11 23:03:20","2010-09-11 23:04:00","2010-09-11 23:04:10"
2,"2010-09-11 23:04:40","2010-09-11 23:04:50","2010-09-11 23:07:20","2010-09-11 23:08:30"
3,"2010-09-11 23:09:00","2010-09-11 23:13:00","2010-09-11 23:17:30","2010-09-11 23:18:00"
4,"2010-09-11 23:18:40","2010-09-11 23:19:00","2010-09-11 23:26:50","2010-09-11 23:27:20"
5,"2010-09-11 23:28:10","2010-09-11 23:28:50","2010-09-11 23:35:40","2010-09-11 23:36:20"
6,"2010-09-11 23:37:10","2010-09-11 23:37:30","2010-09-11 23:44:40","2010-09-11 23:45:30"')[, 
    (cols) := lapply(.SD, as.POSIXct), .SDcols=cols]

答案 1 :(得分:0)

在这种情况下,可以使用基于sqldf的解决方案来避免for-loop。方法是首先连接2个数据帧,首先匹配bottom阶段的条件,然后UNION输出,并加入以匹配ascent阶段的条件。

# Data
dives_log <- read.table(text = "Date Depth
'2010-09-11 23:03:20'   6.5
'2010-09-11 23:03:30'   6.5
'2010-09-11 23:03:40'   7.0
'2010-09-11 23:03:50'   7.0
'2010-09-11 23:04:00'   7.0
'2010-09-11 23:04:10'   0.0", header = T, stringsAsFactors = F)

#Convert to POSIXct format
dives_log$Date <- as.POSIXct(dives_log$Date, format = "%Y-%m-%d %H:%M:%S")


dives_summary <- read.table(text = "Dive   Start   Bottom_Start  Ascent_Start   Ascent_End
1 '2010-09-11 22:59:20' '2010-09-11 23:03:20' '2010-09-11 23:04:00' '2010-09-11 23:04:10'
2 '2010-09-11 23:04:40' '2010-09-11 23:04:50' '2010-09-11 23:07:20' '2010-09-11 23:08:30'
3 '2010-09-11 23:09:00' '2010-09-11 23:13:00' '2010-09-11 23:17:30' '2010-09-11 23:18:00'
4 '2010-09-11 23:18:40' '2010-09-11 23:19:00' '2010-09-11 23:26:50' '2010-09-11 23:27:20'
5 '2010-09-11 23:28:10' '2010-09-11 23:28:50' '2010-09-11 23:35:40' '2010-09-11 23:36:20'
6 '2010-09-11 23:37:10' '2010-09-11 23:37:30' '2010-09-11 23:44:40' '2010-09-11 23:45:30'",
 header = T, stringsAsFactor = F)

#convert columns to POSIXct format
dives_summary$Start <- as.POSIXct(dives_summary$Start, format = "%Y-%m-%d %H:%M:%S")
dives_summary$Bottom_Start <- as.POSIXct(dives_summary$Bottom_Start, format = "%Y-%m-%d %H:%M:%S")
dives_summary$Ascent_Start <- as.POSIXct(dives_summary$Ascent_Start, format = "%Y-%m-%d %H:%M:%S")
dives_summary$Ascent_End <- as.POSIXct(dives_summary$Ascent_End, format = "%Y-%m-%d %H:%M:%S")


library(sqldf)

result <- sqldf("SELECT dives_log.*, 'bottom' as Phase, dives_summary.Dive as Number 
FROM dives_log, dives_summary 
WHERE dives_log.Date BETWEEN dives_summary.Bottom_Start AND  dives_summary.Ascent_Start
UNION 
SELECT dives_log.*, 'ascent' as Phase, dives_summary.Dive as Number 
FROM dives_log, dives_summary 
WHERE dives_log.Date > dives_summary.Ascent_Start AND
dives_log.Date <= dives_summary.Ascent_End")

result

#                 Date Depth  Phase Number
#1 2010-09-11 23:03:20   6.5 bottom      1
#2 2010-09-11 23:03:30   6.5 bottom      1
#3 2010-09-11 23:03:40   7.0 bottom      1
#4 2010-09-11 23:03:50   7.0 bottom      1
#5 2010-09-11 23:04:00   7.0 bottom      1
#6 2010-09-11 23:04:10   0.0 ascent      1

答案 2 :(得分:0)

使用for循环通常非常慢。我总是尝试用lapply替换它们:

new_cols <- lapply(1:nrow(dives_summary), function(i) {
    pos.bottom <- which(dives_log$Date >= dives_summary$Bottom.Start[i] & 
                            dives_log$Date <= dives_summary$Ascent.Start[i])
    pos.ascent <- which(dives_log$Date > dives_summary$Ascent.Start[i] & 
                            dives_log$Date <= dives_summary$Ascent.End[i])

    phase <- c(rep('bottom', length(pos.bottom)), rep('ascent', length(pos.ascent)))
    number <- rep(i, length(c(pos.bottom, pos.ascent)))
    list(Phase=phase, Number=number)
})

dives_log$Phase <- unlist(sapply(new_cols, `[[`, 1)) # Get Phase
dives_log$Number <- unlist(sapply(new_cols, `[[`, 2)) # Get Number

在我的电脑中,需要2.5分钟。如果你并行执行(我使用包mclapply中的parallel),则需要52秒,包含4个内核。