我有一个数据帧,其中每一行代表时间序列的一部分。
我需要创建一个长达数年的总时间序列,最多数百个单位。
因此,每一行都会在特定时间段内设置一个值,然后需要恢复到给定的最大值(由maks提供)。
在此处查看示例:
代码:
library(tidyr)
library(dplyr)
# My data for 3 units
df <- structure(list(Unit = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L), .Label = c("A", "B", "C"), class = "factor"), Limit = c(850L,
655L, 500L, 1000L, 100L, 75L, 0L, 600L, 635L), Max = c(1310L,
1310L, 1310L, 1300L, 1300L, 1300L, 915L, 915L, 915L), startDate = structure(c(1483250400,
1430481600, 1546286400, 1421280000, 1498813200, 1546300800, 1420869600,
1527876000, 1463097600), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
endDate = structure(c(1496275200, 1451520000, 1609459200,
1426431600, 1527811200, 1577836800, 1433170800, 1546383600,
1464807600), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-9L), class = "data.frame")
# Doing a loop to create time series for each row
d <- NULL
for(i in 1:nrow(df)) {
d <- rbind(d, data.frame(Date = seq.POSIXt(from = df$startDate[i], to = df$endDate[i], by = "hour"),
value = df$Limit[i],
unit = df$Unit[i]))
}
# Spread it out to a nice data frame
d <- spread(d, unit, value = value)
# Left join on a global time series
globalStart <- as.POSIXct("2015-01-01 00:00:00", tz = "UTC")
globalEnd <- as.POSIXct("2021-12-01 00:00:00", tz = "UTC")
dfResult <- data.frame(Date = seq.POSIXt(from = globalStart, to = globalEnd, by = "hour"))
# Now join it together
dfResult <- left_join(dfResult, d, by = "Date")
# Add values to fill out NA with max
maks <- c(915, 1315, 900)
dfResult[is.na(dfResult[, 2]), 2] <- maks[1]
dfResult[is.na(dfResult[, 3]), 3] <- maks[2]
dfResult[is.na(dfResult[, 4]), 4] <- maks[3]
# Final result
dfResult
我的问题是我的数据集大约需要35分钟,而这个数据集只有58个单位,可能我需要成千上万个单位来做-我需要大大加快速度。
答案 0 :(得分:1)
一种选择是用以下代码替换循环:
# creates date list:
dateList <- lapply(1:nrow(df), function(x) {
seq.POSIXt(df$startDate[x], df$endDate[x], by = "hour")
})
llengths <- lengths(dateList) # lengths of date vectors
# combine everything together:
d <- data.frame(Date = do.call("c", dateList),
value = rep(df$Limit, llengths),
unit = rep(df$Unit, llengths))
您的代码的主要问题是使用rbind
。尝试避免这种情况,尤其是在循环中。
答案 1 :(得分:1)
可以尝试减少步骤数并使用data.table
。
您可以使用foverlaps()
合并到整个时间轴,然后使用dcast()
进行传播,而不是扩展原始data.frame。然后只需参考NA值即可更新。
library(data.table)
df <- structure(list(Unit = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
Limit = c(850L, 655L, 500L, 1000L, 100L, 75L, 0L, 600L, 635L), Max = c(1310L, 1310L, 1310L, 1300L, 1300L, 1300L, 915L, 915L, 915L),
startDate = structure(c(1483250400, 1430481600, 1546286400, 1421280000, 1498813200, 1546300800, 1420869600, 1527876000, 1463097600), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
endDate = structure(c(1496275200, 1451520000, 1609459200, 1426431600, 1527811200, 1577836800, 1433170800, 1546383600, 1464807600), class = c("POSIXct", "POSIXt"), tzone = "UTC")),
row.names = c(NA, -9L), class = "data.frame")
# use data.table with key
dataset <- as.data.table(df)
setkey(dataset, "startDate", "endDate")
# start - end for whole timeline
globalStart <- as.POSIXct("2015-01-01 00:00:00", tz = "UTC")
globalEnd <- as.POSIXct("2021-12-01 00:00:00", tz = "UTC")
# data.table of start and end for each period, used with `foverlaps()`
timeline_dt <- data.table(Dttm = seq.POSIXt(globalStart, globalEnd, "hour"),
endDate = seq.POSIXt(globalStart, globalEnd, "hour") + 3599L)
# not required to be keyed but may as well
setkey(timeline_dt, "Dttm", "endDate")
# join by period overlapping, retain columns of interest
dataset_result <- foverlaps(timeline_dt, dataset)[, .(Dttm, Unit, Limit)]
# same as spread, drop NA column
dataset_result <- dcast(dataset_result, Dttm ~ Unit, value.var = "Limit")[, -"NA"]
# iterate over maks, and update by reference
# i + 1L because we skip first column which is Dttm
maks <- c(915, 1315, 900)
for (i in seq_along(maks)) {
set(x = dataset_result,
i = which(is.na(dataset_result[[i+1L]])),
j = i+1L,
value = maks[i])
}