I have a data frame; a small reproducible example is below.
In reality, though, it has 60,000 rows and 26 variables! Here is the example data:
data <- data.frame("date" = c("2015-05-01 14:12:57",
                              "2015-05-01 14:14:57",
                              "2015-05-01 14:15:57",
                              "2015-05-01 14:42:57",
                              "2015-05-01 14:52:57"),
                   "Var1" = c(2,3,4,2,1),
                   "Var2" = c(0.53,0.3,0.34,0.12,0.91),
                   "Var3" = c(1,1,1,1,1))
data
date Var1 Var2 Var3
1 2015-05-01 14:12:57 2 0.53 1
2 2015-05-01 14:14:57 3 0.30 1
3 2015-05-01 14:15:57 4 0.34 1
4 2015-05-01 14:42:57 2 0.12 1
5 2015-05-01 14:52:57 1 0.91 1
In theory: for every row, calculate the mean (of Var1 and Var2) and the sum (of Var3) over the observations from the last 15 minutes. What I came up with produces this:
unix_timestamp Var1 Var2 Var3
1 2015-05-01 14:12:57 2.0 0.530 1
2 2015-05-01 14:14:57 2.5 0.415 2
3 2015-05-01 14:15:57 3.0 0.390 3
4 2015-05-01 14:42:57 2.0 0.120 1
5 2015-05-01 14:52:57 1.5 0.515 2
This is in fact the desired result. However, for a data frame with 60,000 rows the approach either doesn't work at all or takes 100 years (and don't forget that I actually have 26 variables).
Does anyone know how I can get rid of the loop, or how to adjust my function? I would be so grateful! I also tried sapply, but either it wasn't any faster or I was doing something wrong.
Thank you for your help!
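For reference, a naive per-row sliding-window computation of the kind described above might look like the following sketch, using the example data (an illustration of the slow approach, not the original code):

# naive sliding window: every row rescans the whole data frame, so this is O(n^2) and slow
data$date <- as.POSIXct(data$date)

roll <- function(values, fun) {
  sapply(seq_len(nrow(data)), function(i) {
    in_window <- data$date >= data$date[i] - 15 * 60 & data$date <= data$date[i]
    fun(values[in_window])
  })
}

data.frame(date = data$date,
           Var1 = roll(data$Var1, mean),
           Var2 = roll(data$Var2, mean),
           Var3 = roll(data$Var3, sum))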
Answer 0 (score: 4)
Using dplyr, we can convert date to the POSIXct class, use cut to break it into 15-minute intervals, and then take the cumulative mean and the cumulative sum of the relevant columns:
library(dplyr)

data %>%
  group_by(group = cut(as.POSIXct(date), breaks = "15 mins")) %>%
  mutate_at(vars(Var1, Var2), cummean) %>%
  mutate_at(vars(Var3), cumsum) %>%
  ungroup() %>%
  select(-group)
# date Var1 Var2 Var3
# <fct> <dbl> <dbl> <dbl>
#1 2015-05-01 14:12:57 2 0.53 1
#2 2015-05-01 14:14:57 2.5 0.415 2
#3 2015-05-01 14:15:57 3 0.39 3
#4 2015-05-01 14:42:57 2 0.12 1
#5 2015-05-01 14:52:57 1.5 0.515 2
mutate_at is used because there are 26 variables; it applies the same function to multiple columns at once.
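As a rough sketch of how this scales to many columns (the real column names are not shown in the question, so the starts_with("Var") selection and treating only Var3 as a sum are assumptions):

# hypothetical column selection: average every Var* column except Var3, which is summed
data %>%
  group_by(group = cut(as.POSIXct(date), breaks = "15 mins")) %>%
  mutate_at(vars(starts_with("Var"), -Var3), cummean) %>%
  mutate_at(vars(Var3), cumsum) %>%
  ungroup() %>%
  select(-group)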
Edit

Following @Rentrop's comment, the answer has been updated using his data.
library(dplyr)
library(purrr)

dat %>%
  mutate(date = as.POSIXct(date),
         Var1 = map_dbl(date, ~ mean(Var1[date >= (.x - (15 * 60)) & date <= .x])),
         Var2 = map_dbl(date, ~ mean(Var2[date >= (.x - (15 * 60)) & date <= .x])),
         Var3 = map_dbl(date, ~ sum(Var3[date >= (.x - (15 * 60)) & date <= .x])))
# date Var1 Var2 Var3
#1 2015-05-01 14:12:57 2.0 0.530 1
#2 2015-05-01 14:14:57 2.5 0.415 2
#3 2015-05-01 14:29:57 3.5 0.320 2
#4 2015-05-01 14:42:57 3.0 0.230 2
#5 2015-05-01 14:52:57 1.5 0.515 2
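With 26 variables, writing one map_dbl() line per column becomes tedious. One way to generalize is to wrap the window lookup in a helper and apply it with mutate_at; this is only a sketch, and roll_fun as well as the chosen column sets are assumptions rather than part of the original answer:

library(dplyr)
library(purrr)

# helper: for each timestamp, apply `fun` to the values observed in the preceding `window` seconds
roll_fun <- function(x, time, fun, window = 15 * 60) {
  map_dbl(time, ~ fun(x[time >= (.x - window) & time <= .x]))
}

dat %>%
  mutate(date = as.POSIXct(date)) %>%
  mutate_at(vars(Var1, Var2), ~ roll_fun(.x, time = date, fun = mean)) %>%  # rolling means
  mutate_at(vars(Var3), ~ roll_fun(.x, time = date, fun = sum))            # rolling sum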
Answer 1 (score: 3)
Changing the time of the third entry from 14:15 to 14:29:
require(tidyverse)
require(lubridate)

dat <- data.frame("date" = c("2015-05-01 14:12:57",
                             "2015-05-01 14:14:57",
                             "2015-05-01 14:29:57",
                             "2015-05-01 14:42:57",
                             "2015-05-01 14:52:57"),
                  "Var1" = c(2,3,4,2,1),
                  "Var2" = c(0.53,0.3,0.34,0.12,0.91),
                  "Var3" = c(1,1,1,1,1))
You can then do the following:
# convert date to POSIXct so that time arithmetic works
dat <- dat %>% mutate(date = as.POSIXct(date, tz = ""))
# for each timestamp, flag the rows that fall within the preceding 15 minutes
in_15 <- map(dat[["date"]], ~ between(dat[["date"]], left = .x - minutes(15), right = .x))
# filter on each logical vector and summarise that window
map_df(in_15, ~ filter(dat, .x) %>%
         summarise(date = last(date), Var1 = mean(Var1), Var2 = mean(Var2), Var3 = sum(Var3)))
Result:
date Var1 Var2 Var3
1 2015-05-01 14:12:57 2.0 0.530 1
2 2015-05-01 14:14:57 2.5 0.415 2
3 2015-05-01 14:29:57 3.5 0.320 2
4 2015-05-01 14:42:57 3.0 0.230 2
5 2015-05-01 14:52:57 1.5 0.515 2
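To extend this to 26 variables without listing every column in summarise(), summarise_at() can apply one function to a whole set of columns. A sketch, where the split into mean_cols and sum_cols is hypothetical and should be adapted to the real column names:

# hypothetical column groups for the full 26-variable data
mean_cols <- vars(Var1, Var2)
sum_cols  <- vars(Var3)

map_df(in_15, ~ {
  w <- filter(dat, .x)                      # rows inside the 15-minute window
  bind_cols(summarise(w, date = last(date)),
            summarise_at(w, mean_cols, mean),
            summarise_at(w, sum_cols, sum))
})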
Answer 2 (score: 2)

Here is a data.table solution that performs a non-equi join and aggregates with by = .EACHI.
library(data.table)

setDT(data)
data[, date := as.POSIXct(date)]
data[, date_min := date - 15*60]

data[data, on = .(date >= date_min, date <= date),
     .(mean(Var1), mean(Var2), sum(Var3)),
     allow.cartesian = TRUE, by = .EACHI
][, date := NULL][]
date V1 V2 V3
1: 2015-05-01 14:12:57 2.0 0.530 1
2: 2015-05-01 14:14:57 2.5 0.415 2
3: 2015-05-01 14:15:57 3.0 0.390 3
4: 2015-05-01 14:42:57 2.0 0.120 1
5: 2015-05-01 14:52:57 1.5 0.515 2
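For many columns, the aggregations in j can be built from .SD instead of being spelled out one by one. Continuing from the data prepared above, a sketch in which mean_cols (the columns to average) is an assumption to adapt to the real data and Var3 is still summed explicitly:

mean_cols <- c("Var1", "Var2")   # hypothetical: names of the columns to average

data[data, on = .(date >= date_min, date <= date),
     c(lapply(.SD, mean), .(Var3 = sum(Var3))),
     allow.cartesian = TRUE, by = .EACHI, .SDcols = mean_cols
][, date := NULL][]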
Performance: @Ronak's purrr solution has the best performance.
Unit: milliseconds
          expr     min       lq      mean   median       uq     max neval
       cole_dt  5.0338  5.40155  5.904821  5.63355  5.81995 21.6485   100
   ronak_dplyr  6.4104  6.51575  6.764089  6.60685  6.76455 11.8158   100
   ronak_purrr  3.3591  3.42850  3.629899  3.50465  3.59220  6.6374   100
 rentrop_purrr 17.6355 17.95750 18.832567 18.09150 18.77765 30.9068   100
Code to reproduce the benchmark:
library(microbenchmark)
library(data.table)
library(dplyr)
library(purrr)
library(lubridate)
data <- data.frame("date" = c("2015-05-01 14:12:57",
                              "2015-05-01 14:14:57",
                              "2015-05-01 14:29:57",
                              "2015-05-01 14:42:57",
                              "2015-05-01 14:52:57"),
                   "Var1" = c(2,3,4,2,1),
                   "Var2" = c(0.53,0.3,0.34,0.12,0.91),
                   "Var3" = c(1,1,1,1,1))
dt <- as.data.table(data)
microbenchmark(
  cole_dt = {
    dt1 <- copy(dt)
    dt1[, date := as.POSIXct(date)]
    dt1[, date_min := date - 15*60]
    dt1[dt1, on = .(date >= date_min, date <= date),
        .(mean(Var1), mean(Var2), sum(Var3)),
        allow.cartesian = TRUE, by = .EACHI
    ][, date := NULL][]
  },
  ronak_dplyr = {
    data %>%
      group_by(group = cut(as.POSIXct(date), breaks = "15 mins")) %>%
      mutate_at(vars(Var1, Var2), cummean) %>%
      mutate_at(vars(Var3), cumsum) %>%
      ungroup() %>%
      select(-group)
  },
  ronak_purrr = {
    data %>%
      mutate(date = as.POSIXct(date),
             Var1 = map_dbl(date, ~ mean(Var1[date >= (.x - (15 * 60)) & date <= .x])),
             Var2 = map_dbl(date, ~ mean(Var2[date >= (.x - (15 * 60)) & date <= .x])),
             Var3 = map_dbl(date, ~ sum(Var3[date >= (.x - (15 * 60)) & date <= .x])))
  },
  rentrop_purrr = {
    dat <- data %>% mutate(date = as.POSIXct(date, tz = ""))
    in_15 <- map(dat[["date"]], ~ between(dat[["date"]], left = .x - minutes(15), right = .x))
    map_df(in_15, ~ filter(dat, .x) %>%
             summarise(date = last(date), Var1 = mean(Var1), Var2 = mean(Var2), Var3 = sum(Var3)))
  }
)