我知道这个问题并不新鲜,但我的案例包含了之前的回复无法完全解决的一些特征。
我在R中有一个非常大的数据框,称为'df'(包括1400万个元素),格式如下:
ID datetime measurem
1: 1459 2013-01-08 00:00:00 2.24
2: 1459 2013-01-08 01:00:00 2
3: 1459 2013-01-08 02:00:00 2.54
4: 1459 2013-01-08 03:00:00 3.98
5: 1459 2013-01-08 04:00:00 2
6: 1459 2013-01-08 05:00:00 2
7: 1459 2013-01-08 06:00:00 3
....
1007: 2434 2013-01-08 00:00:00 3.45
1008: 2434 2013-01-08 01:00:00 3
1009: 2434 2013-01-08 02:00:00 4
1010: 2434 2013-01-08 03:00:00 5.01
1011: 2434 2013-01-08 04:00:00 4
....
3245: 4780 2013-01-10 00:00:00 3
3246: 4780 2013-01-10 01:00:00 4.73
3247: 4780 2013-01-10 02:00:00 3
df的结构如下:
Classes 'data.table' and 'data.frame':	14103024 obs. of  3 variables:
$ ID: chr "1459" "1459" ...
$ datetime : POSIXct, format: "2013-01-08 00:00:00" "2013-01-08 01:00:00" ...
$ measurem: num 2.24 2 2.54 ...
我想先将能量数据'measurem'按天求和汇总,然后再按每天两个时段求和汇总(一个时段截至上午12点,另一个时段截至下午12点),同时保留ID列和日期。由于完整的数据框非常大,如果有运行速度相对较快的建议,我将不胜感激。
提前谢谢!
答案 0 :(得分:1)
如果我理解正确,你是想按 ID、日期和 AM/PM 对 "measurem" 列进行汇总。由于问题中没有可直接使用的样本数据,我自己构造了数据并给出解决方案:
<strong>数据</strong>:
# Reproducible toy data: 5 IDs x 4 readings each, with hand-picked hourly
# timestamps on 2013-01-08 and 2013-01-09 (both morning and afternoon hours).
set.seed(1234)
df <- data.frame(
  ID = rep(1:5, times = 4),
  datetime = c(
    "2013-01-08 00:00:00",
    "2013-01-09 01:00:00",
    "2013-01-09 13:00:00",
    "2013-01-08 02:00:00",
    "2013-01-08 15:00:00",
    "2013-01-08 16:00:00",
    "2013-01-09 01:00:00",
    "2013-01-09 02:00:00",
    "2013-01-08 03:00:00",
    "2013-01-09 18:00:00",
    "2013-01-08 14:00:00",
    "2013-01-09 19:00:00",
    "2013-01-08 11:00:00",
    "2013-01-09 10:00:00",
    "2013-01-08 18:00:00",
    "2013-01-09 19:00:00",
    "2013-01-09 03:00:00",
    "2013-01-09 02:00:00",
    "2013-01-09 21:00:00",
    "2013-01-09 11:00:00"
  ),
  # absolute values keep every simulated measurement non-negative
  measurement = abs(rnorm(n = 20))
)
<strong>解决方案</strong>:
# Derive the two grouping keys from the timestamp column:
#   ind  - "AM" for hours 0-11, "PM" for hours 12-23
#   date - the calendar date of the reading
# POSIXlt is used here because it exposes the hour as a field ($hour).
datetime <- as.POSIXlt(df[["datetime"]])
ind <- ifelse(datetime$hour >= 12, "PM", "AM")
date <- as.Date(datetime)
# Attach the keys to df (ind first, then date) so both approaches below
# can group on them.
df[["ind"]] <- ind
df[["date"]] <- date
1)data.table方式:
library(data.table)
# setDT() turns df into a data.table; dt refers to that same table.
dt <- setDT(df)
# Row count and measurement total for every ID / date / AM-PM combination.
dt[, .(count = .N, sum_measure = sum(measurement)), by = .(ID, date, ind)]
2)基础 R 方式:
# Sum measurement for every ID / AM-PM / date combination ...
fin <- aggregate(measurement ~ ID + ind + date, data = df, FUN = sum)
# ... and show the result ordered by ID.
fin[order(fin[["ID"]]), ]
ID ind date measurement
# 1 AM 2013-01-08 1.20706575
# 1 PM 2013-01-08 0.98324859
# 1 PM 2013-01-09 0.11028549
# 2 AM 2013-01-09 1.36317871
# 2 PM 2013-01-09 0.99838644
# 3 AM 2013-01-08 0.77625389
# 3 AM 2013-01-09 1.45782727
# 3 PM 2013-01-09 1.08444118
# 4 AM 2013-01-08 2.91014970
# 4 AM 2013-01-09 0.06445882
# 4 PM 2013-01-09 0.83717168
# 5 PM 2013-01-08 1.38861875
# 5 AM 2013-01-09 2.41583518
# 5 PM 2013-01-09 0.89003783
答案 1 :(得分:1)
由于生产数据集包含14 M行,OP已请求任何可以相对快速运行的建议。
不幸的是,PKumar 被接受的答案在速度和内存消耗方面效率都很低:
- 它先在工作区中创建 `datetime`、`date` 和 `ind` 等向量,随后又把它们添加到 `df` 中,因此这些数据存储了两次
- `df` 的每次更新都会复制整个对象
- 其 `data.table` 解决方案没有使用 `data.table` 语法来避免复制操作
- `POSIXlt` 需要 52 个字节来存储一个日期时间实例,而 `POSIXct` 只需要 8 个字节

下面是我建议的 `data.table` 用法:
# Build a small demo data set: 4 IDs, each with 2 days of hourly readings
# (create_sample_data() is defined further down), then inspect its structure.
df <- create_sample_data(n_hr = 2L * 24L, n_id = 4L)
str(df)
'data.frame': 192 obs. of 3 variables: $ ID : chr "000001" "000001" "000001" "000001" ... $ datetime: POSIXct, format: "2013-01-08 00:00:00" "2013-01-08 01:00:00" "2013-01-08 02:00:00" ... $ measurem: num 1.207 0.277 1.084 2.346 0.429 ...
library(data.table)
# Daily totals: one row per ID and calendar date.
# The date key is computed on the fly inside `by`, so no helper column
# has to be added to df first.
setDT(df)[
  ,
  list(sum_measurem = sum(measurem)),
  by = list(ID, date = as.IDate(datetime))
]
ID date sum_measurem 1: 000001 2013-01-08 18.01187 2: 000001 2013-01-09 22.53423 3: 000002 2013-01-08 21.77239 4: 000002 2013-01-09 15.57561 5: 000003 2013-01-08 14.79938 6: 000003 2013-01-09 20.09797 7: 000004 2013-01-08 15.21066 8: 000004 2013-01-09 25.47120
# Bi-daily (half-day) totals: one row per ID, calendar date and half of day.
# AM is TRUE for hours 0-11 and FALSE for hours 12-23, so each half covers
# exactly 12 hourly readings. Using `<= 12L` here would push the 12:00
# reading into the AM half (13 vs. 11 hours).
setDT(df)[, .(sum_measurem = sum(measurem)),
          by = .(ID, date = as.IDate(datetime), AM = hour(datetime) < 12L)]
ID date AM sum_measurem 1: 000001 2013-01-08 TRUE 10.677509 2: 000001 2013-01-08 FALSE 7.334362 3: 000001 2013-01-09 TRUE 12.456765 4: 000001 2013-01-09 FALSE 10.077470 5: 000002 2013-01-08 TRUE 12.099480 6: 000002 2013-01-08 FALSE 9.672908 7: 000002 2013-01-09 TRUE 8.672189 8: 000002 2013-01-09 FALSE 6.903426 9: 000003 2013-01-08 TRUE 8.976965 10: 000003 2013-01-08 FALSE 5.822411 11: 000003 2013-01-09 TRUE 11.131718 12: 000003 2013-01-09 FALSE 8.966252 13: 000004 2013-01-08 TRUE 8.413315 14: 000004 2013-01-08 FALSE 6.797342 15: 000004 2013-01-09 TRUE 15.111185 16: 000004 2013-01-09 FALSE 10.360017
#' Create a reproducible sample data set of hourly measurements
#'
#' Every ID gets the same run of `n_hr` consecutive hourly timestamps
#' starting at 2013-01-08 00:00 (local time zone). The random seed is fixed
#' inside the function, so repeated calls with the same arguments return
#' identical data.
#'
#' @param n_id Number of distinct IDs to generate.
#' @param n_hr Number of hourly readings per ID.
#' @return A data.frame with `n_id * n_hr` rows and the columns `ID`
#'   (zero-padded 6-digit character), `datetime` (POSIXct) and `measurem`
#'   (non-negative numeric).
create_sample_data <- function(n_id, n_hr) {
  set.seed(1234L)
  id_labels <- sprintf("%06i", seq_len(n_id))
  hourly_stamps <- seq(as.POSIXct("2013-01-08"), length.out = n_hr, by = "1 hour")
  data.frame(
    ID = rep(id_labels, each = n_hr),
    datetime = rep(hourly_stamps, times = n_id),
    measurem = abs(rnorm(n_id * n_hr)),
    stringsAsFactors = FALSE
  )
}
对于基准测试,为 100 个唯一的 `ID` 和 365 天的每小时数据创建样本数据,得到一个 876 K 行的样本数据集。由于某些解决方案会修改数据集,因此使用 `copy()` 为每次运行提供未被改动的数据集。`copy()` 本身的耗时也一并计时。
# Benchmark of the three approaches on a larger sample:
# 100 IDs x 365 days of hourly readings (24 * 365 rows per ID).
# NOTE(review): copy(), setDT(), as.IDate() and hour() are presumably the
# data.table functions - this needs library(data.table) to be attached.
df0 <- create_sample_data(n_id = 100L, n_hr = 24L * 365L)
microbenchmark::microbenchmark(
# baseline: cost of the copy alone, since every candidate below starts
# with the same copy of the pristine data
copy = df <- copy(df0),
# data.table approach: grouping keys are computed inside `by`
uwe_dt = {
df <- copy(df0)
setDT(df)[, .(sum_measurem = sum(measurem)),
by = .(ID, date = as.IDate(datetime), AM = hour(datetime) < 12L)]
},
# PKumar's data.table variant: helper vectors are built via POSIXlt and
# added to df as columns before aggregating
PKumar_dt = {
df <- copy(df0)
datetime <- as.POSIXlt(df$datetime)
date <- as.Date(datetime)
ind <- ifelse(datetime$hour >= 12,"PM","AM")
df$ind <- ind
df$date <- date
dt <- setDT(df)
dt[,list(sum_measure = sum(measurem)),by=list(ID,date,ind)]
},
# PKumar's base R variant: same preparation, aggregation via aggregate()
PKumar_baseR = {
df <- copy(df0)
datetime <- as.POSIXlt(df$datetime)
date <- as.Date(datetime)
ind <- ifelse(datetime$hour >= 12,"PM","AM")
df$ind <- ind
df$date <- date
fin <- aggregate(measurem ~ ID + date + ind, data = df, sum)
fin[order(fin$ID),]
},
# each expression is evaluated 11 times
times = 11L
)
Unit: milliseconds expr min lq mean median uq max neval copy 3.94761 4.391457 5.169909 5.537982 5.864401 5.997876 11 uwe_dt 271.89460 301.001006 339.913084 312.151541 344.251971 540.018306 11 PKumar_dt 417.57141 464.778485 575.547756 475.562955 689.848696 851.180584 11 PKumar_baseR 6356.93567 6707.847607 6896.174857 6863.069477 6903.442520 8112.316770 11
即使对于这种中等规模的问题,基础 R 解决方案也比 `data.table` 版本慢一个数量级以上(中位数约 6.9 秒对约 0.3 秒)。PKumar 的 `data.table` 解决方案中低效的数据操作带来了约 50% 的性能损失。此外,它还不必要地分配了 56 MB 的额外内存,而 `df` 本身只需要 17 MB。