我有两个数据框。df1 是一个数据框,其中每个单位(unitID)包含多个位置(locationID)。df2 包含所有位置的每日最高温度(df2$tmax)观测值。对于 df1 中的每个单位,我想按日期计算该单位内所有位置的每日最高温度的平均值。
以下代码生成每个数据帧的示例。我需要将其扩展到约240个单位和8年的每日数据。
R 中这类查找/匹配问题似乎总能难住我。一定有一种显而易见的做法,但除了真正蛮力的连接(join)之类的办法之外,我目前毫无头绪。
# Example lookup table: each row assigns one location (locationID) to a unit
# (unitID). The same location can appear under more than one unit.
df1 <- data.frame(
  unitID = rep(
    c("98008", "98065", "98146", "98584"),
    times = c(4L, 4L, 4L, 3L)
  ),
  locationID = c(
    # unit 98008
    "USW00094290", "USW00094248", "USW00024234", "USC00454169",
    # unit 98065
    "USC00458508", "USS0021B60S", "USR0000WFTA", "USC00451233",
    # unit 98146
    "USW00024234", "USW00024233", "USW00094248", "USC00454169",
    # unit 98584
    "USW00094227", "USC00451939", "USC00455086"
  ),
  stringsAsFactors = FALSE
)
df1
unitID locationID
1 98008 USW00094290
2 98008 USW00094248
3 98008 USW00024234
4 98008 USC00454169
5 98065 USC00458508
6 98065 USS0021B60S
7 98065 USR0000WFTA
8 98065 USC00451233
9 98146 USW00024234
10 98146 USW00024233
11 98146 USW00094248
12 98146 USC00454169
13 98584 USW00094227
14 98584 USC00451939
15 98584 USC00455086
# Example observations: one tmax value per location (id) for each of two
# consecutive dates, stored with tibble classes so it prints as a tibble when
# the tibble package is loaded.
df2 <- structure(
  list(
    # Every location contributes two consecutive rows (one per date).
    id = rep(
      c(
        "USW00094290", "USW00094248", "USW00024234", "USC00454169",
        "USC00458508", "USS0021B60S", "USR0000WFTA", "USC00451233",
        "USW00024233", "USW00094227", "USC00451939", "USC00455086"
      ),
      each = 2L
    ),
    # The two dates alternate in step with the repeated ids.
    date = rep(as.Date(c("2017-01-01", "2017-01-02")), times = 12L),
    tmax = c(
      28, 28, 28, 28, 33, 28, 33, 28,
      -11, -28, -17, -50, 11, -17, 0, -11,
      28, 11, 44, 33, 50, 39, 39, 28
    )
  ),
  row.names = c(NA, -24L),
  class = c("tbl_df", "tbl", "data.frame")
)
df2
# A tibble: 24 x 3
id date tmax
<chr> <date> <dbl>
1 USW00094290 2017-01-01 28
2 USW00094290 2017-01-02 28
3 USW00094248 2017-01-01 28
4 USW00094248 2017-01-02 28
5 USW00024234 2017-01-01 33
6 USW00024234 2017-01-02 28
7 USC00454169 2017-01-01 33
8 USC00454169 2017-01-02 28
9 USC00458508 2017-01-01 -11
10 USC00458508 2017-01-02 -28
# ... with 14 more rows
输出应包括 unitID、日期以及平均最高温度:
unitID date avg_temp
98008 2009-01-01 30.5
98008 2009-01-02 ...
98008 2009-01-03 ...
答案 0 :(得分:1)
我们可以使用
left_join(以 locationID 为键)连接两个数据框,再用 group_by 按 unitID 等列分组,并取 tmax 的 mean。在 base R 中也可以完成同样的计算,所用函数见本答案末尾。
library(dplyr)
# Attach each location's daily observations to every unit that contains it,
# then average tmax over all locations of a unit for each date.
# Grouping is by (unitID, date): grouping by (unitID, locationID) would instead
# average each location over time, which is not the per-unit daily mean the
# question asks for. na.rm = TRUE skips missing readings; ungroup() returns a
# plain (ungrouped) tibble so later verbs are not silently applied per unit.
df1 %>%
  left_join(df2, by = c("locationID" = "id")) %>%
  group_by(unitID, date) %>%
  summarise(avg_temp = mean(tmax, na.rm = TRUE)) %>%
  ungroup()
#   unitID date       avg_temp
#   <chr>  <date>        <dbl>
# 1 98008  2017-01-01    30.5
# 2 98008  2017-01-02    28
# 3 98065  2017-01-01    -4.25
# 4 98065  2017-01-02   -26.5
# 5 98146  2017-01-01    30.5
# 6 98146  2017-01-02    23.8
# 7 98584  2017-01-01    44.3
# 8 98584  2017-01-02    33.3
在 base R 中,也可以使用 merge 和 aggregate 得到同样的结果。
答案 1 :(得分:0)
我们可以使用 data.table 进行联接(join):
library(data.table)
# setDT() converts df1 and df2 to data.tables by reference (no copy). The join
# looks up, for every observation row of df2, the unit(s) in df1 that contain
# its location; a location shared by several units yields one row per unit.
# The second step averages tmax per (unitID, date) -- not per
# (unitID, locationID) -- so the result is the per-unit daily mean requested.
setDT(df1)[setDT(df2), on = .(locationID = id)][,
  .(avg_temp = mean(tmax, na.rm = TRUE)), by = .(unitID, date)]
#    unitID       date  avg_temp
# 1:  98008 2017-01-01  30.50000
# 2:  98008 2017-01-02  28.00000
# 3:  98146 2017-01-01  30.50000
# 4:  98146 2017-01-02  23.75000
# 5:  98065 2017-01-01  -4.25000
# 6:  98065 2017-01-02 -26.50000
# 7:  98584 2017-01-01  44.33333
# 8:  98584 2017-01-02  33.33333