在两个df
s上使用rain
的{{1}}每日值year
(2003年和2004年)
site
和set.seed(123)
date <- rep(as.Date(seq(as.Date("2003-01-01"),
as.Date("2004-12-31"), by = 1),
format="%Y-%m-%d"), 2)
site <- c(rep("Site_1", 731), rep("Site_2", 731))
rain <- c(as.numeric(sample(1.1e6:87e6, 731, replace=T)),
as.numeric(sample(4.5e5:56e5, 731, replace=T)))
df <- data.frame(date, site, rain)
每月测量两个参数df1
和param_A
。对于这两个参数,有时每天有多个测量值。
param_B
我将两个data.frames(set.seed(123)
date <- rep(c(as.Date("2003-01-05"),as.Date("2003-02-12"), as.Date("2003-03-07"), as.Date("2003-04-07"),
as.Date("2003-05-13"),as.Date("2003-06-17"), as.Date("2003-07-16"), as.Date("2003-08-23"),
as.Date("2003-09-24"),as.Date("2003-10-18"), as.Date("2003-11-08"), as.Date("2003-12-28"),
rep(as.Date("2003-03-01"), 3), rep(as.Date("2003-05-14"), 3), rep(as.Date("2003-08-21"), 3),
as.Date("2004-01-14"),as.Date("2004-02-02"), as.Date("2004-03-17"), as.Date("2004-04-03"),
as.Date("2004-05-05"),as.Date("2004-06-17"), as.Date("2004-07-04"), as.Date("2004-08-08"),
as.Date("2004-09-16"),as.Date("2004-10-24"), as.Date("2004-11-09"), as.Date("2004-12-07"),
rep(as.Date("2004-09-07"), 4), rep(as.Date("2004-10-27"), 2), rep(as.Date("2004-12-14"), 4)), 4)
site <- c(rep("Site_1", 86), rep("Site_2", 86))
parameter <- rep(c(rep("param_A", 43), rep("param_B", 43)), 2)
value <- c(runif(43, 0.05, 2.3)/1e6,
runif(43, 0.03, 3.1)/1e6,
runif(43, 0.004, 0.09)/1e6,
runif(43, 0.001, 0.07)/1e6)
df1 <- data.frame(date, site, parameter, value)
和df
)合并为一个data.frame df1
df2
对于每个网站,我想计算每个df2 <- merge(df, df1, by= c("date", "site"), all=T)
,对于每个year
,一个变量让它命名为parameter
,等于
saturation
(sum(rain*value)/sum(rain)) for days where value is not NA * sum(rain for whole year)
在这两个网站中的每一个网站上,library(dplyr)
df_fin <- df2 %>%
mutate(year = factor(format(date, "%Y"))) %>%
arrange(site, year, parameter) %>%
group_by(site, year, parameter ) %>%
summarise(sum_rain = sum(rain),
specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
和param_A
在几天内每天都会进行多次测量。因此,在计算公式中param_B
的总和时,会添加每天的重复降雨值,每年增加rain
。
有没有办法每天保持一个降雨价值,而不是将这些重复的每日价值加起来?
更新
更具体地说,sum(daily rain values)
使用sum()
的每日rain
值将如下所示
df
在library(dplyr)
df_1 <- df %>%
mutate(year = factor(format(date, "%Y"))) %>%
arrange(site, year) %>%
group_by(site, year) %>%
summarise(sum_rain = sum(rain))
df_1
#Source: local data frame [4 x 3]
#Groups: site [?]
# site year sum_rain
# (fctr) (fctr) (dbl)
#1 Site_1 2003 16039370227
#2 Site_1 2004 16170122189
#3 Site_2 2003 1078275774
#4 Site_2 2004 1090111362
中,df_fin
与上述不同
sum_rain
我希望df_fin <- df2 %>%
mutate(year = factor(format(date, "%Y"))) %>%
arrange(site, year, parameter) %>%
group_by(site, year, parameter ) %>%
summarise(sum_rain = sum(rain),
specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
df_fin
site year parameter sum_rain specific_days saturation
(fctr) (fctr) (fctr) (dbl) (dbl) (dbl)
1 Site_1 2003 param_A 916392248 1.342250e-06 1230.027083
2 Site_1 2003 param_B 916392248 1.263414e-06 1157.783068
3 Site_1 2003 NA 15413670047 NaN NaN
4 Site_1 2004 param_A 949808653 1.253945e-06 1191.007549
5 Site_1 2004 param_B 949808653 1.517257e-06 1441.104187
6 Site_1 2004 NA 15623884439 NaN NaN
7 Site_2 2003 param_A 59690558 5.113758e-08 3.052431
8 Site_2 2003 param_B 59690558 3.592397e-08 2.144322
9 Site_2 2003 NA 1038708704 NaN NaN
10 Site_2 2004 param_A 76792434 4.508143e-08 3.461913
11 Site_2 2004 param_B 76792434 2.958886e-08 2.272201
12 Site_2 2004 NA 1035805843 NaN NaN
中sum_rain
与df_fin
中的{{1>} 相同 1>
答案 0 :(得分:1)
可能有帮助
library(dplyr)
library(lubridate)
df %>%
group_by(site, Year = year(date)) %>%
mutate(sum_rain = sum(rain)) %>%
full_join(., df1, by = c('date', 'site')) %>%
filter(!is.na(parameter)) %>%
group_by(parameter, add=TRUE) %>%
summarise(sum_rain = first(sum_rain),
specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
# site Year parameter sum_rain specific_days saturation
# <fctr> <dbl> <fctr> <dbl> <dbl> <dbl>
#1 Site_1 2003 param_A 16039370227 0.00000134224955 21528.83747
#2 Site_1 2003 param_B 16039370227 0.00000126341430 20264.36967
#3 Site_1 2004 param_A 16170122189 0.00000125394472 20276.43941
#4 Site_1 2004 param_B 16170122189 0.00000151725738 24534.23720
#5 Site_2 2003 param_A 1078275774 0.00000005113758 55.14042
#6 Site_2 2003 param_B 1078275774 0.00000003592397 38.73595
#7 Site_2 2004 param_A 1090111362 0.00000004508143 49.14378
#8 Site_2 2004 param_B 1090111362 0.00000002958886 32.25515
如果我们也想要NA行,请删除filter
步骤
df %>%
group_by(site, Year = year(date)) %>%
mutate(sum_rain = sum(rain)) %>%
full_join(., df1, by = c('date', 'site')) %>%
group_by(parameter, add=TRUE) %>%
summarise(sum_rain = first(sum_rain),
specific_days = sum(rain*value,
na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
# site Year parameter sum_rain specific_days saturation
# <fctr> <dbl> <fctr> <dbl> <dbl> <dbl>
#1 Site_1 2003 param_A 16039370227 0.00000134224955 21528.83747
#2 Site_1 2003 param_B 16039370227 0.00000126341430 20264.36967
#3 Site_1 2003 NA 16039370227 NaN NaN
#4 Site_1 2004 param_A 16170122189 0.00000125394472 20276.43941
#5 Site_1 2004 param_B 16170122189 0.00000151725738 24534.23720
#6 Site_1 2004 NA 16170122189 NaN NaN
#7 Site_2 2003 param_A 1078275774 0.00000005113758 55.14042
#8 Site_2 2003 param_B 1078275774 0.00000003592397 38.73595
#9 Site_2 2003 NA 1078275774 NaN NaN
#10 Site_2 2004 param_A 1090111362 0.00000004508143 49.14378
#11 Site_2 2004 param_B 1090111362 0.00000002958886 32.25515
#12 Site_2 2004 NA 1090111362 NaN NaN