我有一年中有40天的数据和一些数据
set.seed(123)
df <- data.frame(day = 1:40,rain = runif(40,min = 0, max = 3), petc = runif(40, min = 0.3, max = 8),swc = runif(40, min = 27.01, max = 117.43))
我想计算每天另一个名为aetc的变量,计算方法如下:
SW.ini <- 2 # setting some initial values
SW.max <- 5
SW.min <- 0
第1天,
1)确定名为PAW(day1) = SW.ini + rain(day1)
2)如果PAW(day1) >= SWC(day1), aetc(day1) = petc(day1)
;
If `PAW(day1) < SWC(day1), aetc(day1) = PAW(day1)/SWC(day1) * petc(day1)`
3)检查是否aetc(day1) > PAW(day1). If yes, aetc(day1) = paw(day1)
4)更新SW(day1) = SW.ini + rain(day1) - aetc(day1)
5)如果SW(day1) > SW.max, SW(day1) = SW.max. Similarly if
SW(第1天)&lt; SW.min,SW(day1)= SW.min`
重复第2天
1)确定PAW(day2) = SW(day1) + rain(day2)
2)如果PAW(day2) >= SWC(day2), aetc(day2) = petc(day2)
;
如果PAW(day2) < SWC(day2), aetc(day2) = PAW(day2)/SWC(day2) * petc(day2)
3)检查是否aetc(day2) > PAW(day2)
。如果是,aetc(day2) = paw(day2)
4)更新SW(day2) = SW(day1) + rain(day2) - aetc(day2)
5)如果SW(day2) > SW.max, SW(day2) = SW.max. Similarly if
SW(第2天)&lt; SW.min,
SW(第2天)= SW.min`
这是我优雅的for循环:
df$PAW <- NA
df$aetc <- NA
df$SW <- NA
df$PAW[1] <- SW.ini + df$rain[1]
df$aetc[1] <- ifelse(df$PAW[1] >= df$swc[1], df$petc[1],(df$PAW[1]/df$swc[1])*df$petc[1])
df$aetc[1] <- ifelse(df$aetc[1] > df$PAW[1], df$PAW[1], df$aetc[1])
df$SW[1] <- SW.ini + df$rain[1] - df$aetc[1]
df$SW[1] <- ifelse(df$SW[1] > SW.max, SW.max, ifelse(df$SW[1] < 0, 0,df$SW[1]))
for (day in 2:nrow(df)){
df$PAW[day] <- df$SW[day - 1] + df$rain[day]
df$aetc[day] <- ifelse(df$PAW[day] >= df$swc[day], df$petc[day], (df$PAW[day]/df$swc[day]) * df$petc[day])
df$aetc[day] <- ifelse(df$aetc[day] > df$PAW[day], df$PAW[day],df$aetc[day])
df$SW[day] <- df$SW[day - 1] + df$rain[day] - df$aetc[day]
df$SW[day] <- ifelse(df$SW[day] > SW.max,SW.max, ifelse(df$SW[day] < 0, 0,df$SW[day]))
}
我的问题是,这只是一年的数据,我想要运行多年。
set.seed(123)
df <- data.frame(year = 1980:2015, day = rep(1:40, each = 36),rain =
runif(40*36,min = 0, max = 3), petc = runif(40*36, min = 0.3, max = 8),swc = runif(40*36, min = 27.01, max = 117.43))
所以我想做一些像
这样的事情 df %>% group_by(year) # and then run the above function for each year.
是否有dplyr或任何其他解决方案?
谢谢
答案 0 :(得分:5)
注意:我最初在你的后续问题R: for loop within a foreach loop上发布了这个答案,但在看到这个问题后,似乎这个答案在这里更为重要。 (我不会在我的回答中解决与并行化相关的任何问题,这是您跟进的主题)。
Rcpp
和data.table
使用C ++编译逻辑并使用data.table分组操作按组应用它可以从基线开始加速~2,000倍,远远超过您希望通过并行化获得的速度。
在您的原始示例中, 39,420,000行,这将在我的计算机上以 1.883秒执行;在 28,800行的修订后的版本中,执行 0.004秒
library(data.table)
library(Rcpp)
在R脚本中内联定义并编译C++
函数CalcSW()
:
一个注意事项:C
/ C++
中的计数从0
开始,与R
不同,后者从1
开始 - &#39 ;为什么这里的指数不同
Rcpp::cppFunction('
List CalcSW(NumericVector SW_ini,
NumericVector SW_max,
NumericVector rain,
NumericVector swc,
NumericVector PETc) {
int n = SW_ini.length();
NumericVector SW(n);
NumericVector PAW(n);
NumericVector aetc(n);
double SW_ini_glob = SW_ini[0];
double SW_max_glob = SW_max[0];
SW[0] = SW_ini_glob;
PAW[0] = SW[0] + rain[0];
if (PAW[0] > swc[0]){
aetc[0] = PETc[0];
} else {
aetc[0] = PAW[0]/swc[0]*PETc[0];
}
if (aetc[0] > PAW[0]){
aetc[0] = PAW[0];
}
SW[0] = SW[0] + rain[0] - aetc[0];
if(SW[0] > SW_max_glob){
SW[0] = SW_max_glob;
}
if(SW[0] < 0){
SW[0] = 0;
}
for (int i = 1; i < n; i++) {
PAW[i] = SW[i-1] + rain[0];
if (PAW[i] > swc[i]){
aetc[i] = PETc[i];
} else {
aetc[i] = PAW[i]/swc[i]*PETc[i];
}
if (aetc[i] > PAW[i]){
aetc[i] = PAW[i];
}
SW[i] = SW[i-1] + rain[i] - aetc[i];
if(SW[i] > SW_max_glob){
SW[i] = SW_max_glob;
}
if(SW[i] < 0){
SW[i] = 0;
}
}
return Rcpp::List::create(Rcpp::Named("SW") = SW,
Rcpp::Named("PAW") = PAW,
Rcpp::Named("aetc") = aetc);
}')
创建data.table
df <- data.table(loc.id = rep(1:10, each = 80*36),
year = rep(rep(1980:2015, each = 80), times = 10),
day = rep(rep(1:80, times = 36),times = 10),
rain = runif(10*36*80, min = 0 , max = 5),
swc = runif(10*36*80,min = 0, max = 50),
SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
SW_ini = runif(10*36*80),
PETc = runif(10*36*80, min = 0 , max = 1.3),
SW = as.numeric(NA),
PAW = as.numeric(NA),
aetc = as.numeric(NA))
setkey(df, loc.id, year, day)
对CalcSW()
和df
的每个组合在loc.id
执行函数year
,同时将返回值分配给三列:
system.time({
df[, c("SW","PAW","aetc") := CalcSW(SW_ini,
SW_max,
rain,
swc,
PETc), keyby = .(loc.id, year)]
})
...
user system elapsed
0.004 0.000 0.004
结果:
head(df)
...
loc.id year day rain swc SW_max SW_ini PETc SW PAW aetc
1: 1 1980 1 0.35813251 28.360715 177.3943 0.69116310 0.2870478 1.038675 1.049296 0.01062025
2: 1 1980 2 1.10331116 37.013022 177.3943 0.02742273 0.4412420 2.125335 1.396808 0.01665171
3: 1 1980 3 1.76680011 32.509970 177.3943 0.66273062 1.1071233 3.807561 2.483467 0.08457420
4: 1 1980 4 3.20966558 8.252797 177.3943 0.12220454 0.3496968 6.840713 4.165693 0.17651342
5: 1 1980 5 1.32498191 14.784203 177.3943 0.66381497 1.2168838 7.573160 7.198845 0.59253503
6: 1 1980 6 0.02547458 47.903637 177.3943 0.21871598 1.0864713 7.418750 7.931292 0.17988449
我并非100%肯定我完全实现了你的逻辑,但逻辑应该非常简单地调整我可能错过的东西,我以非常类似的方式实现它。 / p>
另一个注意事项:通过自动缩进和代码突出显示(无论您是否使用RStudio或Emacs),您可以更轻松地编写C++
您创建一个单独的文件,名称类似TestCode.cpp
格式如下。
然后,您可以使用Rcpp::sourceCpp("TestCode.cpp")
在R脚本中编译函数,也可以将除前三行之外的所有内容复制并粘贴为Rcpp::cppFunction()
的参数。就像我上面做的那样。
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
List CalcSW(NumericVector SW_ini,
NumericVector SW_max,
NumericVector rain,
NumericVector swc,
NumericVector PETc) {
int n = SW_ini.length();
NumericVector SW(n);
NumericVector PAW(n);
NumericVector aetc(n);
double SW_ini_glob = SW_ini[0];
double SW_max_glob = SW_max[0];
SW[0] = SW_ini_glob;
PAW[0] = SW[0] + rain[0];
if (PAW[0] > swc[0]){
aetc[0] = PETc[0];
} else {
aetc[0] = PAW[0]/swc[0]*PETc[0];
}
if (aetc[0] > PAW[0]){
aetc[0] = PAW[0];
}
SW[0] = SW[0] + rain[0] - aetc[0];
if(SW[0] > SW_max_glob){
SW[0] = SW_max_glob;
}
if(SW[0] < 0){
SW[0] = 0;
}
for (int i = 1; i < n; i++) {
PAW[i] = SW[i-1] + rain[0];
if (PAW[i] > swc[i]){
aetc[i] = PETc[i];
} else {
aetc[i] = PAW[i]/swc[i]*PETc[i];
}
if (aetc[i] > PAW[i]){
aetc[i] = PAW[i];
}
SW[i] = SW[i-1] + rain[i] - aetc[i];
if(SW[i] > SW_max_glob){
SW[i] = SW_max_glob;
}
if(SW[i] < 0){
SW[i] = 0;
}
}
return Rcpp::List::create(Rcpp::Named("SW") = SW,
Rcpp::Named("PAW") = PAW,
Rcpp::Named("aetc") = aetc);
}
答案 1 :(得分:1)
您可以将代码包装在另一个for循环中,并将每年df保存在列表中:
color
答案 2 :(得分:1)
来自Matt的data.table
插图很好地说明了data.table
的速度有多快,因为它可以在没有副本和移动数据的情况下进行计算。
但是,要回答关于使用管道的问题的关键,您可以使用group_by
和do
来完成您的目标(虽然比data.table
慢得多)
下面我设置了Matt所做的相同虚拟数据。然后我使用你的函数(但在PETc
上修复了案例)。它并不快,但它很容易遵循。
df <- data.frame(loc.id = rep(1:10, each = 80*36),
year = rep(rep(1980:2015, each = 80), times = 10),
day = rep(rep(1:80, times = 36),times = 10),
rain = runif(10*36*80, min = 0 , max = 5),
swc = runif(10*36*80,min = 0, max = 50),
SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
SW_ini = runif(10*36*80),
PETc = runif(10*36*80, min = 0 , max = 1.3)
)
my_fun <- function(df){
SW.ini <- 2 # setting some initial values
SW.max <- 5
SW.min <- 0
df$PAW <- NA
df$aetc <- NA
df$SW <- NA
df$PAW[1] <- SW.ini + df$rain[1]
df$aetc[1] <- ifelse(df$PAW[1] >= df$swc[1], df$PETc[1],(df$PAW[1]/df$swc[1])*df$PETc[1])
df$aetc[1] <- ifelse(df$aetc[1] > df$PAW[1], df$PAW[1], df$aetc[1])
df$SW[1] <- SW.ini + df$rain[1] - df$aetc[1]
df$SW[1] <- ifelse(df$SW[1] > SW.max, SW.max, ifelse(df$SW[1] < 0, 0,df$SW[1]))
for (day in 2:nrow(df)){
df$PAW[day] <- df$SW[day - 1] + df$rain[day]
df$aetc[day] <- ifelse(df$PAW[day] >= df$swc[day], df$PETc[day], (df$PAW[day]/df$swc[day]) * df$PETc[day])
df$aetc[day] <- ifelse(df$aetc[day] > df$PAW[day], df$PAW[day],df$aetc[day])
df$SW[day] <- df$SW[day - 1] + df$rain[day] - df$aetc[day]
df$SW[day] <- ifelse(df$SW[day] > SW.max,SW.max, ifelse(df$SW[day] < 0, 0,df$SW[day]))
}
return(df)
}
library(tictoc)
library(tidyverse)
tic()
df %>%
group_by(year) %>%
do(my_fun(.)) ->
out
toc()
#> 5.075 sec elapsed