下面是一个脚本,用于获取人员统计信息并在最近6天之前进行滚动平均。我希望最近的日期比以后的日期更具影响力。
如果可能的话:
有两种创建one_df
和two_df
下方的滚动平均值的方法,我在实际脚本中使用第一种方法,但是我添加了第二种方法,以防在权重中写得更容易功能。
library(dplyr)
library(lubridate)
# Create DataFrame
df<- data.frame(name=c('CAREY.FAKE','CAREY.FAKE','CAREY.FAKE','CAREY.FAKE','CAREY.FAKE','CAREY.FAKE','CAREY.FAKE',
'JOHN.SMITH','JOHN.SMITH','JOHN.SMITH','JOHN.SMITH','JOHN.SMITH','JOHN.SMITH','JOHN.SMITH',
'JEFF.JOHNSON','JEFF.JOHNSON','JEFF.JOHNSON','JEFF.JOHNSON','JEFF.JOHNSON','JEFF.JOHNSON','JEFF.JOHNSON',
'SARA.JOHNSON','SARA.JOHNSON','SARA.JOHNSON','SARA.JOHNSON','SARA.JOHNSON','SARA.JOHNSON','SARA.JOHNSON'
),
GA=c(3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20),
SV=c(3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20),
GF=c(3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20),
SA=c(3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20,3,2,1,1,2,3,20),
date=c("10/20/2016","10/19/2016","10/18/2016","10/17/2016","10/16/2016","10/15/2016","10/14/2016",
"10/20/2016","10/19/2016","10/18/2016","10/17/2016","10/16/2016","10/15/2016","10/14/2016",
"10/20/2016","10/19/2016","10/18/2016","10/17/2016","10/16/2016","10/15/2016","10/14/2016",
"10/20/2016","10/19/2016","10/18/2016","10/17/2016","10/16/2016","10/15/2016","10/14/2016"
),
stringsAsFactors = FALSE)
one_df <- df %>%
group_by(name) %>%
arrange(name, mdy(date)) %>%
summarise_at(2:5, funs(mean(tail(., 6))))
two_df <- df %>%
group_by(name) %>%
top_n(mdy(date), n = 6) %>%
summarise_at(2:5, mean)
DF:
name GA SV GF SA date
CAREY.FAKE 3 3 3 3 10/20/2016
CAREY.FAKE 2 2 2 2 10/19/2016
CAREY.FAKE 1 1 1 1 10/18/2016
CAREY.FAKE 1 1 1 1 10/17/2016
CAREY.FAKE 2 2 2 2 10/16/2016
CAREY.FAKE 3 3 3 3 10/15/2016
CAREY.FAKE 20 20 20 20 10/14/2016
JOHN.SMITH 3 3 3 3 10/20/2016
JOHN.SMITH 2 2 2 2 10/19/2016
JOHN.SMITH 1 1 1 1 10/18/2016
JOHN.SMITH 1 1 1 1 10/17/2016
JOHN.SMITH 2 2 2 2 10/16/2016
JOHN.SMITH 3 3 3 3 10/15/2016
JOHN.SMITH 20 20 20 20 10/14/2016
JEFF.JOHNSON 3 3 3 3 10/20/2016
JEFF.JOHNSON 2 2 2 2 10/19/2016
JEFF.JOHNSON 1 1 1 1 10/18/2016
JEFF.JOHNSON 1 1 1 1 10/17/2016
JEFF.JOHNSON 2 2 2 2 10/16/2016
JEFF.JOHNSON 3 3 3 3 10/15/2016
JEFF.JOHNSON 20 20 20 20 10/14/2016
SARA.JOHNSON 3 3 3 3 10/20/2016
SARA.JOHNSON 2 2 2 2 10/19/2016
SARA.JOHNSON 1 1 1 1 10/18/2016
SARA.JOHNSON 1 1 1 1 10/17/2016
SARA.JOHNSON 2 2 2 2 10/16/2016
SARA.JOHNSON 3 3 3 3 10/15/2016
SARA.JOHNSON 20 20 20 20 10/14/2016
结果:
name GA SV GF SA
CAREY.FAKE 2 2 2 2
JEFF.JOHNSON 2 2 2 2
JOHN.SMITH 2 2 2 2
SARA.JOHNSON 2 2 2 2
预期结果:
name GA SV GF SA
CAREY.FAKE 2.05 2.05 2.05 2.05
JEFF.JOHNSON 2.05 2.05 2.05 2.05
JOHN.SMITH 2.05 2.05 2.05 2.05
SARA.JOHNSON 2.05 2.05 2.05 2.05
答案 0 :(得分:4)
我相信这种混淆来自于您并不是真正想要移动平均线而是简单的加权平均线的事实:
weights <- c(.5,.5,.3,.3,.2,.2)
df %>%
group_by(name) %>%
arrange(desc(date)) %>% # sort dates ...
slice(1:6) %>% # ... in order to keep only 6 most recent
summarise_at(vars(-date,-name),
~sum(.*weights)/sum(weights)) # apply weighted average
# # A tibble: 4 x 5
# name GA SV GF SA
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 CAREY.FAKE 2.05 2.05 2.05 2.05
# 2 JEFF.JOHNSON 2.05 2.05 2.05 2.05
# 3 JOHN.SMITH 2.05 2.05 2.05 2.05
# 4 SARA.JOHNSON 2.05 2.05 2.05 2.05
答案 1 :(得分:3)
您可以使用TTR软件包中的加权移动平均值WMA
函数获得结果。权重应用于为期间长度(n = 6
)选择的记录记录。权重应与期间的长度相同。
library(dplyr)
library(lubridate)
library(purrr)
df %>%
group_by(name) %>%
arrange(name, mdy(date)) %>%
mutate_at(2:5, TTR::WMA, n = 6, wts = c(.2, .2, .3, .3, .5, .5))
# A tibble: 28 x 6
# Groups: name [4]
name GA SV GF SA date
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 CAREY.FAKE NA NA NA NA 10/14/2016
2 CAREY.FAKE NA NA NA NA 10/15/2016
3 CAREY.FAKE NA NA NA NA 10/16/2016
4 CAREY.FAKE NA NA NA NA 10/17/2016
5 CAREY.FAKE NA NA NA NA 10/18/2016
6 CAREY.FAKE 3.50 3.50 3.50 3.50 10/19/2016
7 CAREY.FAKE 2.05 2.05 2.05 2.05 10/20/2016
8 JEFF.JOHNSON NA NA NA NA 10/14/2016
9 JEFF.JOHNSON NA NA NA NA 10/15/2016
10 JEFF.JOHNSON NA NA NA NA 10/16/2016
# ... with 18 more rows
或者将NA过滤掉:
df %>%
group_by(name) %>%
arrange(name, mdy(date)) %>%
mutate_at(2:5, TTR::WMA, n = 6, wts = c(.2, .2, .3, .3, .5, .5)) %>%
filter(!is.na(GA))
# A tibble: 8 x 6
# Groups: name [4]
name GA SV GF SA date
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 CAREY.FAKE 3.50 3.50 3.50 3.50 10/19/2016
2 CAREY.FAKE 2.05 2.05 2.05 2.05 10/20/2016
3 JEFF.JOHNSON 3.50 3.50 3.50 3.50 10/19/2016
4 JEFF.JOHNSON 2.05 2.05 2.05 2.05 10/20/2016
5 JOHN.SMITH 3.50 3.50 3.50 3.50 10/19/2016
6 JOHN.SMITH 2.05 2.05 2.05 2.05 10/20/2016
7 SARA.JOHNSON 3.50 3.50 3.50 3.50 10/19/2016
8 SARA.JOHNSON 2.05 2.05 2.05 2.05 10/20/2016
如果周期窗口没有足够的值,我们可以创建一个函数并将其包装在purrr的possible
函数中,以在函数失败时返回NA。在下面的示例中,我从“ CAREY.FAKE”中删除了2条记录以显示结果。
my_func <- function(x){
TTR::WMA(x, n = 6, wts = c(.2, .2, .3, .3, .5, .5))
}
df %>%
group_by(name) %>%
arrange(name, mdy(date)) %>%
mutate_at(2:5, possibly(my_func, otherwise = NA_real_))
# A tibble: 26 x 6
# Groups: name [4]
name GA SV GF SA date
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 CAREY.FAKE NA NA NA NA 10/14/2016
2 CAREY.FAKE NA NA NA NA 10/15/2016
3 CAREY.FAKE NA NA NA NA 10/16/2016
4 CAREY.FAKE NA NA NA NA 10/17/2016
5 CAREY.FAKE NA NA NA NA 10/18/2016
6 JEFF.JOHNSON NA NA NA NA 10/14/2016
7 JEFF.JOHNSON NA NA NA NA 10/15/2016
8 JEFF.JOHNSON NA NA NA NA 10/16/2016
9 JEFF.JOHNSON NA NA NA NA 10/17/2016
10 JEFF.JOHNSON NA NA NA NA 10/18/2016
# ... with 16 more rows
答案 2 :(得分:0)
如果您运行代码的一部分,在这里您还可以看到权重。本质上,它的作用与上面的答案相同。
df <- df %>% mutate(distance_to_today = today() - as.Date(date, tryFormats = c("%m/%d/%Y")) ) %>%
arrange(name, distance_to_today) %>%
group_by(name) %>% mutate(rank=rank(distance_to_today)) %>%
mutate(weight=ifelse(rank<=2,0.5,ifelse(rank<=4,0.3,ifelse(rank<=6,0.2,0)))) %>%
group_by(name) %>% summarise(GA=sum(GA*weight)/sum(weight),
SV=sum(SV*weight)/sum(weight), GF=sum(GF*weight)/sum(weight),
SA=sum(SA*weight)/sum(weight))
答案 3 :(得分:0)
似乎有一个简单的解决方案,只需扩展您的初始代码即可:
array(
0 => array(
'parent' => 'parent 1',
'child' => array(
'1' => 'child 1',
'2' => 'child 2'
)
),
1 => array(
'parent' => 'parent 2',
'child' => array(
'1' => 'child 1',
'2' => 'child 2'
)
)
)