actual_date=c('2018-01-03','2018-01-02','2018-01-25','2018-01-15','2018-01-06','2018-01-02','2018-01-16','2018-01-22','2018-01-03')
date_band=c('_201801','_201801','_201803','_201802','_201801', '_201801','_201803','_201804','_201801')
action=c('text','letter','call','letter','visit','letter','text','text','call')
unique_ref=c(1,1,2,1,2,3,3,4,4)
df1=as.data.frame(cbind(unique_ref,actual_date,date_band,action))
unique_ref=c(1,2,3,4)
priority_201801=c('3','2','3','0')
balance_201801=c('30','-20','35','-100')
priority_201802=c('1','1','1','2')
balance_201802=c('60','-40','35','0')
priority_201803=c('2','2','3','2')
balance_201803=c('30','-40','-50','100')
priority_201804=c('99','0','0','0')
balance_201804=c('0','-20','-50','-100')
df2=as.data.frame(cbind(unique_ref,priority_201801,balance_201801,priority_201802,balance_201802,priority_201803,
balance_201803,priority_201804,balance_201804))
上面的代码产生了我正在使用的两个数据集示例。
df1看起来像这样:
unique_ref actual_date date_band action
1 1 2018-01-03 _201801 text
2 1 2018-01-02 _201801 letter
3 2 2018-01-25 _201803 call
4 1 2018-01-15 _201802 letter
5 2 2018-01-06 _201801 visit
6 3 2018-01-02 _201801 letter
7 3 2018-01-16 _201803 text
8 4 2018-01-22 _201804 text
9 4 2018-01-03 _201801 call
df2如下:
unique_ref priority_201801 balance_201801 priority_201802 balance_201802 priority_201803 balance_201803 priority_201804 balance_201804
1 1 3 30 1 60 2 30 99 0
2 2 2 -20 1 -40 2 -40 0 -20
3 3 3 35 1 35 3 -50 0 -50
4 4 0 -100 2 0 2 100 0 -100
我想做的是在df2中添加一列,指出action_dateband(即action_201801,action_201802等)。这将从df1中使用date_band的操作中获取,并在unqiue_ref上进行匹配。
所需的输出如下:如果一个星期有两个,则两个动作之间会出现逗号。
unique_ref priority_201801 balance_201801 action_201801 priority_201802 balance_201802 action_201802 priority_201803 balance_201803 action_201803
1 1 3 30 text,letter 1 60 letter 2 30
2 2 2 -20 visit 1 -40 2 -40 call
3 3 3 35 letter 1 35 3 -50 text
4 4 0 -100 call 2 0 2 100
priority_201804 balance_201804 action_201804
1 99 0
2 0 -20
3 0 -50
4 0 -100 text
答案 0 :(得分:2)
library(tidyverse)
df2 %>%
left_join(df1, by=c("unique_ref")) %>% # join df1 to df2
select(-actual_date) %>% # remove column you won't need
mutate(date_band = paste0("action", date_band)) %>% # update column values
spread(date_band, action) # reshape to get the format you want
# unique_ref priority_201801 balance_201801 priority_201802 balance_201802 priority_201803 balance_201803
# 1 1 3 30 1 60 2 30
# 2 2 2 -20 1 -40 2 -40
# 3 3 3 35 1 35 3 -50
# 4 4 0 -100 2 0 2 100
# priority_201804 balance_201804 action_201801 action_201802 action_201803 action_201804
# 1 99 0 text letter <NA> <NA>
# 2 0 -20 visit <NA> call <NA>
# 3 0 -50 letter <NA> text <NA>
# 4 0 -100 call <NA> <NA> text
对于特定的ref
和date_band
有多个操作的情况,可以使用以下方法:
library(tidyverse)
# update df1
df1_upd = df1 %>%
group_by(unique_ref, date_band) %>% # for every combination of ref and date_band
summarise(action = paste0(action, collapse = ",")) %>% # combine actions
ungroup() # forget the grouping
df2 %>%
left_join(df1_upd, by=c("unique_ref")) %>%
mutate(date_band = paste0("action", date_band)) %>%
spread(date_band, action)
# unique_ref priority_201801 balance_201801 priority_201802 balance_201802 priority_201803 balance_201803
# 1 1 3 30 1 60 2 30
# 2 2 2 -20 1 -40 2 -40
# 3 3 3 35 1 35 3 -50
# 4 4 0 -100 2 0 2 100
# priority_201804 balance_201804 action_201801 action_201802 action_201803 action_201804
# 1 99 0 text,letter letter <NA> <NA>
# 2 0 -20 visit <NA> call <NA>
# 3 0 -50 letter <NA> text <NA>
# 4 0 -100 call <NA> <NA> text