按子组补全缺失的时间序列数据

时间:2018-06-18 20:47:20

标签: python r missing-data

我有以下数据集:

在Python中创建df:

# Build the sample frame. Each inner list holds the values of one column,
# so the 4x5 frame is transposed (.T) into 5 rows by 4 columns.
# Note: the comma after the first list is required -- without it Python
# parses `[...][...]` as indexing the first list and raises TypeError.
df = pd.DataFrame([
    ['04-01', '04-02', '04-02', '04-03', '04-05'],
    ['Red', 'Blue', 'Yellow', 'Red', 'Blue'],
    [1, 2, 1, 1, 2],
    [10, 20, 10, 10, 20],
]).T
df.columns = ['date', 'color', 'quant', 'revenue']

或者在R:

# Sample data: five observed rows with date, color, quant and revenue.
date <- c("04-01", "04-02", "04-02", "04-03", "04-05")
color <- c("Red", "Blue", "Yellow", "Red", "Blue")
quant <- c(1, 2, 1, 1, 2)
revenue <- c(10, 20, 10, 10, 20)
df <- data.frame(date = date, color = color, quant = quant, revenue = revenue)

DF:

date    color quant revenue
04-01     Red     1      10
04-02    Blue     2      20
04-02  Yellow     1      10
04-03     Red     1      10
04-05    Blue     2      20

我想为每种颜色(Red、Blue、Yellow)补全所有缺失的日期(从04-01到04-05),并在quant和revenue两列中填入0,输出如下:

date    color quant revenue
04-01     Red     1      10
04-01    Blue     0       0
04-01  Yellow     0       0
04-02     Red     0       0
04-02    Blue     2      20
04-02  Yellow     1      10
04-03     Red     1      10
04-03    Blue     0       0
04-03  Yellow     0       0
04-04     Red     0       0
04-04    Blue     0       0
04-04  Yellow     0       0
04-05     Red     0       0
04-05    Blue     2      20
04-05  Yellow     0       0

3 个答案:

答案 0 :(得分:0)

以下是在R中使用tidyverse的一种方案:先用separate将'date'列拆分为数值型的'day'和'year'两列,再用complete展开数据集,最后用unite将'day'和'year'重新合并为'date'列。

library(tidyverse)

# Split `date` into two integer columns (convert = TRUE), named `day` and
# `year` as in the original answer. Expand to every day/year/color
# combination, with `year` spanning its full min..max range so that gaps
# such as "04-04" appear, and give the newly created rows quant = 0 and
# revenue = 0. Finally zero-pad both parts and join them back into `date`.
df %>%
  separate(date, into = c("day", "year"), convert = TRUE) %>%
  complete(
    day, year = min(year):max(year), color,
    fill = list(quant = 0, revenue = 0)
  ) %>%
  # across() replaces the deprecated mutate_at()/funs() pair.
  mutate(across(c(day, year), ~ sprintf("%02d", .x))) %>%
  unite(date, day, year, sep = "-")
# A tibble: 15 x 4
#   date  color  quant revenue
#   <chr> <fct>  <dbl>   <dbl>
# 1 04-01 Blue       0       0
# 2 04-01 Red        1      10
# 3 04-01 Yellow     0       0
# 4 04-02 Blue       2      20
# 5 04-02 Red        0       0
# 6 04-02 Yellow     1      10
# 7 04-03 Blue       0       0
# 8 04-03 Red        1      10
# 9 04-03 Yellow     0       0
#10 04-04 Blue       0       0
#11 04-04 Red        0       0
#12 04-04 Yellow     0       0
#13 04-05 Blue       2      20
#14 04-05 Red        0       0
#15 04-05 Yellow     0       0

答案 1 :(得分:0)

使用tidyr包中的complete()函数可以轻松完成此操作。例如:

library(dplyr) # mutate() comes from dplyr; tidyr alone does not provide it
library(tidyr)

# Turn `date` into a factor whose levels list every date 04-01..04-05.
# complete() expands over all factor levels, not just the observed ones,
# so dates missing from the data get rows too; those rows are filled with
# quant = 0 and revenue = 0.
df %>%
  mutate(date = factor(date, levels = sprintf("04-%02d", 1:5))) %>%
  complete(date, color, fill = list(quant = 0, revenue = 0))

#    date  color  quant revenue
#    <fct> <fct>  <dbl>   <dbl>
#  1 04-01 Blue       0       0
#  2 04-01 Red        1      10
#  3 04-01 Yellow     0       0
#  4 04-02 Blue       2      20
#  5 04-02 Red        0       0
#  6 04-02 Yellow     1      10
#  7 04-03 Blue       0       0
#  8 04-03 Red        1      10
#  9 04-03 Yellow     0       0
# 10 04-04 Blue       0       0
# 11 04-04 Red        0       0
# 12 04-04 Yellow     0       0
# 13 04-05 Blue       2      20
# 14 04-05 Red        0       0
# 15 04-05 Yellow     0       0

答案 2 :(得分:0)

使用基础R的解决方案,无需显式按日期排序:

# Start from the full date x color grid and left-join the observed rows
# onto it (all.x = TRUE keeps grid rows that have no match in df), then
# turn the resulting NAs in the two measure columns into zeros.
df2 <- merge(
  expand.grid(
    date = paste0("04-0", 1:5),
    color = c("Red", "Blue", "Yellow")
  ),
  df,
  by = c("date", "color"),
  all.x = TRUE
)
df2$quant <- replace(df2$quant, is.na(df2$quant), 0)
df2$revenue <- replace(df2$revenue, is.na(df2$revenue), 0)

df2
#    date  color quant revenue 
# 1  04-01   Blue     0       0 
# 2  04-01    Red     1      10 
# 3  04-01 Yellow     0       0 
# 4  04-02   Blue     2      20 
# 5  04-02    Red     0       0 
# 6  04-02 Yellow     1      10 
# 7  04-03   Blue     0       0 
# 8  04-03    Red     1      10 
# 9  04-03 Yellow     0       0 
# 10 04-04   Blue     0       0 
# 11 04-04    Red     0       0 
# 12 04-04 Yellow     0       0 
# 13 04-05   Blue     2      20 
# 14 04-05    Red     0       0 
# 15 04-05 Yellow     0       0