我有一个数据框,该数据框按年份-月份收集了数据,但有时一个月没有收集任何数据。这样
df <- read.table(textConnection("car,year,month,amount
Mazda,2012,02,2344
Ford,2012,04,235234
Mazda,2012,03,3455
Mazda,2012,04,43554
Mazda,2012,05,9854
Mazda,2012,06,32556
Ford, 2013,01,2345"), sep = ",", header = TRUE)
> df
car year month amount
1 Mazda 2012 2 2344
2 Ford 2012 4 235234
3 Mazda 2012 3 3455
4 Mazda 2012 4 43554
5 Mazda 2012 5 9854
6 Mazda 2012 6 32556
7 Ford 2013 1 2345
我想用这种方式填补空白(给定两个日期,“从” = 2012/01/01,“至” = 2013/12/01):
car year month amount
Mazda 2012 1 0
Ford 2012 1 0
Ford 2012 2 0
Mazda 2012 2 2344
Ford 2012 3 0
Ford 2012 4 235234
Mazda 2012 3 3455
Mazda 2012 4 43554
Mazda 2012 5 9854
Mazda 2012 6 32556
Mazda 2012 7 0
Mazda 2012 8 0
Mazda 2012 9 0
Mazda 2012 10 0
Mazda 2012 11 0
Mazda 2012 12 0
Ford 2013 1 2345
Ford 2013 2 0
Ford 2013 3 0
Ford 2013 4 0
Ford 2013 5 0
Ford 2013 6 0
Ford 2013 7 0
Ford 2013 8 0
Ford 2013 9 0
Ford 2013 10 0
Ford 2013 11 0
Ford 2013 12 0
Mazda 2013 1 0
Mazda 2013 2 0
Mazda 2013 3 0
Mazda 2013 4 0
Mazda 2013 5 0
Mazda 2013 6 0
Mazda 2013 7 0
Mazda 2013 8 0
Mazda 2013 9 0
Mazda 2013 10 0
Mazda 2013 11 0
Mazda 2013 12 0
我的第一个想法是以这种方式生成“日期数据框”作为序列
min.date <- as.Date("2012/01/01")
max.date <- as.Date("2013/12/01")
gen.dates <-
seq(from = mid.date,
to = max.date,
by = "month") %>% as.data.frame()
然后加入JOIN,但是我发现它并不像看起来那样简单,所以我认为有一种更简洁的方法也许是使用dplyr
答案 0 :(得分:3)
扩大我的评论:
read.table(textConnection("car,year,month,amount
Mazda,2012,02,2344
Ford,2012,04,235234
Mazda,2012,03,3455
Mazda,2012,04,43554
Mazda,2012,05,9854
Mazda,2012,06,32556
Ford,2013,01,2345"),
sep = ",", header = TRUE, stringsAsFactors = FALSE) -> xdf
重量级tidyverse
的方式:
dplyr::glimpse(
tidyr::complete(xdf, car = unique(car), year = unique(year), month=1:12, fill=list(amount=0))
)
## Observations: 48
## Variables: 4
## $ car <chr> "Ford", "Ford", "Ford", "Ford", "Ford", "Ford", "Ford", "Ford", "Ford", "Ford", "Ford", "Ford", "For...
## $ year <int> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013...
## $ month <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8...
## $ amount <dbl> 0, 0, 0, 235234, 0, 0, 0, 0, 0, 0, 0, 0, 2345, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2344, 3455, 43554...
所有基本R:
merge(
expand.grid(car = unique(xdf$car), year = unique(xdf$year), month=1:12),
xdf, by = c("car", "year", "month"), all.x = TRUE
) -> xdf
xdf$amount <- ifelse(is.na(xdf$amount), 0, xdf$amount)
dplyr::glimpse(xdf)
## Observations: 48
## Variables: 4
## $ car <fct> Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Mazda, Ma...
## $ year <int> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013...
## $ month <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8...
## $ amount <dbl> 0, 2344, 3455, 43554, 9854, 32556, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23...
头对头:
microbenchmark::microbenchmark(
tidy = tidyr::complete(xdf, car = unique(car), year = unique(year), month=1:12, fill=list(amount=0)),
dplyr = xdf %>%
group_by(car, year) %>%
complete(month = 1:12, fill = list(amount = 0)),
base = {
merge(
expand.grid(car = unique(xdf$car), year = unique(xdf$year), month=1:12),
xdf, by = c("car", "year", "month"), all.x = TRUE
) -> x2
x2$amount <- ifelse(is.na(x2$amount), 0, x2$amount)
}
)
## Unit: microseconds
## expr min lq mean median uq max neval
## tidy 2553.802 3036.262 4233.912 3613.672 5046.737 12219.712 100
## dplyr 5639.261 6851.680 9396.590 7686.171 10273.043 70357.399 100
## base 848.400 1055.845 1593.015 1194.247 1656.759 9594.898 100
您也可以进行扩展(例如,根据您的要求进行扩展):
tidyr::complete(xdf, car = unique(car), year = 2012:2014, month=1:12, fill=list(amount=0))
或
merge(
expand.grid(car = unique(xdf$car), year =2012:2014, month=1:12),
xdf, by = c("car", "year", "month"), all.x = TRUE
) -> x2
x2$amount <- ifelse(is.na(x2$amount), 0, x2$amount)
然后,在其他元数据中分层:
read.table(textConnection("car,year,month,amount
Mazda,2012,02,2344
Ford,2012,04,235234
Mazda,2012,03,3455
Mazda,2012,04,43554
Mazda,2012,05,9854
Mazda,2012,06,32556
Ford,2013,01,2345"),
sep = ",", header = TRUE, stringsAsFactors = FALSE) -> xdf
merge(
expand.grid(car = unique(xdf$car), year =2012:2014, month=1:12),
xdf, by = c("car", "year", "month"), all.x = TRUE
) -> x2
x2$amount <- ifelse(is.na(x2$amount), 0, x2$amount)
data.frame(
car = c("Mazda", "Ford"),
country = c("JP", "US"),
stringsAsFactors = FALSE
) -> car2country_df
merge(x2, car2country_df)
或通过tidyverse
:
tidyr::complete(
xdf, car = unique(car), year = 2012:2014, month=1:12, fill=list(amount=0)
) %>%
dplyr::left_join(car2country_df)
答案 1 :(得分:0)
检出tidyr
软件包和complete
。使用fill = list(amount = 0))
来填写缺失的值,如下面的@markus所示。
library(tidyr)
library(dplyr)
df %>% group_by(car,year) %>% complete(month = 1:12, fill = list(amount = 0))
# A tibble: 48 x 4
# Groups: car, year [4]
# car year month amount
# <fct> <int> <int> <dbl>
# 1 " Ford" 2012 1 0
# 2 " Ford" 2012 2 0
# 3 " Ford" 2012 3 0
# 4 " Ford" 2012 4 235234
# 5 " Ford" 2012 5 0
# 6 " Ford" 2012 6 0
# 7 " Ford" 2012 7 0
# 8 " Ford" 2012 8 0
# 9 " Ford" 2012 9 0
#10 " Ford" 2012 10 0
# ... with 38 more rows