我有一个大数据集,下面给出了一个样本:
# Sample of daily OHLCV quotes for four tickers over two dates.
# NAB has no row for 2015-10-27; that is the gap the question wants filled.
df <- data.frame(
  Date = rep(c("2015-10-26", "2015-10-27"), times = c(4L, 3L)),
  Ticker = c("ANZ", "CBA", "NAB", "WBC", "ANZ", "CBA", "WBC"),
  Open = c(29.11, 77.89, 32.69, 31.87, 29.05, 77.61, 31.84),
  High = c(29.17, 77.93, 32.76, 31.92, 29.08, 78.1, 31.95),
  Low = c(28.89, 77.37, 32.42, 31.71, 28.9, 77.54, 31.65),
  Close = c(28.9, 77.5, 32.42, 31.84, 28.94, 77.74, 31.77),
  Volume = c(
    6350170L, 2251288L, 3804239L, 5597684L,
    5925519L, 2424679L, 5448863L
  ),
  # Keep Date and Ticker as character columns on R < 4.0 as well.
  stringsAsFactors = FALSE
)
对于缺失的日期,我希望沿用该股票最近一次的记录(即把上一条观测值重复一遍),例如:
Date Ticker Open High Low Close Volume
2 2015-10-27 NAB 32.69 32.76 32.42 32.42 3804239
有什么办法可以做到这一点吗?
我已尝试过 gather + spread。
答案 0 :(得分:4)
不妨试试类似下面的思路?
我假设如果某个 Ticker / 日期组合不存在,你想创建该组合并对其做 LOCF(last observation carried forward,即用最近一次观测值向前填充)。这就是 expand.grid 的作用。
答案 1 :(得分:3)
tidyr::complete 和 tidyr::fill 正是为这种情况而设计的:
library(tidyverse)

# Add a row for every Date/Ticker combination that is absent from df, then
# carry each ticker's most recent values forward into those new rows.
df %>%
  complete(Date, Ticker) %>%
  # Order chronologically within each ticker so "previous row" means
  # "previous date of the same ticker".
  arrange(Ticker, Date) %>%
  # Fill per ticker: without grouping, a ticker that has no row on the
  # earliest date would inherit the values of the ticker sorted before it.
  group_by(Ticker) %>%
  fill(Open:Volume) %>%
  ungroup() %>%
  arrange(Date, Ticker)
#
# # A tibble: 8 x 7
# Date Ticker Open High Low Close Volume
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
# 1 2015-10-26 ANZ 29.11 29.17 28.89 28.90 6350170
# 2 2015-10-26 CBA 77.89 77.93 77.37 77.50 2251288
# 3 2015-10-26 NAB 32.69 32.76 32.42 32.42 3804239
# 4 2015-10-26 WBC 31.87 31.92 31.71 31.84 5597684
# 5 2015-10-27 ANZ 29.05 29.08 28.90 28.94 5925519
# 6 2015-10-27 CBA 77.61 78.10 77.54 77.74 2424679
# 7 2015-10-27 NAB 32.69 32.76 32.42 32.42 3804239
# 8 2015-10-27 WBC 31.84 31.95 31.65 31.77 5448863
答案 2 :(得分:1)
另一个可能的解决方案(注意:我必须将您的日期向量转换为 Date 类型,但可以在最终输出中再转换回字符串):
library(tidyr)
library(dplyr)

# Sample daily quotes. Date is stored as a Date so that "earlier than"
# comparisons between rows are chronological.
df <- data.frame(
  stringsAsFactors = FALSE,
  Date = as.Date(c(
    "2015-10-26", "2015-10-26", "2015-10-26", "2015-10-26",
    "2015-10-27", "2015-10-27", "2015-10-27"
  )),
  Ticker = c("ANZ", "CBA", "NAB", "WBC", "ANZ", "CBA", "WBC"),
  Open = c(29.11, 77.89, 32.69, 31.87, 29.05, 77.61, 31.84),
  High = c(29.17, 77.93, 32.76, 31.92, 29.08, 78.1, 31.95),
  Low = c(28.89, 77.37, 32.42, 31.71, 28.9, 77.54, 31.65),
  Close = c(28.9, 77.5, 32.42, 31.84, 28.94, 77.74, 31.77),
  Volume = c(
    6350170L, 2251288L, 3804239L, 5597684L,
    5925519L, 2424679L, 5448863L
  )
)

# Every Date/Ticker pair that should be present. stringsAsFactors = FALSE
# keeps Ticker a character column so it joins against df without coercion.
tickers <- unique(df$Ticker)
dates <- unique(as.Date(df$Date))
possibilities <- expand.grid(
  Date = dates,
  Ticker = tickers,
  stringsAsFactors = FALSE
)

# Pairs that have no row in df.
missing <- anti_join(possibilities, df, by = c("Date", "Ticker"))

# For each missing pair, locate the row of df holding the most recent earlier
# observation of the same ticker. Looking up "latest earlier date" instead of
# "date - 1" also works across weekends and multi-day gaps. The index is NA
# when the ticker has no earlier row, which yields NA values for that pair.
value_cols <- setdiff(names(df), c("Date", "Ticker"))
last_seen <- vapply(
  seq_len(nrow(missing)),
  function(i) {
    earlier <- which(
      df$Ticker == missing$Ticker[i] & df$Date < missing$Date[i]
    )
    if (length(earlier) == 0L) {
      return(NA_integer_)
    }
    earlier[which.max(as.numeric(df$Date[earlier]))]
  },
  integer(1)
)

# Built unconditionally so the final rbind() also works when nothing is
# missing (replacement is then a zero-row data frame with matching columns).
replacement <- data.frame(
  missing,
  df[last_seen, value_cols, drop = FALSE],
  row.names = NULL,
  stringsAsFactors = FALSE
)
# Alias kept for code that referred to the earlier variable name.
missing_filled <- replacement

final <- arrange(rbind(df, replacement), Date, Ticker)