我有一个400列的数据框,其中散布着多个日期列。 在下面的代表性示例中,我想实现以下目标:
将所有包含日期的列转换为 POSIXct,无论该列当前是因子、字符还是日期类型。
# Reproducible example: a small data frame mixing numeric, character, factor
# and date-time columns. Dates appear both as POSIXct (B) and as text (H).
set.seed(123)
df1 <- local({
  # Daily timestamps from 2000-01-01 through 2018-01-01 (session time zone);
  # sampled once for B and once more (then rendered as text) for H.
  day_grid <- seq(
    as.POSIXct("2000/01/01"),
    as.POSIXct("2018/01/01"),
    by = "day"
  )
  # "test", "test2", ..., "test10"
  test_labels <- c("test", paste0("test", 2:10))
  data.frame(
    A = as.numeric(seq_len(10)),
    B = sample(day_grid, size = 10),
    C = as.numeric(sample(20:90, size = 10)),
    D = sample(c("yes", "no"), size = 10, replace = TRUE),
    E = as.factor(sample(1000:2000, size = 10)),
    F = test_labels,
    G = as.factor(test_labels),
    H = as.character(sample(day_grid, size = 10)),
    stringsAsFactors = FALSE
  )
})
df1
A B C D E F G H
1 1 2005-03-06 00:00:00 87 no 1963 test test 2002-07-27 23:00:00
2 2 2014-03-11 00:00:00 51 no 1902 test2 test2 2007-06-17 23:00:00
3 3 2007-05-11 23:00:00 66 no 1690 test3 test3 2007-06-11 23:00:00
4 4 2015-11-22 00:00:00 58 no 1793 test4 test4 2006-08-20 23:00:00
5 5 2016-12-02 00:00:00 26 no 1024 test5 test5 2002-09-27 23:00:00
6 6 2000-10-26 00:00:00 79 no 1475 test6 test6 2002-06-30 23:00:00
7 7 2009-06-30 23:00:00 35 no 1754 test7 test7 2004-03-11 00:00:00
8 8 2016-01-19 00:00:00 22 no 1215 test8 test8 2008-05-17 23:00:00
9 9 2009-11-30 00:00:00 40 yes 1315 test9 test9 2004-10-12 00:00:00
10 10 2008-03-17 00:00:00 85 yes 1229 test10 test10 2015-06-03 23:00:00
# Report the class vector of every column; a column carrying several classes
# (such as POSIXct/POSIXt) yields one numbered entry per class.
unlist(lapply(df1, function(column) class(column)))
A B1 B2 C D E F G H
"numeric" "POSIXct" "POSIXt" "numeric" "character" "factor" "character" "factor" "character"
到目前为止,我已经尝试了以下方法,但它既没有保留 POSIXct 列(B 列),也没有将字符型日期列(H 列)转换为 POSIXct:
# Round-trip every column through character and let type.convert() choose
# the simplest type (as.is = TRUE keeps strings as character, not factor).
# across() with a plain function replaces mutate_all(funs(...)), because
# funs() is deprecated in dplyr.
# Note: type.convert() does not detect dates, so date columns come back as
# character.
df1_clean <- df1 %>%
  mutate(across(
    everything(),
    function(col) type.convert(as.character(col), as.is = TRUE)
  ))
unlist(lapply(df1_clean, class))
A B C D E F G H
"integer" "character" "integer" "character" "integer" "character" "character" "character"
对于这个小数据集,我可以逐个指定列,用 lubridate 将 B 和 H 转换为 POSIXct,但我希望能自动对整个数据框完成这一转换。
任何帮助将不胜感激! 谢谢 萌
答案 0 :(得分:0)
它可能不是最优雅的方式 - 但它似乎对我有用。
#install.packages("tidyverse")
#install.packages("dataCompareR")
library("tidyverse")
library("dataCompareR")
# Build a reproducible example data frame; all date-times are pinned to UTC.
set.seed(123)
df1 <- local({
  # One timestamp per day from 2000-01-01 through 2018-01-01 (UTC), sampled
  # for B and, rendered as text, for H.
  utc_days <- seq(
    as.POSIXct("2000/01/01", tz = "UTC"),
    as.POSIXct("2018/01/01", tz = "UTC"),
    by = "day"
  )
  # "test", "test2", ..., "test10"
  test_labels <- c("test", paste0("test", 2:10))
  data.frame(
    A = as.numeric(seq_len(10)),
    B = sample(utc_days, size = 10),
    C = as.numeric(sample(20:90, size = 10)),
    D = sample(c("yes", "no"), size = 10, replace = TRUE),
    E = as.factor(sample(1000:2000, size = 10)),
    F = test_labels,
    G = as.factor(test_labels),
    H = as.character(sample(utc_days, size = 10)),
    stringsAsFactors = FALSE
  )
})
df1 # print the data frame
unlist(lapply(df1, class)) # class(es) of each column
# Reassign column classes: every column is first turned into character and
# then passed to type.convert(), which picks the simplest matching type
# (as.is = TRUE keeps strings as character instead of factor).
# across() with a plain function replaces mutate_all(funs(...)), because
# funs() is deprecated in dplyr.
df1_clean <- df1 %>%
  mutate(across(
    everything(),
    function(col) type.convert(as.character(col), as.is = TRUE)
  ))
unlist(lapply(df1_clean, class)) # look at df classes now
# Check if a column is a date -
# https://stackoverflow.com/questions/18178451/is-there-a-way-to-check-if-a-column-is-a-date-in-r
# A column is flagged when at least one of its values parses with the
# "%Y-%m-%d" format. vapply() guarantees a logical vector (one entry per
# column), even when the data frame has no columns.
tmp <- vapply(
  df1_clean,
  function(x) {
    !all(is.na(as.Date(as.character(x), format = "%Y-%m-%d", tz = "UTC")))
  },
  logical(1)
)
# Convert only the flagged columns to POSIXct (UTC). which() returns an empty
# index when nothing is flagged, so the loop is skipped rather than failing
# the way 1:ncol() does on a zero-column data frame. [[i]] extracts the column
# as a vector for both data frames and tibbles.
for (i in which(tmp)) {
  df1_clean[[i]] <- as.POSIXct(df1_clean[[i]], tz = "UTC")
}
df1_clean # look at df
unlist(lapply(df1_clean, class)) # look at df classes
# Compare the data frame before (df1) and after (df1_clean) the conversion
# using rCompare() from the dataCompareR package, then summarise the result.
comp <- rCompare(df1, df1_clean) # compare the data frames before and after the conversion
summary(comp) # print the comparison summary