我有以下时间序列数据:
Date duration Volume
1 1-Jul 12am-2am 0.80
2 1-Jul 2am-4am 0.80
3 1-Jul 4am-6am 0.80
4 1-Jul 6am-8am 5.00
5 1-Jul 8am-10am 14.00
6 1-Jul 10am-12pm 3.40
7 1-Jul 12pm-2pm 0.80
8 1-Jul 2pm-4pm 0.80
9 1-Jul 4pm-6pm 2.40
10 1-Jul 6pm-8pm 12.00
11 1-Jul 8pm-10pm 14.00
12 1-Jul 10pm-12am 3.40
13 1-Jul 12am-2am 0.60
14 1-Jul 2am-4am 0.60
15 1-Jul 4am-6am 0.60
16 1-Jul 6am-8am 5.50
17 1-Jul 8am-10am 14.00
18 1-Jul 10am-12pm 4.00
19 1-Jul 12pm-2pm 0.80
20 1-Jul 2pm-4pm 0.65
21 1-Jul 4pm-6pm 6.30
22 1-Jul 6pm-8pm 19.50
23 1-Jul 8pm-10pm 19.45
24 1-Jul 10pm-12am 9.00
我想将“日期”和“持续时间”列转换为R日期格式。另外,是否可以将这两列合并为一个“ date_time”列,以便使用auto.arima()
进行预测。
谢谢
答案 0 :(得分:2)
想挑战自己,偶然发现了这个问题,这是我的解决方案。
#first lets create the sample data
date <- c("1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul")
duration <- c("12am-2am","2am-4am","4am-6am","6am-8am","8am-10am","10am-12pm","12pm-2pm","2pm-4pm","4pm-6pm","6pm-8pm","8pm-10pm","10pm-12am","12am-2am","2am-4am","4am-6am","6am-8am","8am-10am","10am-12pm","12pm-2pm","2pm-4pm","4pm-6pm","6pm-8pm","8pm-10pm","10pm-12am")
volume <- c("0.80","0.80","0.80","5.00","14.00","3.40","0.80","0.80","2.40","12.00","14.00","3.40","0.60","0.60","0.60","5.50","14.00","4.00","0.80","0.65","6.30","19.50","19.45","9.00")
df <- data.frame(date,duration,volume, stringsAsFactors = F)
bla <- t(as.data.frame(sapply(df$duration, strsplit, "-")))
rownames(bla) <- c(1:nrow(bla))
default_year <- "2020"
#seperate the timestamps in start and end times
df <- cbind(df,bla)
#add current year to the date and make it a as.Date
df$date <- as.Date(paste0(default_year,"-",df$date),format='%Y-%d-%b')
#convert "am" and "pm" to 24h mode
df$`1` <- gsub("12am","00:00",df$`1`)
df$`1` <- gsub("am",":00",df$`1`)
df$`1`[grep("pm",df$`1`)] <- paste0(ifelse(as.numeric(gsub("pm","",df$`1`[grep("pm",df$`1`)]))==12,12,as.numeric(gsub("pm","",df$`1`[grep("pm",df$`1`)]))+12),":00")
df$`2` <- gsub("12am","00:00",df$`2`)
df$`2` <- gsub("am",":00",df$`2`)
df$`2`[grep("pm",df$`2`)] <- paste0(ifelse(as.numeric(gsub("pm","",df$`2`[grep("pm",df$`2`)]))==12,12,as.numeric(gsub("pm","",df$`2`[grep("pm",df$`2`)]))+12),":00")
#paste date and time vectors together
df$t_start <- paste0(df$date,"-",df$`1`)
df$t_end <- paste0(df$date,"-",df$`2`)
#make them posix
df$t_start <- as.POSIXct(df$t_start, format='%Y-%d-%m-%H:%M')
df$t_end <- as.POSIXct(df$t_end, format='%Y-%d-%m-%H:%M')-1
#save it
new_df <- data.frame(df$t_start,df$t_end,df$volume)
new_df
df.t_start df.t_end df.volume
1 2020-01-07 00:00:00 2020-01-07 01:59:59 0.80
2 2020-01-07 02:00:00 2020-01-07 03:59:59 0.80
3 2020-01-07 04:00:00 2020-01-07 05:59:59 0.80
4 2020-01-07 06:00:00 2020-01-07 07:59:59 5.00
5 2020-01-07 08:00:00 2020-01-07 09:59:59 14.00
6 2020-01-07 10:00:00 2020-01-07 11:59:59 3.40
7 2020-01-07 12:00:00 2020-01-07 13:59:59 0.80
8 2020-01-07 14:00:00 2020-01-07 15:59:59 0.80
9 2020-01-07 16:00:00 2020-01-07 17:59:59 2.40
10 2020-01-07 18:00:00 2020-01-07 19:59:59 12.00
11 2020-01-07 20:00:00 2020-01-07 21:59:59 14.00
12 2020-01-07 22:00:00 2020-01-06 23:59:59 3.40
13 2020-02-07 00:00:00 2020-02-07 01:59:59 0.60
14 2020-02-07 02:00:00 2020-02-07 03:59:59 0.60
15 2020-02-07 04:00:00 2020-02-07 05:59:59 0.60
16 2020-02-07 06:00:00 2020-02-07 07:59:59 5.50
17 2020-02-07 08:00:00 2020-02-07 09:59:59 14.00
18 2020-02-07 10:00:00 2020-02-07 11:59:59 4.00
19 2020-02-07 12:00:00 2020-02-07 13:59:59 0.80
20 2020-02-07 14:00:00 2020-02-07 15:59:59 0.65
21 2020-02-07 16:00:00 2020-02-07 17:59:59 6.30
22 2020-02-07 18:00:00 2020-02-07 19:59:59 19.50
23 2020-02-07 20:00:00 2020-02-07 21:59:59 19.45
24 2020-02-07 22:00:00 2020-02-06 23:59:59 9.00