解析R中的时间并提取符合特定条件的元素

时间:2018-02-11 05:01:05

标签: r datetime

我有一个包含2列的大型数据框的子集,看起来像这样

c1                      c2
gym1               Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM
gym2               Wed:7:00 AM-4:00 PM
gym3               Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM
gym4               Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM
gym5               Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM

c1包含健身房名称,c2以字符串形式包含星期几以及当天的营业时间。

我该如何解析c2,从而:1. 找出哪些健身房的开放时间超过x小时;2. 找出哪些健身房在上午9点之后才开门?我猜需要在数据框的末尾添加两列,其值为TRUE或FALSE,但我不知道该如何做到这一点。

非常感谢任何帮助或指导。谢谢。

3 个答案:

答案 0 :(得分:0)

以下示例非常冗长,当然可以简化。但是,我相信它符合您的要求。

它假定您的日期都符合以下(格式不一致)的方式。

它返回一个更大的数据框,然后提供:

每天的开放时间
 每天关闭时间
 每天健身房开放的持续时间
 如果健身房在特定日期开放超过x小时
 最后,它确定健身房是否在任何一天于上午9点或之后开门
 以及健身房是否在任何一天的开放时间达到或超过x小时

# Sample data from the question: one row per gym (c1) with its raw,
# inconsistently spaced opening-hours text (c2), kept exactly as posted.
df <- data.frame(
    c1 = paste0("gym", 1:5),
    c2 = c(
        "Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM",
        "Wed:7:00 AM-4:00 PM",
        "Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM",
        "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM",
        "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM"
    )
)

# Normalise the schedule text so every clock time is exactly seven characters
# ("hh:mmam" / "hh:mmpm"); the fixed-offset extraction later on relies on it.
df$c2 <- local({
    schedule <- gsub(" +","",df$c2)                    # drop all spaces
    schedule <- gsub(":([1-9]):",":0\\1:",schedule)    # pad hour after "day:"
    schedule <- gsub("-([1-9]):","-0\\1:",schedule)    # pad hour after "-"
    tolower(schedule)                                   # lower-case days and am/pm
})

library(stringr)

# Threshold in hours for the per-day "open for at least x hours" flag
x <- 7

# For every weekday add five columns to df:
#   <day>_open, <day>_closed            clock times as "hh:mmam"/"hh:mmpm" text
#   <day>_duration                      hours between opening and closing
#   <day>_open_greater_than_x_hours     TRUE when duration >= x, otherwise FALSE
#   <day>_open_after_9am                TRUE/FALSE, NA when the day is not listed
# Relies on df$c2 being space-free, lower-case and zero-padded, so that the
# opening time sits 2..8 and the closing time 10..16 characters after the end
# of the day label.
for(i in c("mon","tue","wed","thu","fri","sat","sun")) {
    open_col     <- paste0(i,"_open")
    closed_col   <- paste0(i,"_closed")
    duration_col <- paste0(i,"_duration")
    long_col     <- paste0(i,"_open_greater_than_x_hours")
    after9_col   <- paste0(i,"_open_after_9am")

    # Index of the last character of the day label in each row (NA if absent).
    # Kept in a local vector rather than a temporary df$end column so that an
    # existing column named "end" is never overwritten.
    day_end <- stringr::str_locate(df$c2,i)[,"end"]
    listed  <- !is.na(day_end)

    df[,open_col] <- NA
    df[,closed_col] <- NA
    df[listed,open_col] <- stringr::str_sub(df$c2[listed],day_end[listed]+2,day_end[listed]+8)
    df[listed,closed_col] <- stringr::str_sub(df$c2[listed],day_end[listed]+10,day_end[listed]+16)

    # Parse on a fixed UTC day: without tz, strptime uses today's date in the
    # local time zone and the difference is off by an hour on a DST change day.
    opens_at  <- strptime(df[,open_col], "%I:%M%p", tz = "UTC")
    closes_at <- strptime(df[,closed_col], "%I:%M%p", tz = "UTC")
    hours_open <- as.numeric(difftime(closes_at, opens_at, units='hours'))

    # A closing time earlier than the opening time means the gym closes after
    # midnight, so wrap the negative difference into the next day.
    overnight <- !is.na(hours_open) & hours_open < 0
    hours_open[overnight] <- hours_open[overnight] + 24
    df[,duration_col] <- hours_open

    # open for at least x hours? (FALSE, not NA, on days the gym is not listed)
    df[,long_col] <- !is.na(hours_open) & hours_open >= x

    # opens at or after 9 am? (NA on days the gym is not listed)
    df[,after9_col] <- opens_at >= strptime("09:00am", "%I:%M%p", tz = "UTC")
}

# Summarise the per-day flags into one flag per gym. rowSums() counts the days
# on which a flag is TRUE (NA = day not listed, ignored via na.rm), so "at
# least one day" is a count greater than 0; the previous "> 1" required two
# qualifying days and missed gyms that are open on a single day only.
# The patterns are anchored to the per-day column names so that the summary
# columns created here are never counted if this code is run again.

# Determine if a gym opens after (or at) 9am on at least one day
df$any_day_open_after_9am <- rowSums(df[,grepl("^[a-z]{3}_open_after_9am$",names(df)),drop = FALSE],na.rm = TRUE) > 0

# Determine if a gym is open for at least x hours on at least one day
df$open_greater_than_x_hours <- rowSums(df[,grepl("^[a-z]{3}_open_greater_than_x_hours$",names(df)),drop = FALSE],na.rm = TRUE) > 0

答案 1 :(得分:0)

第一步是将数据转换为更整洁的格式。

`strsplit` 将各天的营业时间拆分为由字符向量组成的列表列。然后用 `unnest` 展开,使每一天占一行。

然后更多的清洁。

library("lubridate")
library("tidyverse")

df <- read_table("c1                      c2
gym1               Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM
gym2               Wed:7:00 AM-4:00 PM
gym3               Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM
gym4               Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM
gym5               Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM
")

df_tidied <- df %>%
  mutate(c2 = strsplit(c2, ";")) %>%
  unnest %>%
  separate(c2, c("open_day", "times"), sep = ":", extra = "merge") %>%
  mutate(times = gsub(" ", "", times)) %>%
  separate(times, c("open_time", "close_time"), sep = "-") %>%
  mutate(
    open_time = parse_date_time(open_time, "%I:%M%p"),
    close_time = coalesce(
      parse_date_time(close_time, "%I:%M%p"),
      open_time + hours(1)),
    opening_hours = close_time - open_time)

df_tidied
#> # A tibble: 10 x 5
#>    c1    open_day open_time           close_time          opening_hours
#>    <chr> <chr>    <dttm>              <dttm>              <time>
#>  1 gym1  Thu      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5
#>  2 gym1  Fri      0000-01-01 08:00:00 0000-01-01 21:00:00 13
#>  3 gym2  Wed      0000-01-01 07:00:00 0000-01-01 16:00:00 9
#>  4 gym3  Mon      0000-01-01 12:00:00 0000-01-01 18:00:00 6
#>  5 gym3  Tue      0000-01-01 12:00:00 0000-01-01 19:00:00 7
#>  6 gym3  Wed      0000-01-01 10:00:00 0000-01-01 11:00:00 1
#>  7 gym4  Sat      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5
#>  8 gym4  Sun      0000-01-01 08:00:00 0000-01-01 21:00:00 13
#>  9 gym5  Sat      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5
#> 10 gym5  Sun      0000-01-01 08:00:00 0000-01-01 12:00:00 4

一旦数据整洁,只需按 `c1` 对 `opening_hours` 进行 `tally` 即可。

答案 2 :(得分:0)

# Split each gym's schedule on ";" into one row per day, break every entry
# into day / open_time / close_time, and compute the opening duration.
# The duration is taken with difftime() in explicit hours on a fixed UTC day:
# plain subtraction picks its units automatically (they switch to minutes when
# any span is under an hour), and parsing in the local time zone makes the
# result an hour off on a DST change day.
df=dat%>%
    tidytext::unnest_tokens(word, c2, token = strsplit, split = ";")%>%
    separate(word,c("day","open_time","close_time"),"(?<=[a-z]):|-")%>%
    mutate(duration=difftime(strptime(close_time,"%I:%M %p",tz="UTC"),
                             strptime(open_time,"%I:%M %p",tz="UTC"),
                             units="hours"))
df
   c1 day open_time close_time   duration
1 gym1 thu  8:00 am    10:30 pm 14.5 hours
2 gym1 fri  8:00 am     9:00 pm 13.0 hours
3 gym2 wed   7:00 am    4:00 pm  9.0 hours
4 gym3 mon 12:00 pm     6:00 pm  6.0 hours
5 gym3 tue 12:00 pm     7:00 pm  7.0 hours
6 gym3 wed 10:00 am    10:00 pm 12.0 hours
7 gym4 sat  8:00 am    10:30 pm 14.5 hours
8 gym4 sun  8:00 am     9:00 pm 13.0 hours

# Per gym: flag rows whose opening time is later than 09:00 and add the total
# hours open across all listed days.
# The opening time is compared as a 24-hour "HHMM" number. The 12-hour "%I"
# field used before cannot tell am from pm (1:00 pm gave 1 > 9 = FALSE,
# 12:00 am gave 12 > 9 = TRUE) and dropped the minutes (9:30 am gave FALSE).
df%>%group_by(c1)%>%
     mutate(openafter9=as.numeric(format(strptime(open_time,"%I:%M %p"),"%H%M"))>900,
             Tot_Hrs_opn=sum(duration))##You can decide to use summarize but remember opening hour may depend on the day so you need to be careful
# A tibble: 8 x 7
# Groups:   c1 [4]
     c1   day open_time close_time   duration openafter9 Tot_Hrs_opn
  <chr> <chr>     <chr>      <chr>     <time>      <lgl>      <time>
1  gym1   thu  8:00 am    10:30 pm 14.5 hours      FALSE  27.5 hours
2  gym1   fri  8:00 am     9:00 pm 13.0 hours      FALSE  27.5 hours
3  gym2   wed   7:00 am    4:00 pm  9.0 hours      FALSE   9.0 hours
4  gym3   mon 12:00 pm     6:00 pm  6.0 hours       TRUE  25.0 hours
5  gym3   tue 12:00 pm     7:00 pm  7.0 hours       TRUE  25.0 hours
6  gym3   wed 10:00 am    10:00 pm 12.0 hours       TRUE  25.0 hours
7  gym4   sat  8:00 am    10:30 pm 14.5 hours      FALSE  27.5 hours
8  gym4   sun  8:00 am     9:00 pm 13.0 hours      FALSE  27.5 hours