我有一个看起来像这样的数据框:
可复制数据:
# Sample data in dput() form: a 20-row data.frame with three columns.
#   User  - factor with the single level "Jibran"
#   Event - factor with levels "IN" and "OUT"
#   Time  - character timestamps written as "mm/dd/yyyy HH:MM"
# Note that row 11 ("04/17/2015 10:32") is out of chronological order.
structure(list(User = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Jibran", class = "factor"),
Event = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("IN",
"OUT"), class = "factor"), Time = c("04/15/2015 00:31", "04/16/2015 20:10",
"04/21/2015 14:59", "04/22/2015 01:01", "04/22/2015 10:46",
"04/23/2015 00:58", "04/23/2015 14:50", "04/24/2015 01:37",
"04/25/2015 01:01", "04/27/2015 00:57", "04/17/2015 10:32",
"04/29/2015 15:03", "05/01/2015 00:44", "05/02/2015 01:19",
"05/02/2015 15:08", "05/03/2015 01:08", "05/03/2015 15:06",
"05/04/2015 01:01", "05/04/2015 15:11", "05/05/2015 01:08"
)), row.names = c(NA, -20L), class = "data.frame")
我要查找的是事件发生变化处的记录, 例如:
User | Event | Time
Jibran | IN | 4/21/2015 14:59
Jibran | OUT | 4/22/2015 1:01
.. ..
也就是下一个事件与当前事件不同时的记录(仅当 IN 之后紧跟 OUT 时才取值)。
我解决此问题的方法是:
# Attempt to copy every IN time, together with the OUT time that directly
# follows it, into a Login/Logout table.
x = read.csv("TimeLog2.csv",header=TRUE)
# Result frame with three columns but zero rows.
df <- data.frame(matrix(ncol = 3, nrow = 0))
names(df)[1]<-paste("UserName")
names(df)[2]<-paste("Login")
names(df)[3]<-paste("Logout")
# NOTE(review): 1:length(...) yields c(1, 0) for empty input, and on the last
# iteration x$Event[[i+1]] indexes one past the end whenever the last event
# is 'IN' (the && only short-circuits when the first test is FALSE).
for(i in 1:length(x$Event))
{
if(x$Event[[i]]== 'IN' && x$Event[[i+1]]== 'OUT'){
# NOTE(review): df has zero rows, so writing element i of a column produces a
# vector longer than the frame; `$<-.data.frame` rejects it with
# "replacement has ... rows, data has 0". UserName is never filled in.
df$Login[[i]]<-(x$Time[[i]])
df$Logout[[i]]<-(x$Time[[i+1]])
}
}
哪个返回:
Error in `$<-.data.frame`(`*tmp*`, "Login", value = c(NA, NA, 4L)) :
  replacement has 3 rows, data has 0
(错误: 替换数据有 3 行, 而数据框有 0 行)
所需的输出应类似于:
需要注意的是: 只有当事件变化发生在同一天或次日(按日期计算)时, 才应将该事件变化写入新的数据框, 以便得到准确的登录/注销时间。
答案 0 :(得分:1)
数据
# Sample data in dput() form: the same 20 rows as in the question, but here
# Time is stored as a factor (codes pointing into the level vector), and its
# levels mix zero-padded ("05/01/2015 00:44") and unpadded ("4/15/2015 0:31")
# month/hour fields.
df = structure(list(User = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Jibran", class = "factor"),
Event = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("IN",
"OUT"), class = "factor"), Time = structure(c(9L, 10L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 11L, 20L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L), .Label = c("05/01/2015 00:44", "05/02/2015 01:19",
"05/02/2015 15:08", "05/03/2015 01:08", "05/03/2015 15:06",
"05/04/2015 01:01", "05/04/2015 15:11", "05/05/2015 01:08",
"4/15/2015 0:31", "4/16/2015 20:10", "4/17/2015 10:32", "4/21/2015 14:59",
"4/22/2015 1:01", "4/22/2015 10:46", "4/23/2015 0:58", "4/23/2015 14:50",
"4/24/2015 1:37", "4/25/2015 1:01", "4/27/2015 0:57", "4/29/2015 15:03"
), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
解决方案
library(dplyr)
library(tidyverse)
library(data.table) # provides rleid()
library(lubridate) # provides mdy_hm(); only attached by tidyverse >= 2.0.0

# Pair each login (IN) with the logout (OUT) that follows it and keep only
# the pairs whose logout falls on the same or the next day.
# Expects `df` with columns User, Event ("IN"/"OUT") and Time
# ("m/d/Y H:M" text, possibly stored as a factor).
df %>%
  mutate(Time = mdy_hm(as.character(Time))) %>% # parse text/factor to date-time
  group_by(id = rleid(Event)) %>% # one group per run of identical events
  filter((Event == "IN" & Time == max(Time)) | # keep the latest IN of a run
    (Event == "OUT" & Time == min(Time))) %>% # keep the earliest OUT of a run
  ungroup() %>% # forget the grouping
  mutate(id = cumsum(Event == "IN")) %>% # each IN opens a new IN/OUT pair
  pivot_wider(names_from = Event, values_from = Time) %>% # one row per pair
  filter(ceiling(difftime(OUT, IN, units = "days")) < 2) %>% # drop 2+ day gaps
  select(User, IN, OUT) # fixed column order; drops the helper id
# # A tibble: 6 x 3
# User IN OUT
# <fct> <dttm> <dttm>
# 1 Jibran 2015-04-21 14:59:00 2015-04-22 01:01:00
# 2 Jibran 2015-04-22 10:46:00 2015-04-23 00:58:00
# 3 Jibran 2015-04-23 14:50:00 2015-04-24 01:37:00
# 4 Jibran 2015-05-02 15:08:00 2015-05-03 01:08:00
# 5 Jibran 2015-05-03 15:06:00 2015-05-04 01:01:00
# 6 Jibran 2015-05-04 15:11:00 2015-05-05 01:08:00
答案 1 :(得分:1)
# df1 <-
#   read.csv2("TimeLog2.csv", sep = ",")[, 1:3]
# NOTE(review): df1 is assumed to hold the columns User, Event and Time of the
# sample data; its definition is only sketched in the commented line above.
library(data.table)

setDT(df1)

# Calendar date of every event (the time of day is dropped on purpose, so the
# "same or next day" rule is applied by date). Plain function call instead of
# %>%, which is not available when only data.table is attached.
df1[, Time2 := as.Date(as.character(Time), format = "%m/%d/%Y")]

# Group id: every OUT closes a group, so a group is the run of events up to
# and including one OUT (the shift moves the increment to the row after it).
df1[, grp := shift(cumsum(Event == "OUT"), n = 1L, fill = 0)]

# Within a group, flag the rows that lie less than two days before the
# group's last row; the last row itself is always flagged FALSE.
df1[, dataDiff := c(Time2[-.N] - Time2[.N] > -2, FALSE), by = grp]

# Keep each group from its first flagged row onwards; groups without any
# flagged row disappear entirely.
df1 <- df1[, .SD[as.logical(cumsum(dataDiff))], by = grp]
df1[, c("dataDiff", "Time2") := NULL]

# One row per group with the IN and OUT times side by side.
dcast(df1, User + grp ~ Event, value.var = "Time")[, grp := NULL][]
结果:
# User IN OUT
#1: Jibran 4/21/2015 14:59 4/22/2015 1:01
#2: Jibran 4/22/2015 10:46 4/23/2015 0:58
#3: Jibran 4/23/2015 14:50 4/24/2015 1:37
#4: Jibran 5/2/2015 15:08 5/3/2015 1:08
#5: Jibran 5/3/2015 15:06 5/4/2015 1:01
#6: Jibran 5/4/2015 15:11 5/5/2015 1:08