我有一组用户的IP地址以及他们访问某个网站的时间。我想计算每次IP地址变更之间的时间差。为了简化这一过程,我已为每一行分配了一个标签,表明该行的IP相对于上一行是否发生了变化,并且这一标记是按用户分别进行的。
示例数据:
user.nm ip.addr.txt login.sessn.ts change.label
b c 2/18/2013 16:08 FALSE
b c 2/18/2013 16:08 FALSE
b c 2/28/2013 13:37 FALSE
b c 2/28/2013 16:10 FALSE
b c 2/28/2013 16:20 FALSE
b c 3/5/2013 9:29 FALSE
b c 3/6/2013 11:42 FALSE
b c 3/11/2013 13:55 FALSE <-
b b 6/25/2013 13:22 TRUE <-
b d 6/25/2013 13:22 FALSE <-
b b 8/12/2013 13:18 TRUE <-
b c 8/12/2013 13:18 FALSE
b c 8/20/2013 15:13 FALSE
b c 8/20/2013 15:13 FALSE
b c 9/23/2013 14:08 FALSE
b c 9/23/2013 14:09 FALSE
b c 9/25/2013 11:00 FALSE
b c 10/18/2013 16:54 FALSE
b c 10/18/2013 16:54 FALSE
b c 10/30/2013 14:33 FALSE
b c 11/8/2013 15:03 FALSE
b c 11/18/2013 11:30 FALSE
b c 11/18/2013 11:33 FALSE
b c 11/20/2013 16:08 FALSE
b c 11/21/2013 11:51 FALSE
b c 11/21/2013 11:52 FALSE
b c 11/21/2013 15:18 FALSE
b c 11/21/2013 16:40 FALSE
b c 11/21/2013 16:44 FALSE
b c 11/21/2013 16:45 FALSE
b c 11/21/2013 16:45 FALSE
b c 11/29/2013 15:41 FALSE
b c 11/29/2013 15:41 FALSE
a a 1/9/2013 15:32 FALSE
a a 1/9/2013 15:32 FALSE
a a 1/9/2013 15:32 FALSE
a a 1/9/2013 15:32 FALSE
a a 1/10/2013 10:39 FALSE
a a 1/10/2013 10:39 FALSE
a a 1/10/2013 10:39 FALSE
a a 1/11/2013 10:31 FALSE
a a 1/11/2013 10:31 FALSE
a a 1/18/2013 12:30 FALSE
a a 2/22/2013 10:54 FALSE <-
a b 3/6/2013 12:27 TRUE <-
dput:
# Reproducible copy of the example data (dput output): a 45-row data.frame with
# one row per login event, 33 rows for user "b" followed by 12 rows for user "a".
#   user.nm        - user identifier (character)
#   ip.addr.txt    - anonymised IP address used for that login (character)
#   login.sessn.ts - login timestamp, POSIXct stored as seconds since the epoch;
#                    tzone = "" so it prints in the session's local time zone
#   change.label   - logical flag supplied with the data; TRUE marks a row whose
#                    IP was labelled as a change relative to the previous row
sample.data=structure(list(user.nm = c("b", "b", "b", "b", "b", "b", "b",
"b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b",
"b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a"),
ip.addr.txt = c("c", "c", "c", "c", "c", "c", "c", "c", "b",
"c", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c",
"c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "b"
), login.sessn.ts = structure(c(1361221680, 1361221680, 1362076620,
1362085800, 1362086400, 1362493740, 1362588120, 1363024500,
1372180920, 1372180920, 1376327880, 1376327880, 1377025980,
1377025980, 1379959680, 1379959740, 1380121200, 1382129640,
1382129640, 1383157980, 1383940980, 1384792200, 1384792380,
1384981680, 1385052660, 1385052720, 1385065080, 1385070000,
1385070240, 1385070300, 1385070300, 1385757660, 1385757660,
1357763520, 1357763520, 1357763520, 1357763520, 1357832340,
1357832340, 1357832340, 1357918260, 1357918260, 1358530200,
1361548440, 1362590820), class = c("POSIXct", "POSIXt"), tzone = ""),
change.label = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, TRUE)), .Names = c("user.nm", "ip.addr.txt",
"login.sessn.ts", "change.label"), row.names = c(NA, -45L), class = "data.frame")
我正在尝试编写一个ddply汇总语句,以便得到每个用户每次IP更改之间的时间差(以及其他统计量)。通常情况下,我只需取出所有标记为TRUE的观测值对数据框取子集,并将其作为ddply的输入数据框。但是,我需要的是相邻两行之间的差值,即一行FALSE后面紧跟一行TRUE的情况。
理想情况下,输出数据框如下所示:
user.nm change count min.change.tme max.change.time
a 2 10 sec 4 hours
b 1 1 hour 1 hour
我希望使用某种索引查找功能,例如`match`,但我不确定如何将其转换为函数。
R中是否有某种“后视”(look-behind)功能可以帮助解决这个问题?
到目前为止,我用来统计IP更改次数的代码运行良好,如下所示:
did.change <- function(vec) {
  # Flags every position where an element differs from the one right before it.
  #
  # Args:
  #   vec: a vector of p comparable values (e.g. IP address strings).
  # Returns:
  #   A logical vector of length p - 1; element i is TRUE when vec[i + 1] is
  #   not a direct repeat of vec[i].
  previous <- head(vec, -1)
  following <- tail(vec, -1)
  previous != following
}
# Absolute gap between each element of `vec` and the element that follows it.
#
# Args:
#   vec: a numeric or date-time (e.g. POSIXct) vector.
# Returns:
#   A vector of length(vec) - 1 non-negative differences (a difftime when `vec`
#   is a date-time); empty when `vec` has fewer than two elements.
#
# NOTE: this works on the ENTIRE list of entries per user, which is too broad
# when only the gaps around IP changes are wanted.
time.changes <- function(vec) {
  # The previous version called head(vec - 1), which subtracted 1 from every
  # value and kept only the first six elements, so the subtraction recycled
  # against a vector of a different length. diff() pairs each element with its
  # successor, which is what was intended.
  abs(diff(vec))
}
# Per-user summary: count the rows whose IP address differs from the row
# directly before it (did.change() is applied within each user.nm group).
user.changes <- ddply(
  .data = sample.data,
  .variables = "user.nm",
  .fun = summarize,
  change.count = sum(did.change(ip.addr.txt))
)
# Additional summary columns, still disabled:
# max.change.time=max(time.changes(login.sessn.ts)),
# min.change.time=min(time.changes(login.sessn.ts)))
答案 0 :(得分:1)
简答:是的,它叫做`diff`!
详细解答:
# Positions i where change.label goes from FALSE (row i) to TRUE (row i + 1):
# the difference of consecutive logical values is 1 only for that transition.
is_diff <- which(diff(sample.data[["change.label"]]) == 1)
# For every transition keep both rows of the pair: the FALSE row and the TRUE
# row right after it.
ss <- unlist(lapply(is_diff, function(i) c(i, i + 1)))
# Show the selected rows.
sample.data[ss, ]
user.nm ip.addr.txt login.sessn.ts change.label
8 b c 2013-03-11 10:55:00 FALSE
9 b b 2013-06-25 10:22:00 TRUE
10 b c 2013-06-25 10:22:00 FALSE
11 b b 2013-08-12 10:18:00 TRUE
44 a a 2013-02-22 07:54:00 FALSE
45 a b 2013-03-06 09:27:00 TRUE
以下是计算登录时间变化的一种方法:
# Pair each FALSE row index with the TRUE row that immediately follows it.
ss_list <- lapply(is_diff, function(i) c(i, i + 1))
# Pull the two login timestamps that belong to each pair.
logins <- lapply(ss_list, function(rows) sample.data[rows, "login.sessn.ts"])
# No longer needed by this snippet; left attached in case later code relies on it.
library(lubridate)
# login.sessn.ts is already POSIXct, so diff() yields the elapsed time directly.
# Re-parsing with ymd_hms() converted the values to text and read them back as
# UTC, which can shift gaps that span a daylight-saving change and can fail on
# values whose printed form omits the time of day.
lapply(logins, diff)
如果您希望按`user.nm`分组计算,请尝试使用`dplyr`:
library(dplyr)
# Time elapsed since the previous selected row, computed separately per user.
sample.data %>%
  # row_number() is safe for zero-row input, unlike 1:nrow(sample.data).
  mutate(rownum = row_number()) %>%
  # Keep only the FALSE/TRUE pairs whose row positions were collected in `ss`.
  filter(rownum %in% ss) %>%
  group_by(user.nm) %>%
  # Fix the units explicitly: plain POSIXct subtraction chooses units per group
  # (seconds for one user, days for another), so the combined column ended up
  # labelling second counts as "days".
  mutate(change = difftime(login.sessn.ts, lag(login.sessn.ts),
                           units = "hours")) %>%
  ungroup()
user.nm ip.addr.txt login.sessn.ts change.label rownum change
1 b c 2013-03-11 10:55:00 FALSE 8 NA days
2 b b 2013-06-25 10:22:00 TRUE 9 9.156420e+06 days
3 b c 2013-06-25 10:22:00 FALSE 10 0.000000e+00 days
4 b b 2013-08-12 10:18:00 TRUE 11 4.146960e+06 days
5 a a 2013-02-22 07:54:00 FALSE 44 NA days
6 a b 2013-03-06 09:27:00 TRUE 45 1.206458e+01 days