Question

我想弄清楚每一次 profile.views（个人资料浏览）发生在哪一次用户访问（visits）期间。用户由 (uid, state) 这一对值唯一标识。数据存储在两个数据框中。

# Sample data: one row per visit and one row per profile view.
# A user is identified by the (uid, state) pair; ts is the event timestamp.
visits <- data.frame(
  id    = 2001:2004,
  uid   = c(1001, 1002, 1001, 1001),
  state = c("CA", "CA", "CA", "MA"),
  ts    = c(51, 52, 53, 54)
)
profile.views <- data.frame(
  id    = 3001:3004,
  uid   = c(1001, 1003, 1002, 1001),
  state = c("CA", "CA", "CA", "CA"),
  ts    = c(51, 57, 59, 59)
)

> visits
    id  uid state ts
1 2001 1001    CA 51
2 2002 1002    CA 52
3 2003 1001    CA 53
4 2004 1001    MA 54

> profile.views
    id  uid state ts
1 3001 1001    CA 51
2 3002 1003    CA 57
3 3003 1002    CA 59
4 3004 1001    CA 59

对于每个 profile.view，我想弄清楚它来自哪一次访问。方法是：在 uid 和 state 都匹配的访问中，找出 ts 小于或等于该 profile.views 行 ts 的最近一次访问。

以下是我想要的结果（以某种形式）：

profile.views[1,] 来自 visits[1,]

profile.views[2,] 并非来自任何访问（这可能是由数据记录错误引起的）

profile.views[3,] 来自 visits[2,]

profile.views[4,] 来自 visits[3,]

有没有人知道这样做的好方法？

Answer 1

在sqldf中使用SQL样式语法：

# For every profile view (alias a), left-join the visits (alias b) that share
# the same uid and state and whose ts is at or before the view's ts, then take
# MAX(b.ts) per view to get the timestamp of the most recent such visit.
# The LEFT OUTER JOIN keeps views that have no matching visit; their visit_ts
# is NULL. The table name is quoted inside the SQL because it contains a dot.
library(sqldf)
sqldf("
SELECT a.id, a.uid, a.state, a.ts, MAX(b.ts) AS visit_ts
FROM \"profile.views\" AS a
LEFT OUTER JOIN visits AS b
ON a.uid = b.uid
AND a.state = b.state
AND a.ts >= b.ts
GROUP BY a.id, a.uid, a.state, a.ts
ORDER BY a.id
")

Answer 2

更快的 data.table 方式，将个人资料浏览（profile views）与访问 ID 匹配：

# Rolling join: for each profile view, find the most recent visit with the
# same uid and state whose ts is at or before the view's ts.
# data.table must be attached, otherwise data.table()/setkey()/... are not found.
library(data.table)

visits <- data.frame(id = 2001:2004, uid = c(1001, 1002, 1001, 1001),
                     state = c("CA", "CA", "CA", "MA"), ts = c(51, 52, 53, 54))
profile.views <- data.frame(id = 3001:3004, uid = c(1001, 1003, 1002, 1001),
                            state = c("CA", "CA", "CA", "CA"),
                            ts = c(51, 57, 59, 59))
visits <- data.table(visits)
profile.views <- data.table(profile.views)
# Key visits on the join columns; ts comes last because the roll is applied
# to the last join column.
setkey(visits, uid, state, ts)
# Reorder profile.views so its first three columns line up with the key
setcolorder(profile.views, c("uid", "state", "ts", "id"))
# Rename to avoid a name collision with the id/ts columns of visits
setnames(profile.views, c("uid", "state", "view.ts", "view.id"))
# Rolling join: roll = TRUE matches each view to the visit row at or before
# its timestamp; views with no earlier visit get NA for the visit id.
visits[profile.views, roll = TRUE]
    # uid state ts   id view.id
# 1: 1001    CA 51 2001    3001
# 2: 1003    CA 57   NA    3002
# 3: 1002    CA 59 2002    3003
# 4: 1001    CA 59 2003    3004

Answer 3

这是一个 data.table 解决方案。有些地方或许还能做得更好，但这是我的第一版尝试。

library(data.table)
visits <- data.table(visits)
profile.views <- data.table(profile.views)
# Give the view's id and ts their own names so they do not clash with the
# id and ts columns of visits once the two tables are joined.
setnames(profile.views, old = c("id", "ts"), new = c("view.id", "view.ts"))
setkey(profile.views, uid, state)
setkey(visits, uid, state)
# Step 1: keyed join of visits onto profile.views by uid and state.
# A view without any matching visit is kept, with NA in the visit columns.
joined <- visits[profile.views]
# Step 2: drop pairs where the visit happened after the view, while keeping
# the unmatched (NA) rows.
eligible <- joined[is.na(ts) | view.ts >= ts]
# Step 3: per view, keep the latest remaining visit timestamp.
latest <- eligible[, .(visit.ts = max(ts)),
                   by = .(view.id, uid, state, view.ts)]
latest[order(view.id)]
#    view.id  uid state view.ts visit.ts
# 1:    3001 1001    CA      51       51
# 2:    3002 1003    CA      57       NA
# 3:    3003 1002    CA      59       52
# 4:    3004 1001    CA      59       53

Answer 4

使用基础R的merge和aggregate：

# Base R approach: match every profile view to the latest visit by the same
# (uid, state) whose timestamp is at or before the view's timestamp.
visits <- data.frame(id = 2001:2004, uid = c(1001, 1002, 1001, 1001),
                     state = c("CA", "CA", "CA", "MA"), ts = c(51, 52, 53, 54))
profile.views <- data.frame(id = 3001:3004, uid = c(1001, 1003, 1002, 1001),
                            state = c("CA", "CA", "CA", "CA"),
                            ts = c(51, 57, 59, 59))

# Merge on uid and state, keeping every profile view (all.y = TRUE).
# Columns suffixed .x come from visits, columns suffixed .y from profile.views.
newdf.merged <- merge(visits, profile.views, by = c("uid", "state"),
                      all.y = TRUE)

# Keep only the pairs where the visit is at or before the view (like WHERE).
# which() discards the NA comparisons produced by unmatched views.
newdf.filter <- newdf.merged[which(newdf.merged$ts.y >= newdf.merged$ts.x), ]

# Per view, keep the single row of the latest eligible visit: sort by view
# with the latest visit first, then take the first row of each view. Picking
# a whole row keeps id.x and ts.x from the same visit.
newdf.filter <- newdf.filter[order(newdf.filter$id.y, -newdf.filter$ts.x), ]
newdf.agg <- newdf.filter[!duplicated(newdf.filter$id.y), ]

# Views without any eligible visit (no visit at all, or only later visits):
# keep one row per view and blank out the visit columns.
newdf.na <- newdf.merged[!(newdf.merged$id.y %in% newdf.agg$id.y), ]
newdf.na <- newdf.na[!duplicated(newdf.na$id.y), ]
newdf.na$id.x <- rep(NA_integer_, nrow(newdf.na))
newdf.na$ts.x <- rep(NA_real_, nrow(newdf.na))

# Combine matched and unmatched views
newdf.final <- rbind(newdf.agg, newdf.na)
# Optional ordering step: one row per profile view, in view id order
newdf.final <- newdf.final[order(newdf.final$id.y), ]

从两个R数据帧之间的动作时间获取访问时间

4 个答案: