在一定条件下提取最近的条目

时间:2012-05-10 12:04:40

标签: r indexing split plyr data.table

我正在努力完成以下工作:

示例数据集:

   belongID   uniqID   Time   Rating  
   1           101       5      0  
   1           102       4      0  
   2           103       4      0  
   2           104       3      0  
   2           105       2      5
   3           106       4      2  
   3           107       5      0  
   3           108       5      1 

问题是: 我想提取每个belongsID的最新条目(时间的最大值),除非此评级为0.如果最近条目的评级为0。我希望第一个条目具有评级(不是最高评级,只是第一个评级为非零的值)。如果所有其他条目也为零,则需要选择最近的条目。

最终结果应该是:

   belongID   uniqID   Time   Rating  
   1           101       5      0  
   2           105       2      5
   3           108       5      1  

数据集非常大,按belongsID排序。它不是按时间排序的,因此更新的条目可能会在具有相同属性ID的旧条目之后出现。

没有“0 Rating”约束,我使用以下函数计算最近的条目:

>uniqueMax <- function(m, belongID = 1, time = 3) {
    t(
      vapply(
         split(1:nrow(m), m[,belongID]), 
         function(i, x, time) x[i, , drop=FALSE][which.max(x[i,time]),], m[1,], x=m, time=time
      )
    )
 }

我不知道如何纳入“0评级”限制。

编辑:后续问题:

是否有人知道getRating函数如果不仅仅被评为零,如果需要考虑更多的评分(例如0,1,4和5)?因此,除非评级为0或1或4或5,否则分配给最近的?如果评级为0,1,4,5,则分配给具有不同评级的最近一个条目。如果所有等级都是0,1,4或5分配给最近的那些。我尝试了以下方法,但这不起作用:

getRating <- function(x){
  iszero <- x$Rating == 0 | x$Rating == 1 | x$Rating == 4 | x$Rating ==5
  if(all(iszero)){
    id <- which.max(x$Time)
  } else {
    id <- which.max((!iszero)*x$Time) 
            # This trick guarantees taking 0 into account
  }
  x[id,]
}
# Do this over the complete data frame
do.call(rbind,lapply(split(Data,Data$belongID),getRating)) 
     # edited per Tyler's suggestion'

4 个答案:

答案 0 :(得分:4)

这是我对它的破解(有趣的问题):

阅读您的数据:

m <- read.table(text="belongID   uniqID   Time   Rating  
   1           101       5      0  
   1           102       4      0  
   2           103       4      0  
   2           104       3      0  
   2           105       2      5
   3           106       4      2  
   3           107       5      0  
   3           108       5      1 ", header=T)

提取您要求的行:

m2 <- m[order(m$belongID, -m$Time), ]                 #Order to get max time first
LIST <- split(m2, m$belongID)                         #split by belongID
FUN <- function(x) which(cumsum(x[, 'Rating'])!=0)[1] #find first non zero Rating
LIST2 <- lapply(LIST, function(x){                    #apply FUN; if NA do 1st row
        if (is.na(FUN(x))) {
            x[1, ]
        } else {
            x[FUN(x), ]
        }
    }
)
do.call('rbind', LIST2)                              #put it all back together

哪个收益:

  belongID uniqID Time Rating
1        1    101    5      0
2        2    105    2      5
3        3    108    5      1

修改 有这么多人回答这个问题(解决恕我直言的乐趣),它要求进行微基准测试(Windows 7):

Unit: milliseconds
    expr       min        lq    median        uq      max
1   JIGR  6.356293  6.656752  7.024161  8.697213 179.0884
2 JORRIS  2.932741  3.031416  3.153420  3.552554 246.9604
3  PETER 10.851046 11.459896 12.358939 17.164881 216.7284
4  TYLER  2.864625  2.961667  3.066174  3.413289 221.1569

图表:

enter image description here

答案 1 :(得分:3)

这是一个使用data.table的解决方案,以便于为每个getRecentRow分别过滤和执行我的函数belongID

library(data.table)

# Load the data from the example.
dat = structure(list(belongID = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), 
          uniqID = 101:108, Time = c(5L, 4L, 4L, 3L, 2L, 4L, 5L, 5L),
          Rating = c(0L, 0L, 0L, 0L, 5L, 2L, 0L, 1L)), 
          .Names = c("belongID", "uniqID", "Time", "Rating"),
          row.names = c(NA, -8L), class = c("data.table", "data.frame"))

dat = data.table(dat) # Convert to data table.

# Function to get the row for a given belongID
getRecentRow <- function(data) {
    # Filter by Rating, then order by time, then select first.
    row = data[Rating != 0][order(-Time)][1]

    if(!is.na(row$uniqID)) {
        # A row was found with Rating != 0, return it.
        return(row)
     } else {
          # The row was blank, so filter again without restricting. rating.
          return(data[order(-Time)][1])
        }  
}

# Run getRecentRow on each chunk of dat with a given belongID
result = dat[,getRecentRow(.SD), by=belongID]

     belongID uniqID Time Rating
[1,]        1    101    5      0
[2,]        2    105    2      5
[3,]        3    108    5      1

答案 2 :(得分:3)

一个建议是:

library(plyr)

maxV <- function(b) {
    if (b[which.max(b$Time), "Rating"]  != 0) {
        return(b[which.max(b$Time), ])
    } else if (!all(b$Rating==0)) {
        bb <- b[order(b$Rating), ]
        return(bb[bb$Rating != 0,][1, ])
    } else {
        return(b[which.max(b$Time),])
    }
}

a <- read.table(textConnection(" belongID   uniqID   Time   Rating  
   1           101       5      0  
   1           102       4      0  
   2           103       4      0  
   2           104       3      0  
   2           105       2      5
   3           106       4      2  
   3           107       5      0  
   3           108       5      1 "), header=T)

ddply(a, .(belongID), maxV)
  belongID uniqID Time Rating
1        1    101    5      0
2        2    105    2      5
3        3    108    5      1

答案 3 :(得分:3)

编辑:

由于速度是您的主要考虑因素,我将我的技巧编辑到您的初始解决方案中,结果如下:

uniqueMax <- function(m, belongID = 1, time = 3) {
  t(
    vapply(
      split(1:nrow(m), m[,belongID]), 
      function(i, x, time){ 
        is.zero <- x[i,'Rating'] == 0
        if(all(is.zero)) is.zero <- FALSE
        x[i, , drop=FALSE][which.max(x[i,time]*(!is.zero)),]
      }
      , m[1,], x=m, time=time
      )
    )
}

我的原始解决方案,比前一个解决方案更容易阅读:

# Get the rating per belongID
getRating <- function(x){
  iszero <- x$Rating == 0
  if(all(iszero)){
    id <- which.max(x$Time)
  } else {
    id <- which.max((!iszero)*x$Time) 
            # This trick guarantees taking 0 into account
  }
  x[id,]
}
# Do this over the complete data frame
do.call(rbind,lapply(split(Data,Data$belongID),getRating)) 
     # edited per Tyler's suggestion

结果:

tc <- textConnection('
belongID   uniqID   Time   Rating  
   1           101       5      0  
   1           102       4      0  
   2           103       4      0  
   2           104       3      0  
   2           105       2      5
   3           106       4      2  
   3           107       5      0  
   3           108       5      1 ')

Data <- read.table(tc,header=TRUE)

do.call(rbind,lapply(split(Data,Data$belongID),getRating)) 

给予:

  belongID uniqID Time Rating
1        1    101    5      0
2        2    105    2      5
3        3    108    5      1

编辑: 只是为了好玩,我在一个包含1000个重复的小数据集上进行了基准测试(使用rbenchmark),并且重复了10个重要的数据集:

结果:

> benchmark(Joris(Data),Tyler(Data),uniqueMax(Data),
+           columns=c("test","elapsed","relative"),
+           replications=1000)
             test elapsed relative
1     Joris(Data)    1.20 1.025641
2     Tyler(Data)    1.42 1.213675
3 uniqueMax(Data)    1.17 1.000000

> benchmark(Joris(Data2),Tyler(Data2),uniqueMax(Data2),
+           columns=c("test","elapsed","relative"),
+           replications=10)
              test elapsed relative
1     Joris(Data2)    3.63 1.174757
2     Tyler(Data2)    4.02 1.300971
3 uniqueMax(Data2)    3.09 1.000000

这里我只是在我们的解决方案中包含了一个函数Joris()和Tyler(),并创建了Data2,如下所示:

Data2 <- data.frame(
  belongID = rep(1:1000,each=10),
  uniqID = 1:10000,
  Time = sample(1:5,10000,TRUE),
  Rating = sample(0:5,10000,TRUE)
  )