如何在数据框中找出单调的序列

时间:2017-04-30 08:34:49

标签: r data.table

假设我在 R 中有一个数据框 df:

# data.table() is not part of base R, so the package must be attached first.
library(data.table)

# Example data: 15 rows, each with a name, a score and the class it belongs to.
name <- c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o")
score <- c(42, 45, 47, 32, 49, 50, 51, 28, 54, 55, 56, 12, 13, 61, 64)
class <- c("c1", "c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2", "c2", "c3", "c3", "c3", "c3", "c3")
df <- data.table(name, score, class)

看起来像:

df
     name score class
 1:    a    42    c1
 2:    b    45    c1
 3:    c    47    c1
 4:    d    32    c1
 5:    e    49    c1
 6:    f    50    c2
 7:    g    51    c2
 8:    h    28    c2
 9:    i    54    c2
10:    j    55    c2
11:    k    56    c3
12:    l    12    c3
13:    m    13    c3
14:    n    61    c3
15:    o    64    c3

我只需要每个 class 内分数保持单调顺序的那些记录。在本例中,即 c1 类中分数为 42、45、47、49 的记录,c2 类中分数为 50、51、54、55 的记录,以及 c3 类中分数为 56、61、64 的记录。每个类中分数不在上述序列里的记录都要删除。数据总共有 100 万条记录。

2 个答案:

答案 0 :(得分:1)

我们可以使用diff

# Keep the first row, plus every row whose score is greater than the score in the row before it.
df[c(TRUE, diff(score)>0)]
#    name score
#1:    a    42
#2:    b    45
#3:    c    47
#4:    e    50
#5:    f    51

更新

针对 OP 帖子中更新后的示例:

# Per class, collect the row numbers (.I) where score equals the running maximum, then subset df by them.
df[df[, .I[cummax(score) == score], by = class][["V1"]]]
#    name score class
# 1:    a    42    c1
# 2:    b    45    c1
# 3:    c    47    c1
# 4:    e    49    c1
# 5:    f    50    c2
# 6:    g    51    c2
# 7:    i    54    c2
# 8:    j    55    c2
# 9:    k    56    c3
#10:    n    61    c3
#11:    o    64    c3

同样的思路也可以用 base R 实现:
# Base R: ave() gives the running maximum of score within each class; keep the rows that match it.
df[df$score == ave(df$score, df$class, FUN = cummax), ]
#     name score class
# 1:    a    42    c1
# 2:    b    45    c1
# 3:    c    47    c1
# 4:    e    49    c1
# 5:    f    50    c2
# 6:    g    51    c2
# 7:    i    54    c2
# 8:    j    55    c2
# 9:    k    56    c3
#10:    n    61    c3
#11:    o    64    c3

dplyr

library(dplyr)
# Within each class, keep only the rows whose score equals the running maximum.
df %>%
  group_by(class) %>%
  filter(score == cummax(score)) %>%
  ungroup()  # drop the grouping so later verbs are not silently applied per class
#    name score class
#   <chr> <dbl> <chr>
#1      a    42    c1
#2      b    45    c1
#3      c    47    c1
#4      e    49    c1
#5      f    50    c2
#6      g    51    c2
#7      i    54    c2
#8      j    55    c2
#9      k    56    c3
#10     n    61    c3
#11     o    64    c3

答案 1 :(得分:1)

另一种选择是使用 shift 函数:

# Keep rows whose score is greater than the previous row's score (the first row is compared with 0).
df[score > shift(score, fill = 0)]

给出:

   name score
1:    a    42
2:    b    45
3:    c    47
4:    e    50
5:    f    51
# Build a larger random data set (one million rows) for timing.
dt <- data.table(
  name = sample(letters, size = 1e6, replace = TRUE),
  score = sample(20:60, size = 1e6, replace = TRUE)
)

在较大数据集上的简单基准测试:

> system.time(dt[shift(score, fill = 0) < score])
   user  system elapsed 
  0.008   0.002   0.009 
> system.time(dt[dt[, .I[c(TRUE, diff(score)>0)]]])
   user  system elapsed 
  0.027   0.008   0.035 
> system.time(dt[c(TRUE, diff(score)>0)])
   user  system elapsed 
  0.020   0.003   0.023 

作为 @akrun 针对更新后示例所给方案的替代,您还可以这样做:

# For each class, keep the rows of the group (.SD) whose score equals the running maximum.
df[, .SD[cummax(score) == score], by = class]

或者使用 base R:

# Base R: ave() returns the per-class comparison as 0/1 numbers, so convert back to logical before subsetting.
df[as.logical(ave(df$score, df$class, FUN = function(s) s == cummax(s))), ]

再次进行基准测试:

# Build a larger data set: three classes with one million random rows each.
dt <- data.table(
  name = sample(letters, size = 3e6, replace = TRUE),
  score = sample(20:60, size = 3e6, replace = TRUE),
  class = rep(c("c1", "c2", "c3"), each = 1e6)
)

# the benchmark

> system.time(dt[, .SD[score == cummax(score)], class])
   user  system elapsed 
  0.030   0.007   0.037 
> system.time(dt[dt[, .I[score == cummax(score)], class]$V1])
   user  system elapsed 
  0.028   0.007   0.035 
> system.time(dt[!!with(dt, ave(score, class, FUN = function(x) x == cummax(x))), ])
   user  system elapsed 
  0.158   0.076   0.236