R,根据某列的等级选择行

时间:2014-10-18 17:14:21

标签: r

我有一个如下所示的R数据框,

name score
marry 98
marry 77
marry 87
marry 96
mark 99
mark 44
mark 79
john 87
john 77

对于每个名称,我想选择具有最高2分的行,应该是

name score
marry 98
marry 96
mark 99
mark 79
john 87
john 77

有人可以帮忙吗? 非常感谢!

4 个答案:

答案 0 :(得分:5)

这是一种可能的基本方法:

mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
#    name score
# 1 marry    98
# 4 marry    96
# 5  mark    99
# 7  mark    79
# 8  john    87
# 9  john    77

对于好奇的,关于时间的问题 - 这是一个小小的考验......

两个样本数据集,包括1M行,两列,但一个只有1000个可能的“name”值,另一个具有10000个可能的值。

set.seed(1)
df1 <- data.frame(
  name = sample(1000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)
df2 <- data.frame(
  name = sample(10000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)

基准测试的功能 - 我会在重新安装后尝试添加“dplyr”。

fun1 <- function(mydf) {
  mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
}

fun2 <- function(mydf) {
  as.data.table(mydf)[order(-score), .SD[1:2], by=name]
}

fun3 <- function(mydf) {
  df <- as.data.table(mydf)
  setorder(df, -score)[, head(.SD, 2), by = name]
}

基准测试。

library(microbenchmark)
microbenchmark(fun1(df1), fun2(df1), fun3(df1), 
               fun1(df2), fun2(df2), fun3(df2), times = 20)
# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval
#  fun1(df1)  502.76809  513.98317  569.47883  597.90488  603.34458  686.4302    20
#  fun2(df1)  733.12544  741.18777  796.67106  822.60824  828.88449  839.3837    20
#  fun3(df1)   87.80581   93.07012   95.34281   95.56374   97.49608  101.7991    20
#  fun1(df2)  672.60241  764.10237  764.60365  772.33959  780.14679  799.3505    20
#  fun2(df2) 6338.14881 6360.42621 6407.66675 6412.99278 6451.75626 6479.2681    20
#  fun3(df2)  354.24119  366.47396  382.58666  369.78597  374.01897  468.9197    20

答案 1 :(得分:5)

以下是使用新data.table函数的另一种setorder方法(按引用排序)

library(data.table) # 1.9.4+
setorder(setDT(df), -score)[, head(.SD, 2), by = name]
#     name score
# 1:  mark    99
# 2:  mark    79
# 3: marry    98
# 4: marry    96
# 5:  john    87
# 6:  john    77

答案 2 :(得分:4)

你可以尝试:

 devtools::install_github("hadley/dplyr")
 library(dplyr)


 df %>% 
      group_by(name) %>% 
      arrange(desc(score)) %>%
       slice(1:2)

 #     name score
 #1  john    87
 #2  john    77
 #3  mark    99
 #4  mark    79
 #5 marry    98
 #6 marry    96

或使用data.table

 library(data.table)
 setDT(df)[order(-score), .SD[1:2], by=name]
 #      name score
 #1:  mark    99
 #2:  mark    79
 #3: marry    98
 #4: marry    96
 #5:  john    87
 #6:  john    77

数据

df <- structure(list(name = c("marry", "marry", "marry", "marry", "mark", 
   "mark", "mark", "john", "john"), score = c(98L, 77L, 87L, 96L, 
   99L, 44L, 79L, 87L, 77L)), .Names = c("name", "score"), class = "data.frame", row.names = c(NA, 
  -9L))

基准

@Richard Scriven提供的dplyrbase R方法。创建了两个数据集,1)与@Ananda Mahto的第二个数据集生成相同,2)大50倍。

  • 数据

    set.seed(1) #similar dataset as created by @Ananda Mahto
    dfAM <- data.frame(
                name = sample(10000, 1000000, TRUE),
                score = sample(0:100, 1000000, TRUE)
                )
    
    
    set.seed(1)
    df2 <- data.frame(
                 name = sample(10000, 50*1000000, TRUE),
                 score = sample(0:100, 50*1000000, TRUE)
                )
    
  • 功能

      aMahto <- function(mydf) {mydf[with(mydf, 
                 ave(-score, name, FUN = order)) %in% c(1, 2), ]
               }
    
      akrun1 <- function(mydf) {setDT(mydf)[order(-score), .SD[1:2], by=name] }
      akrun2 <- function(mydf) {setDT(mydf)[order(-score), head(.SD,2), by=name] }
      dArenburg <- function(mydf){ setorder(setDT(mydf), -score)[,
                                                head(.SD,2), by=name]}
      akrun3 <- function(mydf) { mydf %>% group_by(name) %>% 
                                   arrange(desc(score)) %>% slice(1:2) }
    
    
      rScriven1 <- function(mydf) {sapply(split(mydf$score, mydf$name),
                                           function(x) tail(sort(x), 2))}
      rScriven2 <- function(mydf) {stack(lapply(split(mydf$score, mydf$name),
                                            function(x) tail(sort(x), 2)))}
    
  • 基准 分别运行每个数据集的基准。

      library(microbenchmark)
      microbenchmark(aMahto(dfAM), akrun1(dfAM), akrun2(dfAM), akrun3(dfAM),
                    dArenburg(dfAM), rScriven1(dfAM), rScriven2(dfAM), times=20L)
       Unit: milliseconds
             expr        min         lq       mean     median         uq
       aMahto(dfAM)  278.11839  283.82547  293.16843  285.45645  298.77528
       akrun1(dfAM) 2900.86528 2923.94839 2953.00178 2942.73172 2965.85868
       akrun2(dfAM)  189.93849  195.05222  202.30496  196.51019  207.73886
       akrun3(dfAM)   56.75191   57.14967   58.04335   57.32627   57.63047
    dArenburg(dfAM)  161.87583  166.64286  171.67832  168.24355  170.14656
    rScriven1(dfAM)  694.22503  701.85554  717.92201  713.15419  727.84196
    rScriven2(dfAM)  712.84676  728.75839  744.07167  738.74824  759.88312
            max neval
      335.04468    20
     3113.54895    20
      250.09765    20
       66.67198    20
      228.44948    20
      777.78973    20
      797.85850    20
    

在更大的数据集上,@ David Arenburg的方法是Winner

    microbenchmark(aMahto(df2), akrun1(df2), akrun2(df2), akrun3(df2), 
                 dArenburg(df2), rScriven1(df2), rScriven2(df2), times=40L)
    Unit: seconds
            expr       min        lq      mean    median        uq       max
     aMahto(df2) 11.830111 12.027325 12.273881 12.213140 12.533628 13.196659
     akrun1(df2)  6.672874  6.890442  7.018749  6.956716  7.128060  7.542047
     akrun2(df2)  3.794502  3.829567  3.860565  3.847690  3.869065  4.143381
     akrun3(df2)  3.687974  3.725867  3.801861  3.743973  3.933935  4.102295
  dArenburg(df2)  1.531356  1.598570  1.647648  1.618573  1.640258  2.716042
  rScriven1(df2)  6.370144  6.573998  6.685313  6.616246  6.820830  7.118827
  rScriven2(df2)  6.551911  6.628134  6.743644  6.724310  6.867090  7.091750
     neval
      40
      40
      40
      40
      40
      40
      40

答案 3 :(得分:3)

这是一个不同的输出,但是如何这样才能重复这些名称。

sapply(split(df$score, df$name), function(x) tail(sort(x), 2))
#      john mark marry
# [1,]   77   79    96
# [2,]   87   99    98

根据Ananda Mahto的建议,你也可以stack使用lapply

stack(lapply(split(df$score, df$name), function(x) tail(sort(x), 2)))
#   values   ind
# 1     77  john
# 2     87  john
# 3     79  mark
# 4     99  mark
# 5     96 marry
# 6     98 marry