我有一个如下所示的R数据框,
name score
marry 98
marry 77
marry 87
marry 96
mark 99
mark 44
mark 79
john 87
john 77
对于每个name,我想选出得分最高的两行,结果应该是
name score
marry 98
marry 96
mark 99
mark 79
john 87
john 77
有人可以帮忙吗? 非常感谢!
答案 0 :(得分:5)
这是一种可行的base R方法:
# Rank the scores within each name (1 = highest, ties broken by row order) and
# keep the rows ranked 1 or 2. rank() is required here: order() returns a
# sorting permutation rather than each row's rank, and the two only coincide
# for some inputs (e.g. scores 10, 30, 20 would keep 10 and 20).
mydf[with(mydf, ave(-score, name,
                    FUN = function(x) rank(x, ties.method = "first"))) %in% c(1, 2), ]
# name score
# 1 marry 98
# 4 marry 96
# 5 mark 99
# 7 mark 79
# 8 john 87
# 9 john 77
如果你好奇各种方法的耗时,下面是一个小测试……
两个样本数据集,均为100万行、两列,但一个只有1000个可能的“name”值,另一个有10000个可能的值。
# Benchmark data: two 1,000,000-row data frames with integer group ids in
# `name` and scores drawn from 0:100 (both sampled with replacement).
# df1 draws its ids from 1:1000, df2 from 1:10000.
set.seed(1)
df1 <- data.frame(
  name = sample(1000, size = 1e6, replace = TRUE),
  score = sample(0:100, size = 1e6, replace = TRUE)
)
df2 <- data.frame(
  name = sample(10000, size = 1e6, replace = TRUE),
  score = sample(0:100, size = 1e6, replace = TRUE)
)
用于基准测试的函数——等我重新安装后,会尝试加入“dplyr”。
# Return the rows holding the two highest scores within each name.
#
# mydf: data frame with a grouping column `name` and a numeric column `score`.
# Returns the matching rows of `mydf` in their original row order; a name with
# fewer than two rows contributes all of its rows.
fun1 <- function(mydf) {
  # rank() gives each row's position within its group (1 = highest score, ties
  # broken by row order). order() would return the sorting permutation instead,
  # which is not the rank in general and selects the wrong rows.
  score_rank <- with(
    mydf,
    ave(-score, name, FUN = function(x) rank(x, ties.method = "first"))
  )
  mydf[score_rank %in% c(1, 2), ]
}
# data.table approach using `.SD[1:2]`: copy `mydf` into a data.table, order
# the rows by descending score and take rows 1 and 2 of every name group.
fun2 <- function(mydf) {
  dt <- as.data.table(mydf)
  dt[order(-score), .SD[1:2], by = name]
}
# data.table approach using setorder() + head(): copy `mydf` into a
# data.table, sort that copy by descending score, then keep the first two rows
# of every name group.
fun3 <- function(mydf) {
  sorted <- setorder(as.data.table(mydf), -score)
  sorted[, head(.SD, 2), by = name]
}
基准测试。
library(data.table) # provides as.data.table() and setorder() used by fun2/fun3
library(microbenchmark)
# Time each function on both data sets, 20 runs per expression.
microbenchmark(
  fun1(df1), fun2(df1), fun3(df1),
  fun1(df2), fun2(df2), fun3(df2),
  times = 20
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun1(df1) 502.76809 513.98317 569.47883 597.90488 603.34458 686.4302 20
# fun2(df1) 733.12544 741.18777 796.67106 822.60824 828.88449 839.3837 20
# fun3(df1) 87.80581 93.07012 95.34281 95.56374 97.49608 101.7991 20
# fun1(df2) 672.60241 764.10237 764.60365 772.33959 780.14679 799.3505 20
# fun2(df2) 6338.14881 6360.42621 6407.66675 6412.99278 6451.75626 6479.2681 20
# fun3(df2) 354.24119 366.47396 382.58666 369.78597 374.01897 468.9197 20
答案 1 :(得分:5)
以下是使用data.table新函数setorder的另一种方法(按引用排序)
library(data.table) # 1.9.4+
# Convert df to a data.table and sort it by descending score (both steps work
# by reference), then keep the first two rows of every name.
setDT(df)
setorder(df, -score)
df[, head(.SD, 2), by = name]
# name score
# 1: mark 99
# 2: mark 79
# 3: marry 98
# 4: marry 96
# 5: john 87
# 6: john 77
答案 2 :(得分:4)
你可以尝试:
# dplyr is available from CRAN; install it once with install.packages("dplyr")
# rather than fetching a development build from GitHub on every run.
library(dplyr)
# Sort by descending score, take the first two rows of each name, and drop the
# grouping so the result is a plain (ungrouped) table.
df %>%
  group_by(name) %>%
  arrange(desc(score)) %>%
  slice(1:2) %>%
  ungroup()
# name score
#1 john 87
#2 john 77
#3 mark 99
#4 mark 79
#5 marry 98
#6 marry 96
或使用data.table
library(data.table)
# head(.SD, 2) keeps at most two rows per name; .SD[1:2] would pad a name that
# has only one row with an all-NA row.
setDT(df)[order(-score), head(.SD, 2), by = name]
# name score
#1: mark 99
#2: mark 79
#3: marry 98
#4: marry 96
#5: john 87
#6: john 77
# Example data from the question: nine (name, score) rows covering three names.
df <- data.frame(
  name = rep(c("marry", "mark", "john"), times = c(4L, 3L, 2L)),
  score = c(98L, 77L, 87L, 96L, 99L, 44L, 79L, 87L, 77L),
  stringsAsFactors = FALSE
)
下面的基准测试还包含了dplyr方法和@Richard Scriven提供的base R方法。创建了两个数据集:1)与@Ananda Mahto的第二个数据集生成方式相同;2)行数为其50倍。
数据
# Benchmark data set 1: 1,000,000 rows with group ids sampled from 1:10000 and
# scores sampled from 0:100, both with replacement.
set.seed(1) # similar data set to the one created by @Ananda Mahto
dfAM <- data.frame(
name = sample(10000, 1000000, TRUE),
score = sample(0:100, 1000000, TRUE)
)
# Benchmark data set 2: same construction with 50 times as many rows. The seed
# is reset first, so both data sets start from the same RNG state.
set.seed(1)
df2 <- data.frame(
name = sample(10000, 50*1000000, TRUE),
score = sample(0:100, 50*1000000, TRUE)
)
函数
# Base R approach: return the rows holding the two highest scores within each
# name.
#
# mydf: data frame with a grouping column `name` and a numeric column `score`.
# Returns the matching rows of `mydf` in their original row order; a name with
# fewer than two rows contributes all of its rows.
aMahto <- function(mydf) {
  # rank() gives each row's position within its group (1 = highest score, ties
  # broken by row order). order() would return the sorting permutation instead,
  # which is not the rank in general and selects the wrong rows.
  score_rank <- with(
    mydf,
    ave(-score, name, FUN = function(x) rank(x, ties.method = "first"))
  )
  mydf[score_rank %in% c(1, 2), ]
}
# data.table approach using `.SD[1:2]`: order rows by descending score and take
# rows 1 and 2 of every name group.
# NOTE(review): setDT() converts by reference, so the object passed in by the
# caller may itself become a data.table -- confirm this is acceptable.
akrun1 <- function(mydf) {
  dt <- setDT(mydf)
  dt[order(-score), .SD[1:2], by = name]
}
# data.table approach using head(): order rows by descending score and keep the
# first two rows of every name group.
# NOTE(review): setDT() converts by reference, so the object passed in by the
# caller may itself become a data.table -- confirm this is acceptable.
akrun2 <- function(mydf) {
  dt <- setDT(mydf)
  dt[order(-score), head(.SD, 2), by = name]
}
# data.table approach using setorder(): sort by descending score by reference,
# then keep the first two rows of every name group.
# NOTE(review): setDT()/setorder() work by reference, so the caller's object
# may be converted and re-sorted as a side effect -- confirm this is intended.
dArenburg <- function(mydf) {
  setDT(mydf)
  setorder(mydf, -score)
  mydf[, head(.SD, 2), by = name]
}
# dplyr approach: group by name, sort by descending score and keep the first
# two rows of each group.
akrun3 <- function(mydf) {
  by_name <- group_by(mydf, name)
  sorted <- arrange(by_name, desc(score))
  slice(sorted, 1:2)
}
# Base R approach: split the scores by name and take the last two values of
# each sorted group, i.e. the two highest in ascending order. sapply()
# simplifies the per-name results (to a matrix with one column per name when
# every name yields two values).
rScriven1 <- function(mydf) {
  scores_by_name <- split(mydf$score, mydf$name)
  top_two <- function(scores) tail(sort(scores), n = 2)
  sapply(scores_by_name, top_two)
}
# Base R approach returning a long data frame: split the scores by name, keep
# the two highest of each group (ascending order), and stack() the list into
# `values` / `ind` columns.
rScriven2 <- function(mydf) {
  scores_by_name <- split(mydf$score, mydf$name)
  top_two <- lapply(scores_by_name, function(scores) tail(sort(scores), n = 2))
  stack(top_two)
}
基准测试:分别对每个数据集运行基准测试。
library(microbenchmark)
# Time every approach on the 1,000,000-row data set, 20 runs per expression.
microbenchmark(
  aMahto(dfAM),
  akrun1(dfAM),
  akrun2(dfAM),
  akrun3(dfAM),
  dArenburg(dfAM),
  rScriven1(dfAM),
  rScriven2(dfAM),
  times = 20L
)
Unit: milliseconds
expr min lq mean median uq
aMahto(dfAM) 278.11839 283.82547 293.16843 285.45645 298.77528
akrun1(dfAM) 2900.86528 2923.94839 2953.00178 2942.73172 2965.85868
akrun2(dfAM) 189.93849 195.05222 202.30496 196.51019 207.73886
akrun3(dfAM) 56.75191 57.14967 58.04335 57.32627 57.63047
dArenburg(dfAM) 161.87583 166.64286 171.67832 168.24355 170.14656
rScriven1(dfAM) 694.22503 701.85554 717.92201 713.15419 727.84196
rScriven2(dfAM) 712.84676 728.75839 744.07167 738.74824 759.88312
max neval
335.04468 20
3113.54895 20
250.09765 20
66.67198 20
228.44948 20
777.78973 20
797.85850 20
在更大的数据集上,@David Arenburg的方法胜出。
# Time every approach on the 50x larger data set, 40 runs per expression.
microbenchmark(
  aMahto(df2),
  akrun1(df2),
  akrun2(df2),
  akrun3(df2),
  dArenburg(df2),
  rScriven1(df2),
  rScriven2(df2),
  times = 40L
)
Unit: seconds
expr min lq mean median uq max
aMahto(df2) 11.830111 12.027325 12.273881 12.213140 12.533628 13.196659
akrun1(df2) 6.672874 6.890442 7.018749 6.956716 7.128060 7.542047
akrun2(df2) 3.794502 3.829567 3.860565 3.847690 3.869065 4.143381
akrun3(df2) 3.687974 3.725867 3.801861 3.743973 3.933935 4.102295
dArenburg(df2) 1.531356 1.598570 1.647648 1.618573 1.640258 2.716042
rScriven1(df2) 6.370144 6.573998 6.685313 6.616246 6.820830 7.118827
rScriven2(df2) 6.551911 6.628134 6.743644 6.724310 6.867090 7.091750
neval
40
40
40
40
40
40
40
答案 3 :(得分:3)
输出形式有所不同,不过这样如何?这样名称就不会重复出现。
# Split the scores by name and keep the two highest of each group (ascending
# order); for the example data sapply() simplifies this to a matrix with one
# column per name.
sapply(with(df, split(score, name)), function(s) tail(sort(s), n = 2))
# john mark marry
# [1,] 77 79 96
# [2,] 87 99 98
根据Ananda Mahto的建议,你也可以对lapply的结果使用stack
# Same per-name top two (ascending order), kept as a list and stacked into a
# long data frame with `values` and `ind` columns.
stack(lapply(with(df, split(score, name)), function(s) tail(sort(s), n = 2)))
# values ind
# 1 77 john
# 2 87 john
# 3 79 mark
# 4 99 mark
# 5 96 marry
# 6 98 marry