我需要从一个很大的数据框中取样:对于每个 id,取出每个类别(category)中排名(rank)最靠前的 n 个因子(factor)。我已经尝试过使用嵌套 for 循环,但我认为这不是一种高效的方式。数据和我的尝试如下。
# Sample data: one row per (id, category, factor) combination with its rank.
id <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2)
category <- c("A", "A", "A", "B", "B", "B", "A", "A", "A", "A", "B", "B", "B")
factor <- c(
  "A1", "A2", "A3", "B1", "B2", "B3",
  "A2", "A1", "A3", "A4", "B2", "B1", "B3"
)
rank <- c(2, 1, 5, 2, 1, 6, 12, 10, 8, 9, 23, 12, 10)
df <- data.frame(id, category, factor, rank)
df$rank <- as.integer(df$rank)

# Number of best-ranked (smallest rank) rows to keep per (id, category) group.
n_keep <- 2L

# Group keys are taken from the data frame itself, not from same-named globals.
ids <- unique(df$id)
categories <- sort(unique(df$category))

# Preallocate one slot per (id, category) pair instead of growing a list.
pieces <- vector("list", length(ids) * length(categories))
slot <- 0L
for (i in seq_along(ids)) {
  for (j in seq_along(categories)) {
    # Filter first, then sort the subset, so the row mask and the rows it is
    # applied to are always aligned. which() drops NA comparisons.
    in_group <- which(df$id == ids[i] & df$category == categories[j])
    grp <- df[in_group, , drop = FALSE]
    grp <- grp[order(grp$rank), , drop = FALSE]
    slot <- slot + 1L
    # head() returns at most n_keep rows, so groups with fewer rows do not
    # produce all-NA padding rows that would need filtering out afterwards.
    pieces[[slot]] <- utils::head(grp, n_keep)
  }
}

# Stack the per-group results; fall back to an empty frame with the same
# columns when there were no groups at all.
final <- if (length(pieces) > 0L) {
  do.call(rbind, pieces)
} else {
  df[0L, , drop = FALSE]
}
# As in the original attempt, the result replaces df.
df <- final
DESIRED OUTPUT:
id category factor rank
1 A A2 1
1 A A1 2
1 B B1 2
1 B B2 1
2 A A3 8
2 A A4 9
2 B B3 10
2 B B1 12
答案 0 :(得分:5)
使用基础 R(base R),你可以这样做:
# Return the n best-ranked (smallest `rank`) rows of each (id, category) group.
#
# df: data frame with columns `id`, `category` and `rank`.
# n:  maximum number of rows to keep per group.
#
# Returns a data frame with the same columns as `df`. Groups with fewer than
# n rows contribute all of their rows (no NA padding), and empty
# id/category combinations contribute nothing.
topn <- function(df, n) {
  # Group on the columns of the `df` argument rather than on global vectors
  # that merely happen to be called `id` and `category`.
  groups <- by(
    df,
    list(df$id, df$category),
    function(x) utils::head(x[order(x$rank), , drop = FALSE], n)
  )
  # by() yields NULL for empty combinations; rbind() silently skips those.
  do.call(rbind, groups)
}
topn(df, 2)
id category factor rank
2 1 A A2 1
1 1 A A1 2
9 2 A A3 8
10 2 A A4 9
5 1 B B2 1
4 1 B B1 2
13 2 B B3 10
12 2 B B1 12
答案 1 :(得分:2)
使用data.table,您只需执行
library(data.table)

# Number of best-ranked rows to keep per (id, category) group.
n <- 2
# setDT() converts df to a data.table by reference, so no reassignment is
# needed.
setDT(df)
# Sort by rank, then keep the first n rows of each group. head() returns at
# most n rows, whereas .SD[1:n] pads groups smaller than n with all-NA rows.
df[order(rank), head(.SD, n), by = .(id, category)]
1: 1 A A2 1
2: 1 A A1 2
3: 1 B B2 1
4: 1 B B1 2
5: 2 A A3 8
6: 2 A A4 9
7: 2 B B3 10
8: 2 B B1 12
或者如果你想加快速度
# Collect the row indices of the first n rows per group (after sorting by
# rank) and subset df once. head(.I, n) never yields NA indices, so groups
# with fewer than n rows do not turn into all-NA rows as with .I[1:n].
df[df[order(rank), head(.I, n), by = .(id, category)]$V1]
我不太熟悉 dplyr,dplyr 的解法就留给其他人来补充吧。