如何通过在R中有效地过滤和分组来对数据进行子集化

时间:2018-06-08 22:13:26

标签: r dplyr data.table

我正在开展一个项目,希望得到一些帮助,让我的代码运行得更高效。我已经搜索过类似的问题,但似乎找不到与这个问题细节相同的内容。我目前想出的解决方案非常笨重。我相信,使用dplyr或data.table等软件包一定有更高效的方法来完成这件事。

问题:我有3列数据:'ids'、'x.group'和'time'。我需要针对每个'x.group',提取在每个'time'块中出现的前3个唯一'ids'。

但是,我不想包含任何等于“0”的'ids'或'x.group'。我的代码底部的输出给出了正确的值,但在我看来,这种做法相当笨拙。

注意:在下面的代码示例中,我使用x.groups = ['A','B','0'],但在我的实际项目中,这些可以取许多不同的值,因此它们不会总是'A'或'B',不过'0'将始终存在(例如,我可能有['A','K','0']、['M','W','0']等)。您可以在本文的底部找到示例数据集。

# Collect every x.group label except the "0" placeholder.
xs <- unique(myDF$x.group)
xs <- xs[xs != "0"]

# De-duplicated rows of myDF restricted to the non-"0" x.groups.
ps <- unique(myDF[myDF$x.group %in% xs, c("ids", "x.group", "time")])

# First three ids (ignoring id "0") found in `ps` for one x.group label and
# one time block. Indexing with 1:3 pads with NA when fewer than three match.
first_three_ids <- function(grp, blk) {
  hit <- ps$x.group == grp & ps$ids != "0" & ps$time == blk
  ps$ids[hit][1:3]
}

# Time block 1
first3.x1.t1 <- first_three_ids(xs[1], "1")
first3.x2.t1 <- first_three_ids(xs[2], "1")
# Time block 2
first3.x1.t2 <- first_three_ids(xs[1], "2")
first3.x2.t2 <- first_three_ids(xs[2], "2")
# Time block 3
first3.x1.t3 <- first_three_ids(xs[1], "3")
first3.x2.t3 <- first_three_ids(xs[2], "3")

# First 3 unique ids from time block 1 for each x.group
> first3.x1.t1; first3.x2.t1;
[1] "2"  "17" "11"
[1] "5"  "10" "4"

# First 3 unique ids from time block 2 for each x.group
> first3.x1.t2; first3.x2.t2;
[1] "9"  "6"  "16"
[1] "8"  "13" "7" 

# First 3 unique ids from time block 3 for each x.group
> first3.x1.t3; first3.x2.t3;
[1] "11" "2"  "10"
[1] "1"  "3"  "13"

数据:

# Build the 40-row example data set; every column is a character vector.
# Values are laid out one time block at a time (13, 13 and 14 rows).
ids <- c(
  "2", "0", "15", "5", "17", "10", "4", "2", "3", "11", "11", "18", "10",
  "8", "13", "9", "6", "16", "7", "14", "16", "7", "11", "12", "14", "5",
  "1", "11", "3", "2", "10", "17", "3", "13", "10", "17", "2", "10", "16", "10"
)
x.group <- c(
  "A", "A", "0", "B", "A", "B", "B", "A", "B", "A", "A", "0", "B",
  "B", "B", "A", "A", "A", "B", "B", "A", "A", "0", "B", "A", "B",
  "B", "A", "B", "A", "A", "0", "B", "B", "B", "A", "A", "A", "B", "B"
)
time <- rep(c("1", "2", "3"), times = c(13, 13, 14))

# Column names are taken from the variable names: ids, x.group, time.
myDF <- data.frame(ids, x.group, time, stringsAsFactors = FALSE)
> myDF
   ids x.group time
1    2       A    1
2    0       A    1
3   15       0    1
4    5       B    1
5   17       A    1
6   10       B    1
7    4       B    1
8    2       A    1
9    3       B    1
10  11       A    1
11  11       A    1
12  18       0    1
13  10       B    1
14   8       B    2
15  13       B    2
16   9       A    2
17   6       A    2
18  16       A    2
19   7       B    2
20  14       B    2
21  16       A    2
22   7       A    2
23  11       0    2
24  12       B    2
25  14       A    2
26   5       B    2
27   1       B    3
28  11       A    3
29   3       B    3
30   2       A    3
31  10       A    3
32  17       0    3
33   3       B    3
34  13       B    3
35  10       B    3
36  17       A    3
37   2       A    3
38  10       A    3
39  16       B    3
40  10       B    3

3 个答案:

答案 0 :(得分:2)

# For every x.group/time combination, take the first three distinct ids,
# skipping rows where either x.group or ids is "0".
aggregate(
  ids ~ .,
  data = myDF,
  FUN = function(x) unique(x)[1:3],
  subset = x.group != "0" & ids != "0"
)
  x.group time ids.1 ids.2 ids.3
1       A    1     2    17    11
2       B    1     5    10     4
3       A    2     9     6    16
4       B    2     8    13     7
5       A    3    11     2    10
6       B    3     1     3    13

这返回了一个嵌套的数据框。您可以按如下方式将其展开(取消嵌套):

# First three distinct ids per x.group/time combination, excluding rows where
# x.group or ids is "0"; aggregate() stores the three ids in one matrix column.
a <- aggregate(
  ids ~ .,
  data = myDF,
  FUN = function(x) unique(x)[1:3],
  subset = x.group != "0" & ids != "0"
)
# The unnested data frame: rebuilding it with data.frame() turns the matrix
# column into separate ids.1, ids.2, ids.3 columns.
b <- do.call(data.frame, a)
b
  x.group time ids.1 ids.2 ids.3
1       A    1     2    17    11
2       B    1     5    10     4
3       A    2     9     6    16
4       B    2     8    13     7
5       A    3    11     2    10
6       B    3     1     3    13

答案 1 :(得分:1)

library(dplyr)

# Drop duplicate rows, discard rows whose x.group or ids is "0", then keep the
# first three remaining rows of each (x.group, time) group.
myDF %>%
  distinct() %>%
  filter(x.group != "0", ids != "0") %>%
  group_by(x.group, time) %>%
  slice(seq_len(3))

# # A tibble: 18 x 3
# # Groups: x.group, time [6]
#    ids   x.group time 
#    <chr> <chr>   <chr>
#  1 2     A       1    
#  2 17    A       1    
#  3 11    A       1    
#  4 9     A       2    
#  5 6     A       2    
#  6 16    A       2    
#  7 11    A       3    
#  8 2     A       3    
#  9 10    A       3    
# 10 5     B       1    
# 11 10    B       1    
# 12 4     B       1    
# 13 8     B       2    
# 14 13    B       2    
# 15 7     B       2    
# 16 1     B       3    
# 17 3     B       3    
# 18 13    B       3 

答案 2 :(得分:1)

下面是一个data.table解决方案,我认为它应该是最快的;如果避免对每个组调用.SD,还可以更快。

library(data.table)
# Convert myDF to a data.table, de-duplicate its rows, drop rows whose ids or
# x.group is "0", and keep the first three rows of each (time, x.group) group.
unique(setDT(myDF))[
  x.group != "0" & ids != "0",
  head(.SD, 3),
  by = .(time, x.group)
]


# time x.group ids
# 1:    1       A   2
# 2:    1       A  17
# 3:    1       A  11
# 4:    1       B   5
# 5:    1       B  10
# 6:    1       B   4
# 7:    2       B   8
# 8:    2       B  13
# 9:    2       B   7
# 10:    2       A   9
# 11:    2       A   6
# 12:    2       A  16
# 13:    3       B   1
# 14:    3       B   3
# 15:    3       B  13
# 16:    3       A  11
# 17:    3       A   2
# 18:    3       A  10

# Time the three approaches against the same input, myDF.
#
# The data.table expression uses as.data.table() rather than setDT():
# setDT() converts its argument to a data.table by reference, so running it
# inside the benchmark would change myDF for the other two expressions as
# well, and they would no longer be timed on a plain data.frame.
#
# microbenchmark is namespace-qualified because this snippet does not load it.
# The "aggreagte" label is kept as originally spelled so the rows of the
# result keep the same names.
microbenchmark::microbenchmark(
  dplyr = {
    myDF %>%
      distinct() %>%
      filter(x.group != "0" & ids != 0) %>%
      group_by(x.group, time) %>%
      slice(1:3)
  },
  aggreagte = {
    aggregate(
      ids ~ .,
      myDF,
      function(x) unique(x)[1:3],
      subset = x.group != "0" & ids != 0
    )
  },
  data.table = {
    unique(as.data.table(myDF))[
      ids != 0 & x.group != 0,
      head(.SD, 3),
      by = list(time, x.group)
    ]
  }
)

# Unit: microseconds
# expr      min        lq      mean    median       uq       max neval
# dplyr 6696.740 7025.1780 7911.2968 7229.2430 7500.627 35545.183   100
# aggreagte  920.410  981.9920 1090.5363 1041.1590 1132.627  2801.076   100
# data.table  825.925  894.6005  979.3326  961.3135 1052.329  1267.865   100