根据频率过滤行

时间:2017-10-11 13:24:36

标签: r tidyverse

假设我们有以下数据:

library(tidyverse)

# Example data, written row by row so that duplicated rows are easy to spot:
# rows 1-3 are identical, rows 5-6 are identical, rows 4, 7 and 8 are unique.
data <- tribble(
  ~V1, ~V2, ~V3, ~V4,
    1,   1,   1,   1,
    1,   1,   1,   1,
    1,   1,   1,   1,
    1,   2,   2,   2,
    2,   2,   2,   2,
    2,   2,   2,   2,
    1,   1,   3,   3,
    3,   3,   3,   3
)

> data
# A tibble: 8 x 4
     V1    V2    V3    V4
  <dbl> <dbl> <dbl> <dbl>
1     1     1     1     1 ## 1st occurrence 
2     1     1     1     1 ## 2nd occurrence
3     1     1     1     1 ## 3rd occurrence
4     1     2     2     2 ## This row does not count because it occurs only once in the data
5     2     2     2     2 ## 1st occurrence
6     2     2     2     2 ## 2nd occurrence
7     1     1     3     3 ## This row does not count because it occurs only once in the data
8     3     3     3     3 ## This row does not count because it occurs only once in the data

我们希望只保留出现次数不少于阈值(threshold)的行;假设在我们的例子中,阈值设置为2。此外,出现次数未达到阈值的行,其所有值都设置为0。因此,结果表应为:

> data_filtered
# A tibble: 8 x 4
     V1    V2    V3    V4
  <dbl> <dbl> <dbl> <dbl>
1     1     1     1     1
2     1     1     1     1
3     1     1     1     1
4     0     0     0     0
5     2     2     2     2
6     2     2     2     2
7     0     0     0     0
8     0     0     0     0

非常感谢任何建议。

2 个答案:

答案 0 :(得分:3)

使用dplyr的一个思路:

library(dplyr)

# Zero out every row whose exact combination of values occurs fewer than
# `threshold` times; rows that reach the threshold are kept unchanged.
# Row order is preserved and the helper count column is removed at the end.
threshold <- 2

data %>%
  # Group on all columns so that identical rows fall into the same group.
  group_by(across(everything())) %>%
  # `new` holds the number of occurrences of each row.
  mutate(new = n()) %>%
  # Drop the grouping first: grouping columns cannot be modified by mutate().
  ungroup() %>%
  mutate(across(-new, ~ replace(.x, new < threshold, 0))) %>%
  select(-new)

给出,

# A tibble: 8 x 4
     V1    V2    V3    V4
  <dbl> <dbl> <dbl> <dbl>
1     1     1     1     1
2     1     1     1     1
3     1     1     1     1
4     0     0     0     0
5     2     2     2     2
6     2     2     2     2
7     0     0     0     0
8     0     0     0     0

答案 1 :(得分:3)

我会选择data.table

library(data.table)

 # Same example table as a data.table, laid out row by row:
 # rows 1-3 are identical, rows 5-6 are identical, rows 4, 7 and 8 are unique.
 data <- as.data.table(matrix(
   c(
     1, 1, 1, 1,
     1, 1, 1, 1,
     1, 1, 1, 1,
     1, 2, 2, 2,
     2, 2, 2, 2,
     2, 2, 2, 2,
     1, 1, 3, 3,
     3, 3, 3, 3
   ),
   ncol = 4,
   byrow = TRUE,
   dimnames = list(NULL, c("V1", "V2", "V3", "V4"))
 ))

# Build one string per row by concatenating its values column by column;
# identical rows end up with the same key.
# NOTE(review): the values are joined without a separator, so multi-digit
# values could collide (e.g. 1,11 vs 11,1) -- fine for this single-digit example.
data[, key := Reduce(paste0, .SD)]

# Sort the table by the key column and mark it as the table's key.
setkeyv(data, "key")

# Count the rows per key (column N) and join the counts back onto every row.
data <- merge(data, data[, .(N = .N), by = key], by = "key")

此时的中间结果为:

>data
    key V1 V2 V3 V4 N
1: 1111  1  1  1  1 3
2: 1111  1  1  1  1 3
3: 1111  1  1  1  1 3 
4: 1133  1  1  3  3 1
5: 1222  1  2  2  2 1
6: 2222  2  2  2  2 2
7: 2222  2  2  2  2 2
8: 3333  3  3  3  3 1

 data[, "key" := NULL]  # remove the helper key column by reference; N stays

现在,您可以根据N过滤整个行,通过:

# Zero out, by reference, all value columns of rows that occur fewer than
# 2 times (the threshold); rows occurring exactly 2 times are kept.
data[N < 2, c("V1", "V2", "V3", "V4") := 0]

导致:

   V1 V2 V3 V4 N
1:  1  1  1  1 3
2:  1  1  1  1 3
3:  1  1  1  1 3
4:  0  0  0  0 1
5:  0  0  0  0 1
6:  2  2  2  2 2
7:  2  2  2  2 2
8:  0  0  0  0 1

当然,之后您可以通过以下方式删除N列:

data[,N:=NULL]