假设我们有以下数据:
library(tidyverse)
# Example data, written one observation per line so it reads like the printed
# table: rows 1-3 are identical, rows 5-6 are identical, the others are unique.
data <- tribble(
  ~V1, ~V2, ~V3, ~V4,
    1,   1,   1,   1,
    1,   1,   1,   1,
    1,   1,   1,   1,
    1,   2,   2,   2,
    2,   2,   2,   2,
    2,   2,   2,   2,
    1,   1,   3,   3,
    3,   3,   3,   3
)
> data
# A tibble: 8 x 4
V1 V2 V3 V4
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 ## 1st occurrence
2 1 1 1 1 ## 2nd occurrence
3 1 1 1 1 ## 3rd occurrence
4 1 2 2 2 ## This row does not count because it occurs only once in the data
5 2 2 2 2 ## 1st occurrence
6 2 2 2 2 ## 2nd occurrence
7 1 1 3 3 ## This row does not count because it occurs only once in the data
8 3 3 3 3 ## This row does not count because it occurs only once in the data
我们希望只保留出现次数达到阈值 `threshold` 的行;假设在我们的例子中,阈值设置为 2。此外,出现次数未达到阈值的行,其所有值都设置为 0。因此,结果表应为:
> data_filtered
# A tibble: 8 x 4
V1 V2 V3 V4
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 1 1 1 1
3 1 1 1 1
4 0 0 0 0
5 2 2 2 2
6 2 2 2 2
7 0 0 0 0
8 0 0 0 0
非常感谢任何建议。
答案 0 :(得分:3)
使用 `dplyr`:
library(dplyr)
# Set every value to 0 in rows whose exact combination of values occurs fewer
# than 2 times (the threshold) in `data`; rows occurring often enough are kept.
# Uses across() in place of the deprecated funs() and the superseded
# group_by_all() / mutate_at().
data %>%
  group_by(across(everything())) %>%  # one group per distinct row
  mutate(new = n()) %>%               # `new` = how often this row occurs
  ungroup() %>%                       # grouping columns cannot be modified
  mutate(across(-new, ~ replace(.x, new < 2, 0))) %>%
  select(-new)                        # drop the helper count column
结果为:
# A tibble: 8 x 4
     V1    V2    V3    V4
  <dbl> <dbl> <dbl> <dbl>
1     1     1     1     1
2     1     1     1     1
3     1     1     1     1
4     0     0     0     0
5     2     2     2     2
6     2     2     2     2
7     0     0     0     0
8     0     0     0     0
答案 1 :(得分:3)
我会选择 `data.table`:
library(data.table)
# Same example data as a data.table, laid out one observation per line
# (byrow = TRUE) and converted from a numeric matrix with named columns.
data <- as.data.table(
  matrix(
    c(
      1, 1, 1, 1,
      1, 1, 1, 1,
      1, 1, 1, 1,
      1, 2, 2, 2,
      2, 2, 2, 2,
      2, 2, 2, 2,
      1, 1, 3, 3,
      3, 3, 3, 3
    ),
    ncol = 4,
    byrow = TRUE,
    dimnames = list(NULL, c("V1", "V2", "V3", "V4"))
  )
)
# Build a per-row key by joining all column values with "_". The separator
# keeps distinct rows distinct: without it, the rows (1, 11) and (11, 1) would
# both collapse to "111". Keys therefore print as "1_1_1_1" rather than "1111".
# do.call(paste, ...) pastes whole columns at once, avoiding apply()'s
# row-by-row loop and its coercion of the table to a matrix.
data[, key := do.call(paste, c(.SD, sep = "_"))]
# Key the table on `key` (this also sorts it); merge() below joins on that key.
setkey(data, key)
# Count the rows per key (.N) and merge the counts back in as column N.
data <- merge(data, data[, .N, by = key])
此时的数据如下:
>data
key V1 V2 V3 V4 N
1: 1111 1 1 1 1 3
2: 1111 1 1 1 1 3
3: 1111 1 1 1 1 3
4: 1133 1 1 3 3 1
5: 1222 1 2 2 2 1
6: 2222 2 2 2 2 2
7: 2222 2 2 2 2 2
8: 3333 3 3 3 3 1
# The helper column `key` is no longer needed; delete it by reference.
set(data, j = "key", value = NULL)
现在,您可以根据 `N` 把出现次数不足的整行置为 0:
# Set V1..V4 to 0 in every row that occurs fewer than 2 times (threshold = 2).
# The comparison must be strict: N <= 2 would also wipe rows that occur exactly
# twice, although they reach the threshold and have to be kept.
data[N < 2, c("V1", "V2", "V3", "V4") := 0]
导致:
V1 V2 V3 V4 N
1: 1 1 1 1 3
2: 1 1 1 1 3
3: 1 1 1 1 3
4: 0 0 0 0 1
5: 0 0 0 0 1
6: 2 2 2 2 2
7: 2 2 2 2 2
8: 0 0 0 0 1
当然,最后您可以通过以下方式删除 `N` 列:
# Delete the helper count column `N` by reference.
set(data, j = "N", value = NULL)