从df中选择根据其值创建子组(逐个)的行

时间:2018-03-26 06:21:28

标签: r dataframe filter group-by dplyr

我在处理数据时又遇到了一个问题。下面是可复现的数据框(df):

# Reproducible example data: four integer signal columns, a grouping column
# (gene) and the numeric column (val) that the streak filter is applied to.
signal1 <- 1:6
signal2 <- 7:12
signal3 <- 13:18
signal4 <- 19:24
val <- c(2.5, 3.2, 2.9, 0.1, 0.4, 4.1)
# tag is defined but is not included in the data frame below.
tag <- c("str1", "str2", "str3", "str4", "str5", "str6")
gene <- c("ABC", "ABC", "ABC", "DEF", "DEF", "DEF")
# The value column is `val`; no object named `FC` exists in this script.
df <- data.frame(signal1, signal2, signal3, signal4, gene, val)

  signal1 signal2 signal3 signal4 gene val
1       1       7      13      19  ABC 2.5
2       2       8      14      20  ABC 3.2
3       3       9      15      21  ABC 2.9
4       4      10      16      22  DEF 0.1
5       5      11      17      23  DEF 0.4
6       6      12      18      24  DEF 4.1

示例I

我想在每个gene组内,选出val值不小于2.5的连续行序列(长度为2行或更多)。关键在于这些行必须彼此相邻(一行紧接一行),因此期望的输出应该是:

  signal1 signal2 signal3 signal4 gene val
1       1       7      13      19  ABC 2.5
2       2       8      14      20  ABC 3.2
3       3       9      15      21  ABC 2.9

小组ABC中的三行符合条件 - 系列长度 - 3,一个接一个,所有这些都有val >= 2.5

示例II

对于数据集:

  signal1 signal2 signal3 signal4 gene val
1       1       7      13      19  ABC 2.5
2       2       8      14      20  ABC 0.2
3       3       9      15      21  ABC 2.9
4       4      10      16      22  DEF 0.1
5       5      11      17      23  DEF 0.4
6       6      12      18      24  DEF 4.1

结果应为空的df,因为任何一组中都不存在满足条件的连续行序列。

示例III

  signal1 signal2 signal3 signal4 gene val
1       1       7      13      19  ABC 0.5
2       2       8      14      20  ABC 3.2
3       3       9      15      21  ABC 2.9
4       4      10      16      22  DEF 7.1
5       5      11      17      23  DEF 4.4
6       6      12      18      24  DEF 2.1

输出:

  signal1 signal2 signal3 signal4 gene val
2       2       8      14      20  ABC 3.2
3       3       9      15      21  ABC 2.9
4       4      10      16      22  DEF 7.1
5       5      11      17      23  DEF 4.4

两组/条纹/一系列行与val >= 2.5

一个接一个

示例IV

让我们采取更大的数据集:

   signal1 signal2 signal3 signal4 gene  val
1        1      11      21      31  ABC  0.5
2        2      12      22      32  ABC  3.2
3        3      13      23      33  ABC  2.9
4        4      14      24      34  ABC  7.1
5        5      15      25      35  ABC  0.4
6        6      16      26      36  DEF  4.1
7        7      17      27      37  DEF  6.2
8        8      18      28      38  DEF  0.2
9        9      19      29      39  DEF  3.2
10      10      20      30      40  DEF 12.1

输出:

   signal1 signal2 signal3 signal4 gene  val
2        2      12      22      32  ABC  3.2
3        3      13      23      33  ABC  2.9
4        4      14      24      34  ABC  7.1
6        6      16      26      36  DEF  4.1
7        7      17      27      37  DEF  6.2
9        9      19      29      39  DEF  3.2
10      10      20      30      40  DEF 12.1

我希望你能看到我在寻找什么。

我尝试使用dplyr执行某些操作:

# Attempt: group by gene, then by whether val reaches the 2.5 threshold.
df %>%
  group_by(gene) %>%
  # NOTE(review): without add = TRUE this second group_by() replaces the
  # grouping by gene instead of extending it, so only the logical
  # `val >= 2.5` column remains as a grouping variable.
  group_by(val >= 2.5)

来自例II的数据的结果:

# A tibble: 6 x 7
# Groups:   val >= 2.5 [2]
  signal1 signal2 signal3 signal4 gene    val `val >= 2.5`
    <int>   <int>   <int>   <int> <fct> <dbl> <lgl>
1       1       7      13      19 ABC   2.50  T
2       2       8      14      20 ABC   0.200 F
3       3       9      15      21 ABC   2.90  T
4       4      10      16      22 DEF   0.100 F
5       5      11      17      23 DEF   0.400 F
6       6      12      18      24 DEF   4.10  T

接下来需要选出T至少连续出现两次的那些行。在这个例子中,不存在这样的情况......

我将非常感谢你的帮助。

修改

akrun提出的答案可以解决问题: 对于数据集:

   signal1 signal2 signal3 signal4 gene  val
1        1      11      21      31  ABC  0.5
2        2      12      22      32  ABC  3.2
3        3      13      23      33  ABC  0.9
4        4      14      24      34  ABC  7.1
5        5      15      25      35  ABC  0.4
6        6      16      26      36  DEF  4.1
7        7      17      27      37  DEF  6.2
8        8      18      28      38  DEF  0.2
9        9      19      29      39  DEF  0.2
10      10      20      30      40  DEF 12.1

我期望只得到DEF组中编号为6和7的两行。

我们有:

# A tibble: 2 x 6
  signal1 signal2 signal3 signal4 gene    val
    <int>   <int>   <int>   <int> <fct> <dbl>
1       6      16      26      36 DEF    4.10
2       7      17      27      37 DEF    6.20

效果很好!

编辑#2:

不幸的是,我发现了一个小bug:

对于数据:

   signal1 signal2 signal3 signal4 gene  val
1        1      11      21      31  ABC  0.5
2        2      12      22      32  ABC  3.2
3        3      13      23      33  ABC  7.9
4        4      14      24      34  DEF  8.1
5        5      15      25      35  DEF  0.4
6        6      16      26      36  DEF  4.1
7        7      17      27      37  GHI  6.0
8        8      18      28      38  GHI  0.2
9        9      19      29      39  GHI  8.2
10      10      20      30      40  JKL 12.1

应该只返回第2行和第3行,然而运行:

f1(df, gene, val)

我们有:

# A tibble: 6 x 6
  signal1 signal2 signal3 signal4 gene    val
    <int>   <int>   <int>   <int> <fct> <dbl>
1       2      12      22      32 ABC    3.20
2       3      13      23      33 ABC    7.90
3       4      14      24      34 DEF    8.10
4       6      16      26      36 DEF    4.10
5       7      17      27      37 GHI    6.00
6       9      19      29      39 GHI    8.20

然而你的第一段(first)代码:

# Label runs of consecutive rows sharing the same `val >= 2.5` outcome, then
# keep only above-threshold runs that contain more than one row per gene.
df %>%
  group_by(gene, grp = rleid(val >= 2.5)) %>%
  filter(val >= 2.5 & n() > 1) %>%
  ungroup() %>%
  # Drop the helper run id so only the input columns remain.
  select(-grp)

返回:

# A tibble: 2 x 6
  signal1 signal2 signal3 signal4 gene    val
    <int>   <int>   <int>   <int> <fct> <dbl>
1       2      12      22      32 ABC    3.20
2       3      13      23      33 ABC    7.90

我原以为是tidyverse屏蔽了dplyr的函数,于是在R中重新启动会话之后又测试了一次:

数据集:

# Ten-row example: integer signal columns, gene labels and val measurements.
signal1 <- 1:10
signal2 <- 11:20
signal3 <- 21:30
signal4 <- 31:40
val <- c(0.5, 3.2, 7.9, 8.1, 4.4, 0.1, 6.0, 0.2, 8.2, 12.1)
# tag is built alongside the other vectors but is not a column of df.
tag <- paste0("str", 1:10)
# Three rows each for ABC, DEF and GHI, followed by a single JKL row.
gene <- c(rep(c("ABC", "DEF", "GHI"), each = 3), "JKL")
df <- data.frame(signal1, signal2, signal3, signal4, gene, val)
df
   signal1 signal2 signal3 signal4 gene  val
1        1      11      21      31  ABC  0.5
2        2      12      22      32  ABC  3.2
3        3      13      23      33  ABC  7.9
4        4      14      24      34  DEF  8.1
5        5      15      25      35  DEF  4.4
6        6      16      26      36  DEF  0.1
7        7      17      27      37  GHI  6.0
8        8      18      28      38  GHI  0.2
9        9      19      29      39  GHI  8.2
10      10      20      30      40  JKL 12.1

获得的结果(Result):

# Keep rows that belong to a run of more than one consecutive row with
# val >= 2.5 inside the same gene.
df %>%
  group_by(gene, grp = rleid(val >= 2.5)) %>%
  filter(val >= 2.5, n() > 1) %>%
  ungroup() %>%
  # Drop the helper run id so only the input columns remain.
  select(-grp)

CORRECT

# A tibble: 4 x 6
  signal1 signal2 signal3 signal4 gene    val
    <int>   <int>   <int>   <int> <fct> <dbl>
1       2      12      22      32 ABC    3.20
2       3      13      23      33 ABC    7.90
3       4      14      24      34 DEF    8.10
4       5      15      25      35 DEF    4.40

通过功能获得的结果:

#' Keep rows that form a streak of consecutive above-threshold values
#'
#' Within each level of `grp1`, consecutive rows are labelled by run of
#' `grp2 >= threshold`. A row is kept only when it reaches the threshold and
#' its run contains more than one row.
#'
#' @param dat A data frame.
#' @param grp1 Unquoted name of the grouping column.
#' @param grp2 Unquoted name of the numeric column compared to `threshold`.
#' @param threshold Lower bound a value must reach; defaults to 2.5.
#' @return The qualifying rows of `dat`, ungrouped, without the helper
#'   column `grp`.
f1 <- function(dat, grp1, grp2, threshold = 2.5) {
  grp1 <- dplyr::enquo(grp1)
  grp2 <- dplyr::enquo(grp2)
  dat %>%
    dplyr::group_by(!! grp1) %>%
    # Parenthesise every unquote: depending on the rlang version,
    # `!! x >= y` can be read as `!! (x >= y)`, which yields one constant
    # run id and silently disables the streak check.
    dplyr::mutate(grp = data.table::rleid((!! grp2) >= (!! threshold))) %>%
    # Regroup by both keys rather than relying on group_by(add = TRUE),
    # whose argument name differs between dplyr versions.
    dplyr::group_by(!! grp1, grp) %>%
    # Compare the column passed as `grp2`, not a hard-coded `val`.
    # n() is left unqualified; it is resolved from the attached dplyr.
    dplyr::filter((!! grp2) >= (!! threshold), n() > 1) %>%
    dplyr::ungroup() %>%
    dplyr::select(-grp)
}

# A tibble: 6 x 6
  signal1 signal2 signal3 signal4 gene    val
    <int>   <int>   <int>   <int> <fct> <dbl>
1       2      12      22      32 ABC    3.20
2       3      13      23      33 ABC    7.90
3       4      14      24      34 DEF    8.10
4       5      15      25      35 DEF    4.40
5       7      17      27      37 GHI    6.00
6       9      19      29      39 GHI    8.20

不幸的是,这个结果不正确:GHI组中的行并不构成任何连续序列......

1 个答案:

答案 0 :(得分:2)

基于这些示例,我们创建一个函数来执行筛选(filter):

library(data.table)
library(dplyr)

#' Keep rows that form a streak of consecutive above-threshold values
#'
#' Within each level of `grp1`, consecutive rows are labelled by run of
#' `grp2 >= threshold`. A row is kept only when it reaches the threshold and
#' its run contains more than one row.
#'
#' @param dat A data frame.
#' @param grp1 Unquoted name of the grouping column.
#' @param grp2 Unquoted name of the numeric column compared to `threshold`.
#' @param threshold Lower bound a value must reach; defaults to 2.5.
#' @return The qualifying rows of `dat`, ungrouped, without the helper
#'   column `grp`.
f1 <- function(dat, grp1, grp2, threshold = 2.5) {
  grp1 <- enquo(grp1)
  grp2 <- enquo(grp2)
  dat %>%
    group_by(!! grp1) %>%
    # Parenthesise every unquote: depending on the rlang version,
    # `!! x >= y` can be read as `!! (x >= y)`, which yields one constant
    # run id and silently disables the streak check.
    mutate(grp = rleid((!! grp2) >= (!! threshold))) %>%
    # Regroup by both keys rather than relying on group_by(add = TRUE),
    # whose argument name differs between dplyr versions.
    group_by(!! grp1, grp) %>%
    # Compare the column passed as `grp2`, not a hard-coded `val`.
    filter((!! grp2) >= (!! threshold), n() > 1) %>%
    ungroup() %>%
    select(-grp)
}

-example I

f1(df1, gene, val)
# A tibble: 3 x 6
#  signal1 signal2 signal3 signal4 gene    val
#    <int>   <int>   <int>   <int> <chr> <dbl>
#1       1       7      13      19 ABC    2.50
#2       2       8      14      20 ABC    3.20
#3       3       9      15      21 ABC    2.90

-example II

f1(df2, gene, val)
# A tibble: 0 x 6
# ... with 6 variables: signal1 <int>, signal2 <int>, signal3 <int>, signal4 <int>, gene <chr>, val <dbl>

-example III

f1(df3, gene, val)
# A tibble: 4 x 6
#  signal1 signal2 signal3 signal4 gene    val
#    <int>   <int>   <int>   <int> <chr> <dbl>
#1       2       8      14      20 ABC    3.20
#2       3       9      15      21 ABC    2.90
#3       4      10      16      22 DEF    7.10
#4       5      11      17      23 DEF    4.40

-example IV

f1(df4, gene, val)
# A tibble: 7 x 6
# Groups: gene [2]
#  signal1 signal2 signal3 signal4 gene    val
#    <int>   <int>   <int>   <int> <chr> <dbl>
#1       2      12      22      32 ABC    3.20
#2       3      13      23      33 ABC    2.90
#3       4      14      24      34 ABC    7.10
#4       6      16      26      36 DEF    4.10
#5       7      17      27      37 DEF    6.20
#6       9      19      29      39 DEF    3.20
#7      10      20      30      40 DEF   12.1 

-example V

f1(df5, gene, val)
# A tibble: 2 x 6
#  signal1 signal2 signal3 signal4 gene    val
#    <int>   <int>   <int>   <int> <chr> <dbl>
#1       6      16      26      36 DEF    4.10
#2       7      17      27      37 DEF    6.20

-example VI

f1(df6, gene, val)
# A tibble: 2 x 6
#  signal1 signal2 signal3 signal4 gene    val
#    <int>   <int>   <int>   <int> <chr> <dbl>
#1       2      12      22      32 ABC    3.20
#2       3      13      23      33 ABC    7.90