如何在dplyr中为group_split返回的列表命名

时间:2019-07-19 07:38:19

标签: r dplyr tidyverse

我有以下使用dplyr的group_split的代码:

library(tidyverse)
# Fix the RNG seed so the five sampled rows are reproducible.
set.seed(1)

# Sample five rows of iris, group them by species, and split the result into
# an (unnamed) list with one tibble per species.
iris %>%
  sample_n(size = 5) %>%
  group_by(Species) %>%
  group_split()

结果是:

[[1]]
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
         <dbl>       <dbl>        <dbl>       <dbl> <fct>  
1          5           3.5          1.6         0.6 setosa 
2          5.1         3.8          1.5         0.3 setosa 

[[2]]
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
         <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          5.9         3            4.2         1.5 versicolor
2          6.2         2.2          4.5         1.5 versicolor

[[3]]
# A tibble: 1 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
         <dbl>       <dbl>        <dbl>       <dbl> <fct>    
1          6.2         3.4          5.4         2.3 virginica

我要实现的是按分组名称(即物种)命名此列表。 产生(手工完成):

$setosa
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
         <dbl>       <dbl>        <dbl>       <dbl> <fct>  
1          5           3.5          1.6         0.6 setosa 
2          5.1         3.8          1.5         0.3 setosa 

$versicolor
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
         <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          5.9         3            4.2         1.5 versicolor
2          6.2         2.2          4.5         1.5 versicolor

$virginica
# A tibble: 1 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
         <dbl>       <dbl>        <dbl>       <dbl> <fct>    
1          6.2         3.4          5.4         2.3 virginica

我该如何实现?

更新

我尝试了这个新数据,现在的命名为Cluster

# Example data: a 10-row tibble (class tbl_df) with one row per gene, holding
# its Cluster label, gene_name, p_value, morans_test_statistic, morans_I and
# q_value. Cluster is a character column whose values repeat across rows.
df <- structure(list(Cluster = c("Cluster9", "Cluster11", "Cluster1", 
"Cluster9", "Cluster6", "Cluster12", "Cluster9", "Cluster11", 
"Cluster8", "Cluster8"), gene_name = c("Tbc1d8", "Vimp", "Grhpr", 
"H1f0", "Zfp398", "Pikfyve", "Ankrd13a", "Fgfr1op2", "Golga7", 
"Lars2"), p_value = c(3.46629097620496e-47, 3.16837338947245e-62, 
1.55108439059684e-06, 9.46078511685542e-131, 0.000354049720507017, 
0.0146807415917158, 1.42799750295289e-38, 2.0697825959399e-08, 
4.13777221466668e-06, 3.92889640704683e-184), morans_test_statistic = c(14.3797687352223, 
16.6057085487911, 4.66393667525872, 24.301453902967, 3.38642377758137, 
2.17859882998961, 12.9350063459509, 5.48479186018979, 4.4579286289179, 
28.9144540271157), morans_I = c(0.0814728893885783, 0.0947505609609695, 
0.0260671534007409, 0.138921824574569, 0.018764800166045, 0.0119813199210325, 
0.0736554862590782, 0.0309849638728409, 0.0250591347318986, 0.165310420808725
), q_value = c(1.57917584337356e-46, 1.62106594498462e-61, 3.43312171446844e-06, 
6.99503520654745e-130, 0.000683559649593623, 0.0245476826213791, 
5.96116678335584e-38, 4.97603701391971e-08, 8.9649490080526e-06, 
3.48152096326702e-183)), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))

使用罗纳克·沙(Ronak Shah)的方法时,结果不一致:

# Reproduces the problem: the names come from unique(df$Cluster), i.e. in order
# of first appearance (Cluster9, Cluster11, Cluster1, ...), while the printed
# pieces come back in a different order, so labels and contents do not match.
df %>% group_split(Cluster) %>% setNames(unique(df$Cluster))
$Cluster9
# A tibble: 1 x 6
  Cluster  gene_name    p_value morans_test_statistic morans_I    q_value
  <chr>    <chr>          <dbl>                 <dbl>    <dbl>      <dbl>
1 Cluster1 Grhpr     0.00000155                  4.66   0.0261 0.00000343

$Cluster11
# A tibble: 2 x 6
  Cluster   gene_name  p_value morans_test_statistic morans_I  q_value
  <chr>     <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
1 Cluster11 Vimp      3.17e-62                 16.6    0.0948 1.62e-61
2 Cluster11 Fgfr1op2  2.07e- 8                  5.48   0.0310 4.98e- 8

$Cluster1
# A tibble: 1 x 6
  Cluster   gene_name p_value morans_test_statistic morans_I q_value
  <chr>     <chr>       <dbl>                 <dbl>    <dbl>   <dbl>
1 Cluster12 Pikfyve    0.0147                  2.18   0.0120  0.0245

$Cluster6
# A tibble: 1 x 6
  Cluster  gene_name  p_value morans_test_statistic morans_I  q_value
  <chr>    <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
1 Cluster6 Zfp398    0.000354                  3.39   0.0188 0.000684

$Cluster12
# A tibble: 2 x 6
  Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
  <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
1 Cluster8 Golga7    4.14e-  6                  4.46   0.0251 8.96e-  6
2 Cluster8 Lars2     3.93e-184                 28.9    0.165  3.48e-183

$Cluster8
# A tibble: 3 x 6
  Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
  <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
1 Cluster9 Tbc1d8    3.47e- 47                  14.4   0.0815 1.58e- 46
2 Cluster9 H1f0      9.46e-131                  24.3   0.139  7.00e-130
3 Cluster9 Ankrd13a  1.43e- 38                  12.9   0.0737 5.96e- 38

请注意,$Cluster9中包含Cluster1

请建议如何处理?

7 个答案:

答案 0 :(得分:4)

我遇到了同样的问题,并使用了以下两步解决方案:

# Step 1: group first, so that the group keys can be read back from `df`.
df <- df %>% group_by(Cluster)
# Step 2: split by the existing grouping and name each piece after its key.
# group_keys() returns one row per group; taking its Cluster column gives a
# plain character vector of names. `df` on the right-hand side is still the
# grouped tibble from step 1; it is only overwritten with the named list once
# the whole expression has been evaluated.
df <- df %>% group_split() %>% set_names(group_keys(df)$Cluster)
df$Cluster1
# A tibble: 1 x 6
  Cluster  gene_name    p_value morans_test_statistic morans_I    q_value
  <chr>    <chr>          <dbl>                 <dbl>    <dbl>      <dbl>
1 Cluster1 Grhpr     0.00000155                  4.66   0.0261 0.00000343
df$Cluster9
# A tibble: 3 x 6
  Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
  <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
1 Cluster9 Tbc1d8    3.47e- 47                  14.4   0.0815 1.58e- 46
2 Cluster9 H1f0      9.46e-131                  24.3   0.139  7.00e-130
3 Cluster9 Ankrd13a  1.43e- 38                  12.9   0.0737 5.96e- 38

答案 1 :(得分:4)

很多好的答案。你也可以这样做:

# Draw five random rows, then let base::split() build a list whose elements
# are named after the species they contain.
iris %>%
  sample_n(size = 5) %>%
  split(x = ., f = as.factor(.$Species))

哪个会给你:

$setosa
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
4          5.5         3.5          1.3         0.2  setosa
5          5.3         3.7          1.5         0.2  setosa

$versicolor
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
3            5         2.3          3.3           1 versicolor

$virginica
  Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
1          7.7         2.6          6.9         2.3 virginica
2          7.2         3.0          5.8         1.6 virginica

也适用于上面的数据框:

# Same idea for the Cluster data: split() names each element after its
# Cluster value.
split(df, f = as.factor(df$Cluster))

给你:

$Cluster1
# A tibble: 1 x 6
  Cluster  gene_name    p_value morans_test_statistic morans_I    q_value
  <chr>    <chr>          <dbl>                 <dbl>    <dbl>      <dbl>
1 Cluster1 Grhpr     0.00000155                  4.66   0.0261 0.00000343

$Cluster11
# A tibble: 2 x 6
  Cluster   gene_name  p_value morans_test_statistic morans_I  q_value
  <chr>     <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
1 Cluster11 Vimp      3.17e-62                 16.6    0.0948 1.62e-61
2 Cluster11 Fgfr1op2  2.07e- 8                  5.48   0.0310 4.98e- 8

$Cluster12
# A tibble: 1 x 6
  Cluster   gene_name p_value morans_test_statistic morans_I q_value
  <chr>     <chr>       <dbl>                 <dbl>    <dbl>   <dbl>
1 Cluster12 Pikfyve    0.0147                  2.18   0.0120  0.0245

$Cluster6
# A tibble: 1 x 6
  Cluster  gene_name  p_value morans_test_statistic morans_I  q_value
  <chr>    <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
1 Cluster6 Zfp398    0.000354                  3.39   0.0188 0.000684

$Cluster8
# A tibble: 2 x 6
  Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
  <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
1 Cluster8 Golga7    4.14e-  6                  4.46   0.0251 8.96e-  6
2 Cluster8 Lars2     3.93e-184                 28.9    0.165  3.48e-183

$Cluster9
# A tibble: 3 x 6
  Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
  <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
1 Cluster9 Tbc1d8    3.47e- 47                  14.4   0.0815 1.58e- 46
2 Cluster9 H1f0      9.46e-131                  24.3   0.139  7.00e-130
3 Cluster9 Ankrd13a  1.43e- 38                  12.9   0.0737 5.96e- 38

答案 2 :(得分:1)

不确定,是否可以直接进行。一种方法是对数据框进行采样,然后将其unique名称用作setNames

library(dplyr)

# Keep the sample in a variable: its Species column is needed again for naming.
df <- sample_n(iris, size = 5)

# Split into one tibble per species and label the pieces with the species
# values in order of first appearance.
# NOTE(review): unique() follows order of appearance while group_split() uses
# its own group order; the names are only correct when the two orders agree.
setNames(group_split(df, Species), unique(df$Species))


#$setosa
# A tibble: 1 x 5
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#         <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#1            5         3.4          1.5         0.2 setosa 

#$versicolor
# A tibble: 1 x 5
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
#         <dbl>       <dbl>        <dbl>       <dbl> <fct>     
#1            6         3.4          4.5         1.6 versicolor

#$virginica
# A tibble: 3 x 5
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
#         <dbl>       <dbl>        <dbl>       <dbl> <fct>    
#1          7.3         2.9          6.3         1.8 virginica
#2          6.9         3.1          5.1         2.3 virginica
#3          7.7         3            6.1         2.3 virginica

奇怪的是,group_split没有直接为列表命名,尽管它被视为base::split的替代品。

# Base R equivalent: the resulting list is named after the Species values.
split(df, f = df$Species)

文档说:

  

group_split()的工作方式类似于base :: split(),但

  • 它使用group_by()中的分组结构,因此要遵守数据掩码
  • 它没有基于分组来命名列表的元素,因为这通常会丢失信息并且令人困惑。

对于更新后的数据集,这种方法不起作用:命名时我们使用的是unique,它按值首次出现的顺序返回结果,而group_split则按分组值的递增顺序拆分数据(因此拆分顺序为Cluster1、Cluster11、Cluster12……)。解决该问题的一种方法是将Cluster转换为factor,并将levels指定为unique返回的顺序(代码见下文)。

或者,如果您不希望它们成为因子,也可以直接用sort(unique(df$Cluster))命名,使名称顺序与拆分顺序一致。

转换为factor的写法如下:

# Turn Cluster into a factor whose levels follow order of first appearance,
# so that the group order used by group_split() matches unique(df$Cluster).
df <- mutate(df, Cluster = factor(Cluster, levels = unique(Cluster)))

# Split by Cluster and name the pieces; both now use the same order.
setNames(group_split(df, Cluster), unique(df$Cluster))

答案 3 :(得分:1)

使用Extensions and Updates循环访问每个df中Cluster的唯一元素,然后将它们分配为各自的名称。

/**
 * Convert a 12-hour clock string such as "01:02 PM" into 24-hour "H:MM" form.
 *
 * The AM/PM marker is matched case-insensitively, so "05:06 pm" and
 * "05:06 PM" give the same result. The hour keeps whatever zero padding the
 * input had, except that 12 AM is rendered as "00".
 *
 * @param {string} time12h - Time with an AM/PM marker, e.g. "12:00 AM".
 * @returns {string} The time on a 24-hour clock, e.g. "00:00" or "13:02".
 * @throws {TypeError} If the input does not contain a recognizable time.
 */
const convertTime12to24 = (time12h) => {
  const parsed = /(\d?\d:\d\d)\s*(\w{2})/i.exec(time12h);
  if (parsed === null) {
    // Fail with an explicit message instead of an opaque destructuring error.
    throw new TypeError(`Unrecognized 12-hour time: ${time12h}`);
  }

  const [, time, modifier] = parsed;
  let [hours, minutes] = time.split(':');

  // 12 AM is midnight (00) and 12 PM is noon (00 + 12), so reset 12 first.
  if (hours === '12') {
    hours = '00';
  }

  // The regex is case-insensitive, so the marker comparison must be as well.
  if (modifier.toUpperCase() === 'PM') {
    hours = parseInt(hours, 10) + 12;
  }

  return `${hours}:${minutes}`;
};

    console.log(convertTime12to24('01:02 PM'));
    console.log(convertTime12to24('05:06 PM'));
    console.log(convertTime12to24('12:00 PM'));
    console.log(convertTime12to24('12:00 AM'));

答案 4 :(得分:1)

开发人员已经明确表示,他们不希望提供返回命名列表的选项。再次,我想提出一个功能请求,但旧问题已锁定here

我想出的一个技巧就是将赋值运算符放在管道中:

library(tidyverse)
# Sample five rows, split them by species, and name the list without leaving
# the pipe.
iris %>% 
  sample_n(size = 5) %>% 
  group_split(Species, .keep = TRUE) %>%
  # `names<-` is the replacement form of names(). Each element is named after
  # the first Species value it contains, which is why .keep = TRUE is needed
  # above. NOTE(review): the braces around `.` presumably stop magrittr from
  # treating `. %>% ...` as a function sequence -- confirm against its docs.
  `names<-`({.} %>% map(~ .x$Species[1]) %>% unlist()) %>%
  ## If you want to discard the grouping variable, do the following step as well
  map(~ .x %>% select(-Species))

这不是最直观、最容易记住的答案,但它可以把整个操作整洁地保留在管道中。

答案 5 :(得分:0)

如果您想将数据框拆分为多个组并具有命名列表,tidytable 包具有用于此目的的 group_split.() 函数。

### pacman will check and install missing packages if needed
# requireNamespace() only tests whether pacman is installed; it does not attach
# the package, which is all that is needed because every call below is
# namespace-qualified with pacman::.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(gapminder)
pacman::p_load(tidytable)

按一个分组变量拆分。使用 .keep 选项决定是否在拆分后的数据框中保留分组列(下例中 .keep = FALSE,即不保留):

# Split gapminder by continent into a list named after each continent
# (.named = TRUE); .keep = FALSE leaves the continent column out of the pieces.
gapminder_split_1group <- group_split.(
  gapminder,
  continent,
  .keep = FALSE,
  .named = TRUE
)
gapminder_split_1group
#> $Asia
#> # A tidytable: 396 x 5
#>    country      year lifeExp      pop gdpPercap
#>    <fct>       <int>   <dbl>    <int>     <dbl>
#>  1 Afghanistan  1952    28.8  8425333      779.
#>  2 Afghanistan  1957    30.3  9240934      821.
#>  3 Afghanistan  1962    32.0 10267083      853.
#>  4 Afghanistan  1967    34.0 11537966      836.
#>  5 Afghanistan  1972    36.1 13079460      740.
#>  6 Afghanistan  1977    38.4 14880372      786.
#>  7 Afghanistan  1982    39.9 12881816      978.
#>  8 Afghanistan  1987    40.8 13867957      852.
#>  9 Afghanistan  1992    41.7 16317921      649.
#> 10 Afghanistan  1997    41.8 22227415      635.
#> # ... with 386 more rows
#> 
#> $Europe
#> # A tidytable: 360 x 5
#>    country  year lifeExp     pop gdpPercap
#>    <fct>   <int>   <dbl>   <int>     <dbl>
#>  1 Albania  1952    55.2 1282697     1601.
#>  2 Albania  1957    59.3 1476505     1942.
#>  3 Albania  1962    64.8 1728137     2313.
#>  4 Albania  1967    66.2 1984060     2760.
#>  5 Albania  1972    67.7 2263554     3313.
#>  6 Albania  1977    68.9 2509048     3533.
#>  7 Albania  1982    70.4 2780097     3631.
#>  8 Albania  1987    72   3075321     3739.
#>  9 Albania  1992    71.6 3326498     2497.
#> 10 Albania  1997    73.0 3428038     3193.
#> # ... with 350 more rows
#> 
#> $Africa
#> # A tidytable: 624 x 5
#>    country  year lifeExp      pop gdpPercap
#>    <fct>   <int>   <dbl>    <int>     <dbl>
#>  1 Algeria  1952    43.1  9279525     2449.
#>  2 Algeria  1957    45.7 10270856     3014.
#>  3 Algeria  1962    48.3 11000948     2551.
#>  4 Algeria  1967    51.4 12760499     3247.
#>  5 Algeria  1972    54.5 14760787     4183.
#>  6 Algeria  1977    58.0 17152804     4910.
#>  7 Algeria  1982    61.4 20033753     5745.
#>  8 Algeria  1987    65.8 23254956     5681.
#>  9 Algeria  1992    67.7 26298373     5023.
#> 10 Algeria  1997    69.2 29072015     4797.
#> # ... with 614 more rows
#> 
#> $Americas
#> # A tidytable: 300 x 5
#>    country    year lifeExp      pop gdpPercap
#>    <fct>     <int>   <dbl>    <int>     <dbl>
#>  1 Argentina  1952    62.5 17876956     5911.
#>  2 Argentina  1957    64.4 19610538     6857.
#>  3 Argentina  1962    65.1 21283783     7133.
#>  4 Argentina  1967    65.6 22934225     8053.
#>  5 Argentina  1972    67.1 24779799     9443.
#>  6 Argentina  1977    68.5 26983828    10079.
#>  7 Argentina  1982    69.9 29341374     8998.
#>  8 Argentina  1987    70.8 31620918     9140.
#>  9 Argentina  1992    71.9 33958947     9308.
#> 10 Argentina  1997    73.3 36203463    10967.
#> # ... with 290 more rows
#> 
#> $Oceania
#> # A tidytable: 24 x 5
#>    country    year lifeExp      pop gdpPercap
#>    <fct>     <int>   <dbl>    <int>     <dbl>
#>  1 Australia  1952    69.1  8691212    10040.
#>  2 Australia  1957    70.3  9712569    10950.
#>  3 Australia  1962    70.9 10794968    12217.
#>  4 Australia  1967    71.1 11872264    14526.
#>  5 Australia  1972    71.9 13177000    16789.
#>  6 Australia  1977    73.5 14074100    18334.
#>  7 Australia  1982    74.7 15184200    19477.
#>  8 Australia  1987    76.3 16257249    21889.
#>  9 Australia  1992    77.6 17481977    23425.
#> 10 Australia  1997    78.8 18565243    26998.
#> # ... with 14 more rows

分成两组

# Split by continent and country together; each element is named
# "<continent>.<country>" and neither grouping column is kept in the pieces.
gapminder_split_2group <- group_split.(
  gapminder,
  continent, country,
  .keep = FALSE,
  .named = TRUE
)
head(gapminder_split_2group)
#> $Asia.Afghanistan
#> # A tidytable: 12 x 4
#>     year lifeExp      pop gdpPercap
#>    <int>   <dbl>    <int>     <dbl>
#>  1  1952    28.8  8425333      779.
#>  2  1957    30.3  9240934      821.
#>  3  1962    32.0 10267083      853.
#>  4  1967    34.0 11537966      836.
#>  5  1972    36.1 13079460      740.
#>  6  1977    38.4 14880372      786.
#>  7  1982    39.9 12881816      978.
#>  8  1987    40.8 13867957      852.
#>  9  1992    41.7 16317921      649.
#> 10  1997    41.8 22227415      635.
#> 11  2002    42.1 25268405      727.
#> 12  2007    43.8 31889923      975.
#> 
#> $Europe.Albania
#> # A tidytable: 12 x 4
#>     year lifeExp     pop gdpPercap
#>    <int>   <dbl>   <int>     <dbl>
#>  1  1952    55.2 1282697     1601.
#>  2  1957    59.3 1476505     1942.
#>  3  1962    64.8 1728137     2313.
#>  4  1967    66.2 1984060     2760.
#>  5  1972    67.7 2263554     3313.
#>  6  1977    68.9 2509048     3533.
#>  7  1982    70.4 2780097     3631.
#>  8  1987    72   3075321     3739.
#>  9  1992    71.6 3326498     2497.
#> 10  1997    73.0 3428038     3193.
#> 11  2002    75.7 3508512     4604.
#> 12  2007    76.4 3600523     5937.
#> 
#> $Africa.Algeria
#> # A tidytable: 12 x 4
#>     year lifeExp      pop gdpPercap
#>    <int>   <dbl>    <int>     <dbl>
#>  1  1952    43.1  9279525     2449.
#>  2  1957    45.7 10270856     3014.
#>  3  1962    48.3 11000948     2551.
#>  4  1967    51.4 12760499     3247.
#>  5  1972    54.5 14760787     4183.
#>  6  1977    58.0 17152804     4910.
#>  7  1982    61.4 20033753     5745.
#>  8  1987    65.8 23254956     5681.
#>  9  1992    67.7 26298373     5023.
#> 10  1997    69.2 29072015     4797.
#> 11  2002    71.0 31287142     5288.
#> 12  2007    72.3 33333216     6223.
#> 
#> $Africa.Angola
#> # A tidytable: 12 x 4
#>     year lifeExp      pop gdpPercap
#>    <int>   <dbl>    <int>     <dbl>
#>  1  1952    30.0  4232095     3521.
#>  2  1957    32.0  4561361     3828.
#>  3  1962    34    4826015     4269.
#>  4  1967    36.0  5247469     5523.
#>  5  1972    37.9  5894858     5473.
#>  6  1977    39.5  6162675     3009.
#>  7  1982    39.9  7016384     2757.
#>  8  1987    39.9  7874230     2430.
#>  9  1992    40.6  8735988     2628.
#> 10  1997    41.0  9875024     2277.
#> 11  2002    41.0 10866106     2773.
#> 12  2007    42.7 12420476     4797.
#> 
#> $Americas.Argentina
#> # A tidytable: 12 x 4
#>     year lifeExp      pop gdpPercap
#>    <int>   <dbl>    <int>     <dbl>
#>  1  1952    62.5 17876956     5911.
#>  2  1957    64.4 19610538     6857.
#>  3  1962    65.1 21283783     7133.
#>  4  1967    65.6 22934225     8053.
#>  5  1972    67.1 24779799     9443.
#>  6  1977    68.5 26983828    10079.
#>  7  1982    69.9 29341374     8998.
#>  8  1987    70.8 31620918     9140.
#>  9  1992    71.9 33958947     9308.
#> 10  1997    73.3 36203463    10967.
#> 11  2002    74.3 38331121     8798.
#> 12  2007    75.3 40301927    12779.
#> 
#> $Oceania.Australia
#> # A tidytable: 12 x 4
#>     year lifeExp      pop gdpPercap
#>    <int>   <dbl>    <int>     <dbl>
#>  1  1952    69.1  8691212    10040.
#>  2  1957    70.3  9712569    10950.
#>  3  1962    70.9 10794968    12217.
#>  4  1967    71.1 11872264    14526.
#>  5  1972    71.9 13177000    16789.
#>  6  1977    73.5 14074100    18334.
#>  7  1982    74.7 15184200    19477.
#>  8  1987    76.3 16257249    21889.
#>  9  1992    77.6 17481977    23425.
#> 10  1997    78.8 18565243    26998.
#> 11  2002    80.4 19546792    30688.
#> 12  2007    81.2 20434176    34435.

reprex package (v2.0.0) 于 2021 年 4 月 15 日创建

答案 6 :(得分:0)

一个可选的附加解决方案来摆脱额外的列:

# Split five sampled rows by species with base::split() (which names the
# list), then remove the now-redundant Species column from every piece.
iris %>%
  sample_n(size = 5) %>%
  split(.$Species) %>%
  map(~ select(.x, -Species))