我有以下使用 dplyr 的 group_split 的代码:
library(tidyverse)
# Fix the RNG seed so the same five rows are sampled on every run.
set.seed(1)
# Sample five rows of iris and split them into one tibble per species.
iris %>%
  sample_n(size = 5) %>%
  group_split(Species)
结果是:
[[1]]
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5 3.5 1.6 0.6 setosa
2 5.1 3.8 1.5 0.3 setosa
[[2]]
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.9 3 4.2 1.5 versicolor
2 6.2 2.2 4.5 1.5 versicolor
[[3]]
# A tibble: 1 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 6.2 3.4 5.4 2.3 virginica
我要实现的是按分组名称(即物种)命名此列表。 产生(手工完成):
$setosa
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5 3.5 1.6 0.6 setosa
2 5.1 3.8 1.5 0.3 setosa
$versicolor
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.9 3 4.2 1.5 versicolor
2 6.2 2.2 4.5 1.5 versicolor
$virginica
# A tibble: 1 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 6.2 3.4 5.4 2.3 virginica
我该如何实现?
更新
我尝试了这个新数据,现在的命名为Cluster
:
# Example data written as a structure() call: a 10-row tibble (class tbl_df)
# with the columns Cluster, gene_name, p_value, morans_test_statistic,
# morans_I and q_value. Cluster is a character column whose values are not in
# sorted order.
df <- structure(list(Cluster = c("Cluster9", "Cluster11", "Cluster1",
"Cluster9", "Cluster6", "Cluster12", "Cluster9", "Cluster11",
"Cluster8", "Cluster8"), gene_name = c("Tbc1d8", "Vimp", "Grhpr",
"H1f0", "Zfp398", "Pikfyve", "Ankrd13a", "Fgfr1op2", "Golga7",
"Lars2"), p_value = c(3.46629097620496e-47, 3.16837338947245e-62,
1.55108439059684e-06, 9.46078511685542e-131, 0.000354049720507017,
0.0146807415917158, 1.42799750295289e-38, 2.0697825959399e-08,
4.13777221466668e-06, 3.92889640704683e-184), morans_test_statistic = c(14.3797687352223,
16.6057085487911, 4.66393667525872, 24.301453902967, 3.38642377758137,
2.17859882998961, 12.9350063459509, 5.48479186018979, 4.4579286289179,
28.9144540271157), morans_I = c(0.0814728893885783, 0.0947505609609695,
0.0260671534007409, 0.138921824574569, 0.018764800166045, 0.0119813199210325,
0.0736554862590782, 0.0309849638728409, 0.0250591347318986, 0.165310420808725
), q_value = c(1.57917584337356e-46, 1.62106594498462e-61, 3.43312171446844e-06,
6.99503520654745e-130, 0.000683559649593623, 0.0245476826213791,
5.96116678335584e-38, 4.97603701391971e-08, 8.9649490080526e-06,
3.48152096326702e-183)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
使用罗纳克·沙(Ronak Shah)的方法时,结果不一致:
# NOTE: unique() returns the cluster names in order of first appearance,
# whereas group_split() returns the groups sorted by key, so the names and the
# contents can end up mismatched (as the output below shows).
df %>% group_split(Cluster) %>% setNames(unique(df$Cluster))
$Cluster9
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster1 Grhpr 0.00000155 4.66 0.0261 0.00000343
$Cluster11
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster11 Vimp 3.17e-62 16.6 0.0948 1.62e-61
2 Cluster11 Fgfr1op2 2.07e- 8 5.48 0.0310 4.98e- 8
$Cluster1
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster12 Pikfyve 0.0147 2.18 0.0120 0.0245
$Cluster6
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster6 Zfp398 0.000354 3.39 0.0188 0.000684
$Cluster12
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster8 Golga7 4.14e- 6 4.46 0.0251 8.96e- 6
2 Cluster8 Lars2 3.93e-184 28.9 0.165 3.48e-183
$Cluster8
# A tibble: 3 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster9 Tbc1d8 3.47e- 47 14.4 0.0815 1.58e- 46
2 Cluster9 H1f0 9.46e-131 24.3 0.139 7.00e-130
3 Cluster9 Ankrd13a 1.43e- 38 12.9 0.0737 5.96e- 38
请注意,$Cluster9
中包含Cluster1
。
请建议如何处理?
答案 0 :(得分:4)
我遇到了同样的问题,并使用了以下两步解决方案:
# Step 1: group by Cluster so that the split and the keys share one grouping.
df <- group_by(df, Cluster)
# Step 2: split into a list and name each element with its group key; both
# group_split() and group_keys() read the same grouped data, so the order of
# the names matches the order of the pieces.
df <- set_names(group_split(df), unlist(group_keys(df)))
df$Cluster1
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster1 Grhpr 0.00000155 4.66 0.0261 0.00000343
df$Cluster9
# A tibble: 3 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster9 Tbc1d8 3.47e- 47 14.4 0.0815 1.58e- 46
2 Cluster9 H1f0 9.46e-131 24.3 0.139 7.00e-130
3 Cluster9 Ankrd13a 1.43e- 38 12.9 0.0737 5.96e- 38
答案 1 :(得分:4)
很多好的答案。你也可以这样做:
# Base split() names the list elements after the levels of the factor `f`.
iris %>%
  sample_n(size = 5) %>%
  split(f = as.factor(.[["Species"]]))
哪个会给你:
$setosa
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
4 5.5 3.5 1.3 0.2 setosa
5 5.3 3.7 1.5 0.2 setosa
$versicolor
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
3 5 2.3 3.3 1 versicolor
$virginica
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 7.7 2.6 6.9 2.3 virginica
2 7.2 3.0 5.8 1.6 virginica
也适用于上面的数据框:
# Same idea for the cluster data: split on Cluster coerced to a factor, which
# yields a list named by cluster.
split(df, f = as.factor(df$Cluster))
给你:
$Cluster1
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster1 Grhpr 0.00000155 4.66 0.0261 0.00000343
$Cluster11
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster11 Vimp 3.17e-62 16.6 0.0948 1.62e-61
2 Cluster11 Fgfr1op2 2.07e- 8 5.48 0.0310 4.98e- 8
$Cluster12
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster12 Pikfyve 0.0147 2.18 0.0120 0.0245
$Cluster6
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster6 Zfp398 0.000354 3.39 0.0188 0.000684
$Cluster8
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster8 Golga7 4.14e- 6 4.46 0.0251 8.96e- 6
2 Cluster8 Lars2 3.93e-184 28.9 0.165 3.48e-183
$Cluster9
# A tibble: 3 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster9 Tbc1d8 3.47e- 47 14.4 0.0815 1.58e- 46
2 Cluster9 H1f0 9.46e-131 24.3 0.139 7.00e-130
3 Cluster9 Ankrd13a 1.43e- 38 12.9 0.0737 5.96e- 38
答案 2 :(得分:1)
不确定,是否可以直接进行。一种方法是对数据框进行采样,然后将其unique
名称用作setNames
。
library(dplyr)
# Sample first and keep the result, so the names can be derived from exactly
# the rows that are split.
df <- iris %>% sample_n(size = 5)
# group_split() returns the groups in factor-level order, while unique() alone
# returns values in order of first appearance in the shuffled sample. Sorting
# the unique values (sort() on a factor follows the level order) makes the
# names line up with the pieces.
df %>%
  group_split(Species) %>%
  setNames(sort(unique(df$Species)))
#$setosa
# A tibble: 1 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
#1 5 3.4 1.5 0.2 setosa
#$versicolor
# A tibble: 1 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
#1 6 3.4 4.5 1.6 versicolor
#$virginica
# A tibble: 3 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
#1 7.3 2.9 6.3 1.8 virginica
#2 6.9 3.1 5.1 2.3 virginica
#3 7.7 3 6.1 2.3 virginica
奇怪的是,group_split
并没有直接给列表命名,尽管它被认为是base::split
的替代方案。
# Base R equivalent: the resulting list is named by the levels of Species.
split(df, f = df[["Species"]])
文档说:
group_split()的工作方式类似于base::split(),但它不会根据分组来命名列表中的元素。
对于更新的数据集,它不起作用,因为命名时我们使用的是unique,它按各个值首次出现的顺序返回数据,而group_split则按分组值的递增顺序对数据进行拆分。(因此,拆分顺序为Cluster1,Cluster11,Cluster2...。)
如果您不希望它们成为因子,可以不做转换,在命名时改用sort(unique(df$Cluster)),使名称的顺序与拆分顺序一致。
另一种解决方法是将Cluster转换为factor,并把levels指定为使用unique时得到的出现顺序,代码如下:
# Turn Cluster into a factor whose levels follow the order of first
# appearance, so that group_split() emits the groups in that same order.
df <- mutate(df, Cluster = factor(Cluster, levels = unique(Cluster)))
# unique() on the factor now yields the values in level order, which matches
# the order of the split pieces.
setNames(group_split(df, Cluster), unique(df$Cluster))
答案 3 :(得分:1)
使用Extensions and Updates
循环访问每个df中Cluster的唯一元素,然后将它们分配为各自的名称。
/**
 * Convert a 12-hour clock string such as "01:02 PM" to 24-hour "hours:minutes".
 *
 * @param {string} time12h - Text containing "h:mm" or "hh:mm" followed by a
 *   two-letter AM/PM marker; the marker may be in any letter case.
 * @returns {string} The time on a 24-hour clock, e.g. "13:02".
 * @throws {TypeError} If the input contains no time followed by a marker.
 */
const convertTime12to24 = (time12h) => {
  const match = time12h.match(/(\d?\d:\d\d)\s*(\w{2})/i);
  if (match === null) {
    // Fail with a readable message instead of an opaque destructuring error.
    throw new TypeError(`Cannot parse 12-hour time: ${time12h}`);
  }
  const [, time, modifier] = match;
  let [hours, minutes] = time.split(':');
  // 12 AM is hour 0 and 12 PM is hour 12, so reset 12 before adding the PM offset.
  if (hours === '12') {
    hours = '00';
  }
  // The regex accepts the marker case-insensitively, so compare it the same way.
  if (modifier.toUpperCase() === 'PM') {
    hours = parseInt(hours, 10) + 12;
  }
  return `${hours}:${minutes}`;
};
// Demo calls; the expected output is shown after each call.
console.log(convertTime12to24('01:02 PM')); // 13:02
console.log(convertTime12to24('05:06 PM')); // 17:06
console.log(convertTime12to24('12:00 PM')); // 12:00
console.log(convertTime12to24('12:00 AM')); // 00:00
答案 4 :(得分:1)
开发人员已经明确表示,他们不希望提供返回命名列表的选项。再次,我想提出一个功能请求,但旧问题已锁定here。
我想出的一个技巧就是将赋值运算符放在管道中:
library(tidyverse)
# Split a random sample of iris by Species and name the pieces inside one pipe.
iris %>%
sample_n(size = 5) %>%
# .keep = TRUE leaves the Species column in every piece so it can be read below.
group_split(Species, .keep = TRUE) %>%
# Call the replacement function `names<-` directly: the piped list is its first
# argument and the new names are the first Species value of each piece.
# NOTE(review): the braces around `.` appear to be needed so that magrittr
# evaluates the inner pipe on the list instead of building a function from
# `. %>% ...` -- confirm against the magrittr documentation.
`names<-`({.} %>% map(~ .x$Species[1]) %>% unlist()) %>%
## If you want to discard the grouping variable, do the following step as well
map(~ .x %>% select(-Species))
这不是最直观、最容易记住的答案,但它能让整个过程整洁地保留在管道中。
答案 5 :(得分:0)
如果您想将数据框拆分为多个组并具有命名列表,tidytable
包具有用于此目的的 group_split.()
函数。
### pacman will check and install missing packages if needed.
### requireNamespace() only tests whether pacman is installed; unlike
### require() it does not attach the package, which is enough here because
### every call below is namespace-qualified (pacman::).
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(gapminder)
pacman::p_load(tidytable)
按一个分组变量进行拆分。使用选项 .keep
# Split gapminder by continent; .named = TRUE names each list element after
# its group and .keep = FALSE drops the grouping column from the pieces.
gapminder_split_1group <- group_split.(
  gapminder,
  continent,
  .keep = FALSE,
  .named = TRUE
)
print(gapminder_split_1group)
#> $Asia
#> # A tidytable: 396 x 5
#> country year lifeExp pop gdpPercap
#> <fct> <int> <dbl> <int> <dbl>
#> 1 Afghanistan 1952 28.8 8425333 779.
#> 2 Afghanistan 1957 30.3 9240934 821.
#> 3 Afghanistan 1962 32.0 10267083 853.
#> 4 Afghanistan 1967 34.0 11537966 836.
#> 5 Afghanistan 1972 36.1 13079460 740.
#> 6 Afghanistan 1977 38.4 14880372 786.
#> 7 Afghanistan 1982 39.9 12881816 978.
#> 8 Afghanistan 1987 40.8 13867957 852.
#> 9 Afghanistan 1992 41.7 16317921 649.
#> 10 Afghanistan 1997 41.8 22227415 635.
#> # ... with 386 more rows
#>
#> $Europe
#> # A tidytable: 360 x 5
#> country year lifeExp pop gdpPercap
#> <fct> <int> <dbl> <int> <dbl>
#> 1 Albania 1952 55.2 1282697 1601.
#> 2 Albania 1957 59.3 1476505 1942.
#> 3 Albania 1962 64.8 1728137 2313.
#> 4 Albania 1967 66.2 1984060 2760.
#> 5 Albania 1972 67.7 2263554 3313.
#> 6 Albania 1977 68.9 2509048 3533.
#> 7 Albania 1982 70.4 2780097 3631.
#> 8 Albania 1987 72 3075321 3739.
#> 9 Albania 1992 71.6 3326498 2497.
#> 10 Albania 1997 73.0 3428038 3193.
#> # ... with 350 more rows
#>
#> $Africa
#> # A tidytable: 624 x 5
#> country year lifeExp pop gdpPercap
#> <fct> <int> <dbl> <int> <dbl>
#> 1 Algeria 1952 43.1 9279525 2449.
#> 2 Algeria 1957 45.7 10270856 3014.
#> 3 Algeria 1962 48.3 11000948 2551.
#> 4 Algeria 1967 51.4 12760499 3247.
#> 5 Algeria 1972 54.5 14760787 4183.
#> 6 Algeria 1977 58.0 17152804 4910.
#> 7 Algeria 1982 61.4 20033753 5745.
#> 8 Algeria 1987 65.8 23254956 5681.
#> 9 Algeria 1992 67.7 26298373 5023.
#> 10 Algeria 1997 69.2 29072015 4797.
#> # ... with 614 more rows
#>
#> $Americas
#> # A tidytable: 300 x 5
#> country year lifeExp pop gdpPercap
#> <fct> <int> <dbl> <int> <dbl>
#> 1 Argentina 1952 62.5 17876956 5911.
#> 2 Argentina 1957 64.4 19610538 6857.
#> 3 Argentina 1962 65.1 21283783 7133.
#> 4 Argentina 1967 65.6 22934225 8053.
#> 5 Argentina 1972 67.1 24779799 9443.
#> 6 Argentina 1977 68.5 26983828 10079.
#> 7 Argentina 1982 69.9 29341374 8998.
#> 8 Argentina 1987 70.8 31620918 9140.
#> 9 Argentina 1992 71.9 33958947 9308.
#> 10 Argentina 1997 73.3 36203463 10967.
#> # ... with 290 more rows
#>
#> $Oceania
#> # A tidytable: 24 x 5
#> country year lifeExp pop gdpPercap
#> <fct> <int> <dbl> <int> <dbl>
#> 1 Australia 1952 69.1 8691212 10040.
#> 2 Australia 1957 70.3 9712569 10950.
#> 3 Australia 1962 70.9 10794968 12217.
#> 4 Australia 1967 71.1 11872264 14526.
#> 5 Australia 1972 71.9 13177000 16789.
#> 6 Australia 1977 73.5 14074100 18334.
#> 7 Australia 1982 74.7 15184200 19477.
#> 8 Australia 1987 76.3 16257249 21889.
#> 9 Australia 1992 77.6 17481977 23425.
#> 10 Australia 1997 78.8 18565243 26998.
#> # ... with 14 more rows
按两个分组变量进行拆分
# Split by continent and country together; with .named = TRUE the element
# names join both keys (e.g. Asia.Afghanistan in the output below).
gapminder_split_2group <- group_split.(
  gapminder,
  continent, country,
  .keep = FALSE,
  .named = TRUE
)
# Show only the first few pieces.
head(gapminder_split_2group)
#> $Asia.Afghanistan
#> # A tidytable: 12 x 4
#> year lifeExp pop gdpPercap
#> <int> <dbl> <int> <dbl>
#> 1 1952 28.8 8425333 779.
#> 2 1957 30.3 9240934 821.
#> 3 1962 32.0 10267083 853.
#> 4 1967 34.0 11537966 836.
#> 5 1972 36.1 13079460 740.
#> 6 1977 38.4 14880372 786.
#> 7 1982 39.9 12881816 978.
#> 8 1987 40.8 13867957 852.
#> 9 1992 41.7 16317921 649.
#> 10 1997 41.8 22227415 635.
#> 11 2002 42.1 25268405 727.
#> 12 2007 43.8 31889923 975.
#>
#> $Europe.Albania
#> # A tidytable: 12 x 4
#> year lifeExp pop gdpPercap
#> <int> <dbl> <int> <dbl>
#> 1 1952 55.2 1282697 1601.
#> 2 1957 59.3 1476505 1942.
#> 3 1962 64.8 1728137 2313.
#> 4 1967 66.2 1984060 2760.
#> 5 1972 67.7 2263554 3313.
#> 6 1977 68.9 2509048 3533.
#> 7 1982 70.4 2780097 3631.
#> 8 1987 72 3075321 3739.
#> 9 1992 71.6 3326498 2497.
#> 10 1997 73.0 3428038 3193.
#> 11 2002 75.7 3508512 4604.
#> 12 2007 76.4 3600523 5937.
#>
#> $Africa.Algeria
#> # A tidytable: 12 x 4
#> year lifeExp pop gdpPercap
#> <int> <dbl> <int> <dbl>
#> 1 1952 43.1 9279525 2449.
#> 2 1957 45.7 10270856 3014.
#> 3 1962 48.3 11000948 2551.
#> 4 1967 51.4 12760499 3247.
#> 5 1972 54.5 14760787 4183.
#> 6 1977 58.0 17152804 4910.
#> 7 1982 61.4 20033753 5745.
#> 8 1987 65.8 23254956 5681.
#> 9 1992 67.7 26298373 5023.
#> 10 1997 69.2 29072015 4797.
#> 11 2002 71.0 31287142 5288.
#> 12 2007 72.3 33333216 6223.
#>
#> $Africa.Angola
#> # A tidytable: 12 x 4
#> year lifeExp pop gdpPercap
#> <int> <dbl> <int> <dbl>
#> 1 1952 30.0 4232095 3521.
#> 2 1957 32.0 4561361 3828.
#> 3 1962 34 4826015 4269.
#> 4 1967 36.0 5247469 5523.
#> 5 1972 37.9 5894858 5473.
#> 6 1977 39.5 6162675 3009.
#> 7 1982 39.9 7016384 2757.
#> 8 1987 39.9 7874230 2430.
#> 9 1992 40.6 8735988 2628.
#> 10 1997 41.0 9875024 2277.
#> 11 2002 41.0 10866106 2773.
#> 12 2007 42.7 12420476 4797.
#>
#> $Americas.Argentina
#> # A tidytable: 12 x 4
#> year lifeExp pop gdpPercap
#> <int> <dbl> <int> <dbl>
#> 1 1952 62.5 17876956 5911.
#> 2 1957 64.4 19610538 6857.
#> 3 1962 65.1 21283783 7133.
#> 4 1967 65.6 22934225 8053.
#> 5 1972 67.1 24779799 9443.
#> 6 1977 68.5 26983828 10079.
#> 7 1982 69.9 29341374 8998.
#> 8 1987 70.8 31620918 9140.
#> 9 1992 71.9 33958947 9308.
#> 10 1997 73.3 36203463 10967.
#> 11 2002 74.3 38331121 8798.
#> 12 2007 75.3 40301927 12779.
#>
#> $Oceania.Australia
#> # A tidytable: 12 x 4
#> year lifeExp pop gdpPercap
#> <int> <dbl> <int> <dbl>
#> 1 1952 69.1 8691212 10040.
#> 2 1957 70.3 9712569 10950.
#> 3 1962 70.9 10794968 12217.
#> 4 1967 71.1 11872264 14526.
#> 5 1972 71.9 13177000 16789.
#> 6 1977 73.5 14074100 18334.
#> 7 1982 74.7 15184200 19477.
#> 8 1987 76.3 16257249 21889.
#> 9 1992 77.6 17481977 23425.
#> 10 1997 78.8 18565243 26998.
#> 11 2002 80.4 19546792 30688.
#> 12 2007 81.2 20434176 34435.
由 reprex package (v2.0.0) 于 2021 年 4 月 15 日创建
答案 6 :(得分:0)
一个可选的附加解决方案来摆脱额外的列:
# Split the sample into a list named by species, then drop the now-redundant
# Species column from every piece.
iris %>%
  sample_n(size = 5) %>%
  split(.$Species) %>%
  map(function(piece) select(piece, -Species))