我有一个按组划分的基因数据集,我会按照某些条件对各组进行筛选,以选出每组中的最佳基因:
数据:
Group Gene Score direct_count secondary_count
1 AQP11 0.5566507 4 5
1 CLNS1A 0.2811747 0 2
1 RSF1 0.5269924 3 6
2 CFDP1 0.4186066 1 2
2 CHST6 0.4295135 1 3
3 ACE 0.634 1 1
3 NOS2 0.6345 1 1
4 Gene1 0.7 0 0
4 Gene2 0.62 1 0
4 Gene3 0.61 0 1
过滤:
# Per group: keep the genes scoring within 0.05 of the group maximum, then
# keep the highest direct_count, then the highest secondary_count.
# Tied rows are retained at each slice step (with_ties = TRUE).
dt %>%
  group_by(Group) %>%
  filter(max(Score) - Score < 0.05) %>%
  slice_max(order_by = direct_count, n = 1, with_ties = TRUE) %>%
  slice_max(order_by = secondary_count, n = 1, with_ties = TRUE) %>%
  ungroup()
我希望能够统计出各组基因分别是在上述代码的哪一步被筛选出来的。
例如,我要用此代码应用的条件是:
如果得分最高的基因与该组中其他所有基因的得分差异大于0.05,则选择得分最高的基因
如果得分最高的基因与组内任何其他基因之间的得分差异小于0.05,则仅在与最高得分相差<0.05的那些基因中,选择direct_count较高的基因
如果direct_count相同,则选择secondary_count最高的基因
如果所有计数都相同,则选择所有彼此之间<0.05距离的基因。
我已经能够算出符合我的第一个条件(> 0.05分)的基因了:
# Genes whose score lies within 0.05 of their group's top score.
new_df <- dt %>%
  group_by(Group) %>%
  filter(max(Score) - Score < 0.05)

# Rows whose Group value occurs exactly once in new_df, i.e. groups where
# only a single gene survived the score filter.
count1 <- new_df[
  !duplicated(new_df$Group) & !duplicated(new_df$Group, fromLast = TRUE),
]
我一直在尝试用类似的规则来统计有多少基因是因为direct_count更高、secondary_count更高,或者direct_count和secondary_count都相同而被选中的,但是我尝试的不同代码给出了不同的数字,所以我不确定最好的方法是什么。
输入数据:
#Input data before filtering with code above:
# Bind the example input to `dt`, the name every pipeline in this document
# reads from; without the assignment the literal is only printed and the
# later snippets cannot run.
# Columns: Group, Gene, Score, direct_count, secondary_count (10 rows).
dt <- structure(
  list(
    Group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L),
    Gene = c(
      "AQP11", "CLNS1A", "RSF1", "CFDP1", "CHST6",
      "ACE", "NOS2", "Gene1", "Gene2", "Gene3"
    ),
    Score = c(
      0.5566507, 0.2811747, 0.5269924, 0.4186066, 0.4295135,
      0.634, 0.6345, 0.7, 0.62, 0.61
    ),
    direct_count = c(4L, 0L, 3L, 1L, 1L, 1L, 1L, 0L, 1L, 0L),
    secondary_count = c(5L, 2L, 6L, 2L, 3L, 1L, 1L, 0L, 0L, 1L)
  ),
  row.names = c(NA, -10L),
  class = c("data.table", "data.frame")
)
#Input data after filtering with code applied above:
# NOTE(review): this literal contains the same 10 rows and the same values as
# the unfiltered input literal, so it does not show a filtered result.
# Presumably the output of the filtering pipeline was intended here; confirm.
structure(list(Group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L,
4L), Gene = c("AQP11", "CLNS1A", "RSF1", "CFDP1", "CHST6", "ACE",
"NOS2", "Gene1", "Gene2", "Gene3"), Score = c(0.5566507, 0.2811747,
0.5269924, 0.4186066, 0.4295135, 0.634, 0.6345, 0.7, 0.62, 0.61
), direct_count = c(4L, 0L, 3L, 1L, 1L, 1L, 1L, 0L, 1L, 0L),
secondary_count = c(5L, 2L, 6L, 2L, 3L, 1L, 1L, 0L, 0L, 1L
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
))
我制作的此示例数据应包含1个按>0.05得分差异选出的基因组、1个按较大的direct_count筛选的基因组,以及2个按secondary_count筛选的基因组。理想情况下,我的目标是统计这些组的数量,并能够将它们从数据集中提取出来。
示例的输出将只是一个计数:
Genes filtered by >0.05 score: 1
Genes filtered by direct_count: 1
Genes filtered by secondary_count: 2
答案 0 :(得分:4)
基本上,在每个过滤器之前和之后,您都可以计算可用的行数,并将其保存在新列中。
library(dplyr)
# Track how many genes each group holds before filtering (filter0) and after
# each successive filtering step (filter1, filter2, filter3).
dt %>%
  group_by(Group) %>%
  mutate(filter0 = n()) %>%
  # step 1: genes within 0.05 of the group's best score
  filter(max(Score) - Score < 0.05) %>%
  mutate(filter1 = n()) %>%
  # step 2: highest direct_count (ties kept)
  slice_max(order_by = direct_count, n = 1, with_ties = TRUE) %>%
  mutate(filter2 = n()) %>%
  # step 3: highest secondary_count (ties kept)
  slice_max(order_by = secondary_count, n = 1, with_ties = TRUE) %>%
  mutate(filter3 = n()) %>%
  ungroup()
#> # A tibble: 5 x 9
#> Group Gene Score direct_count secondary_count filter0 filter1 filter2 filter3
#> <int> <chr> <dbl> <int> <int> <int> <int> <int> <int>
#> 1 1 AQP11 0.557 4 5 3 2 1 1
#> 2 2 CHST6 0.430 1 3 2 2 2 1
#> 3 3 ACE 0.634 1 1 2 2 2 2
#> 4 3 NOS2 0.634 1 1 2 2 2 2
#> 5 4 Gene1 0.7 0 0 3 1 1 1
或者您可以通过这种方式跟踪过滤器。 每列显示在每次过滤时是否选择了该行。
library(dplyr)
# Flag, for every gene, whether it is still selected after each filter.
# Rows already rejected are replaced by -Inf before ranking, so they can
# never obtain rank 1 in a later step.
dt %>%
  group_by(Group) %>%
  mutate(
    filter1 = max(Score) - Score < 0.05,
    filter2 = rank(
      -replace(direct_count, !filter1, -Inf),
      ties.method = "min"
    ) == 1,
    filter3 = rank(
      -replace(secondary_count, !filter2, -Inf),
      ties.method = "min"
    ) == 1
  ) %>%
  ungroup()
#> # A tibble: 10 x 8
#> Group Gene Score direct_count secondary_count filter1 filter2 filter3
#> <int> <chr> <dbl> <int> <int> <lgl> <lgl> <lgl>
#> 1 1 AQP11 0.557 4 5 TRUE TRUE TRUE
#> 2 1 CLNS1A 0.281 0 2 FALSE FALSE FALSE
#> 3 1 RSF1 0.527 3 6 TRUE FALSE FALSE
#> 4 2 CFDP1 0.419 1 2 TRUE TRUE FALSE
#> 5 2 CHST6 0.430 1 3 TRUE TRUE TRUE
#> 6 3 ACE 0.634 1 1 TRUE TRUE TRUE
#> 7 3 NOS2 0.634 1 1 TRUE TRUE TRUE
#> 8 4 Gene1 0.7 0 0 TRUE TRUE TRUE
#> 9 4 Gene2 0.62 1 0 FALSE FALSE FALSE
#> 10 4 Gene3 0.61 0 1 FALSE FALSE FALSE
如果按最后一列(filter3)进行过滤,您实际上会得到与您在问题中给出的dplyr管道相同的输出。
library(dplyr)
# Compute the per-step selection flags, then keep only the genes that are
# still selected after the last filter.
dt %>%
  group_by(Group) %>%
  mutate(
    filter1 = max(Score) - Score < 0.05,
    filter2 = rank(
      -replace(direct_count, !filter1, -Inf),
      ties.method = "min"
    ) == 1,
    filter3 = rank(
      -replace(secondary_count, !filter2, -Inf),
      ties.method = "min"
    ) == 1
  ) %>%
  ungroup() %>%
  filter(filter3)
#> # A tibble: 5 x 8
#> Group Gene Score direct_count secondary_count filter1 filter2 filter3
#> <int> <chr> <dbl> <int> <int> <lgl> <lgl> <lgl>
#> 1 1 AQP11 0.557 4 5 TRUE TRUE TRUE
#> 2 2 CHST6 0.430 1 3 TRUE TRUE TRUE
#> 3 3 ACE 0.634 1 1 TRUE TRUE TRUE
#> 4 3 NOS2 0.634 1 1 TRUE TRUE TRUE
#> 5 4 Gene1 0.7 0 0 TRUE TRUE TRUE
如果您想更直观地查看过滤器的演变过程,请记住您可以使用group_split按组拆分数据,如下所示:
library(dplyr)
# Compute the per-step selection flags and return one tibble per Group.
dt %>%
  group_by(Group) %>%
  mutate(
    filter1 = max(Score) - Score < 0.05,
    filter2 = rank(
      -replace(direct_count, !filter1, -Inf),
      ties.method = "min"
    ) == 1,
    filter3 = rank(
      -replace(secondary_count, !filter2, -Inf),
      ties.method = "min"
    ) == 1
  ) %>%
  group_split()
输出:
<list_of<
tbl_df<
Group : integer
Gene : character
Score : double
direct_count : integer
secondary_count: integer
filter1 : logical
filter2 : logical
filter3 : logical
>
>[4]>
[[1]]
# A tibble: 3 x 8
Group Gene Score direct_count secondary_count filter1 filter2 filter3
<int> <chr> <dbl> <int> <int> <lgl> <lgl> <lgl>
1 1 AQP11 0.557 4 5 TRUE TRUE TRUE
2 1 CLNS1A 0.281 0 2 FALSE FALSE FALSE
3 1 RSF1 0.527 3 6 TRUE FALSE FALSE
[[2]]
# A tibble: 2 x 8
Group Gene Score direct_count secondary_count filter1 filter2 filter3
<int> <chr> <dbl> <int> <int> <lgl> <lgl> <lgl>
1 2 CFDP1 0.419 1 2 TRUE TRUE FALSE
2 2 CHST6 0.430 1 3 TRUE TRUE TRUE
[[3]]
# A tibble: 2 x 8
Group Gene Score direct_count secondary_count filter1 filter2 filter3
<int> <chr> <dbl> <int> <int> <lgl> <lgl> <lgl>
1 3 ACE 0.634 1 1 TRUE TRUE TRUE
2 3 NOS2 0.634 1 1 TRUE TRUE TRUE
[[4]]
# A tibble: 3 x 8
Group Gene Score direct_count secondary_count filter1 filter2 filter3
<int> <chr> <dbl> <int> <int> <lgl> <lgl> <lgl>
1 4 Gene1 0.7 0 0 TRUE TRUE TRUE
2 4 Gene2 0.62 1 0 FALSE FALSE FALSE
3 4 Gene3 0.61 0 1 FALSE FALSE FALSE
但是,如果您更偏好可视化的方式,则可以绘制每个组的过滤器的演变情况。
使用geom_tile
创建所选行的热图。
图必须从左到右读取。红色方块是过滤器丢弃的方块。
library(ggplot2)
library(tidyr)
library(dplyr)
# Heatmap of the selection flags: one facet per Group, one row per gene and
# one column per filter; the fill shows whether the gene is still selected.
dt %>%
  group_by(Group) %>%
  mutate(
    filter1 = max(Score) - Score < 0.05,
    filter2 = rank(
      -replace(direct_count, !filter1, -Inf),
      ties.method = "min"
    ) == 1,
    filter3 = rank(
      -replace(secondary_count, !filter2, -Inf),
      ties.method = "min"
    ) == 1
  ) %>%
  select(Group, Gene, starts_with("filter")) %>%
  # long format: `name` holds the filter column name, `value` its flag
  pivot_longer(cols = starts_with("filter")) %>%
  ggplot(aes(x = name, y = Gene, fill = value)) +
  geom_tile(colour = "black") +
  facet_wrap(facets = "Group", scales = "free") +
  labs(
    title = "Gene selected from left to right",
    x = "Filters",
    y = "Genes",
    fill = "Selected"
  )
下面的代码用于查看每个步骤选择了多少个基因。
此外,最后一列给出了最先使所选基因数量降到最小值的那个过滤器,通过这种方式,您可以统计每个过滤器起决定性作用的次数。
library(dplyr)
# Per group, count how many genes are still selected at each stage and
# identify the decisive filter.
dt1 <- dt %>%
  group_by(Group) %>%
  mutate(
    # filter0 flags every row, so summing it below yields the number of genes
    # before any filtering. Storing n() here instead would be summed once per
    # row and report n^2 (e.g. 9 for a group of 3 genes).
    filter0 = TRUE,
    filter1 = max(Score) - Score < 0.05,
    filter2 = rank(
      -replace(direct_count, !filter1, -Inf),
      ties.method = "min"
    ) == 1,
    filter3 = rank(
      -replace(secondary_count, !filter2, -Inf),
      ties.method = "min"
    ) == 1
  ) %>%
  # sum the number of genes selected for each filter (data is still grouped
  # by Group, so this yields one row per group)
  summarise(across(starts_with("filter"), sum)) %>%
  # show the number of the decisive filter: which.min() returns the first
  # stage at which the count reaches its minimum; subtracting 1 maps the
  # position to the filter number, so 0 means no filter removed any gene
  rowwise() %>%
  mutate(definitive = which.min(c_across(starts_with("filter"))) - 1) %>%
  ungroup()
dt1
#> # A tibble: 4 x 6
#>   Group filter0 filter1 filter2 filter3 definitive
#>   <int>   <int>   <int>   <int>   <int>      <dbl>
#> 1     1       3       2       1       1          2
#> 2     2       2       2       2       1          3
#> 3     3       2       2       2       2          0
#> 4     4       3       1       1       1          1
count(dt1, definitive)
#> # A tibble: 4 x 2
#>   definitive     n
#>        <dbl> <int>
#> 1          0     1
#> 2          1     1
#> 3          2     1
#> 4          3     1
ggplot(dt1) + geom_bar(aes(definitive))