我想把一个连续向量当作离散值来绘图。为此,我试图把这个连续向量按取值区间划分,转换成因子(factor),从而将其离散化。
具体来说,我想把一个取值介于 0 和 1 之间的双精度(double)向量转换为因子。
我正在尝试使用 `cut` 函数。
数据:
# Example data: dput() output of a 20-row, 10-column tibble, assigned to
# `data` so that the snippets that reference `data` can be run as-is.
# The `.internal.selfref = <pointer: ...>` attribute printed by dput() is not
# valid R syntax and carries no data, so it has been dropped.
data <- structure(list(label = c("WP_078201646.1..87-312", "WP_077753210.1..91-300",
"WP_044287879.1..90-306", "WP_046711496.1..56-299", "WP_069060785.1..87-301",
"WP_011394873.1..91-301", "WP_015146987.1..159-358", "WP_085748967.1..86-314",
"NP_696283.1..85-318", "WP_011925568.1..89-315", "WP_013040867.1..89-307",
"WP_062116680.1..85-302", "WP_082057246.1..88-313", "WP_079078020.1..79-301",
"WP_043081767.1..100-292", "WP_085760186.1..96-309", "WP_052427986.1..92-305",
"WP_071039302.1..84-306", "WP_012939355.1..84-312", "WP_012630775.1..85-305"
), full = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), e15 = c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), e20 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), id_0cov_0.8evalue_0.001 = c(1L, 2L, 4L, 5L, 6L,
9L, 11L, 13L, 14L, 17L, 19L, 22L, 23L, 25L, 31L, 37L, 38L, 42L,
44L, 45L), `archConsensus1e-3` = c("LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"PBP_like", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate"), hhArch = c("LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "PBP_like", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate",
"LysR_substrate", "LysR_substrate", "LysR_substrate"), cache_rate = c(0.00383141762452107,
0, 0, 0.0123681338668607, 0.00512820512820513, 0.0254545454545455,
0.00940438871473354, 0, 0.0571428571428571, 0.00519930675909879,
0, 0.00363636363636364, 0.0357142857142857, 0, 0, 0, 0.0535714285714286,
0, 0.00393700787401575, 0), groupsize = c(261L, 28L, 351L, 2749L,
195L, 275L, 638L, 55L, 525L, 577L, 16L, 275L, 196L, 68L, 3L,
26L, 56L, 512L, 254L, 245L), `periprate1e-3` = c(0.0613026819923372,
0.285714285714286, 0.247863247863248, 0.182975627500909, 0.0358974358974359,
0.254545454545455, 0.0125391849529781, 0, 0.157794676806084,
0.131715771230503, 0.0625, 0.0654545454545455, 0.38265306122449,
0.0735294117647059, 0, 0.0384615384615385, 0.0535714285714286,
0.09765625, 0.259842519685039, 0.257142857142857)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
我首先尝试的代码是:
library(tidyverse)
# First attempt: bin both rate columns into 0.1-wide intervals between 0 and 1.
# No `include.lowest` is given, so the first interval is (0,0.1] and rows whose
# rate is exactly 0 are not covered by any interval (they show up as NA below).
data %>%
  mutate(
    cache_rate = cut(
      cache_rate,
      breaks = seq(from = 0, to = 1, by = 0.1)
    ),
    `periprate1e-3` = cut(
      `periprate1e-3`,
      breaks = seq(from = 0, to = 1, by = 0.1)
    )
  )
但是结果中出现了一些 `NA` 值:
# A tibble: 20 x 10
label full e15 e20 id_0cov_0.8evalue_0… `archConsensus1e… hhArch cache_rate groupsize `periprate1e-3`
<chr> <int> <int> <int> <int> <chr> <chr> <fct> <int> <fct>
1 WP_078201646.… 1 2 1 1 LysR_substrate LysR_subs… (0,0.1] 261 (0,0.1]
2 WP_077753210.… 1 2 1 2 LysR_substrate LysR_subs… NA 28 (0.2,0.3]
3 WP_044287879.… 1 2 1 4 LysR_substrate LysR_subs… NA 351 (0.2,0.3]
4 WP_046711496.… 1 2 1 5 LysR_substrate LysR_subs… (0,0.1] 2749 (0.1,0.2]
5 WP_069060785.… 1 2 1 6 LysR_substrate LysR_subs… (0,0.1] 195 (0,0.1]
6 WP_011394873.… 1 2 1 9 LysR_substrate LysR_subs… (0,0.1] 275 (0.2,0.3]
7 WP_015146987.… 1 2 1 11 PBP_like PBP_like (0,0.1] 638 (0,0.1]
8 WP_085748967.… 1 2 1 13 LysR_substrate LysR_subs… NA 55 NA
9 NP_696283.1..… 1 2 1 14 LysR_substrate LysR_subs… (0,0.1] 525 (0.1,0.2]
10 WP_011925568.… 1 2 1 17 LysR_substrate LysR_subs… (0,0.1] 577 (0.1,0.2]
11 WP_013040867.… 1 2 1 19 LysR_substrate LysR_subs… NA 16 (0,0.1]
12 WP_062116680.… 1 2 1 22 LysR_substrate LysR_subs… (0,0.1] 275 (0,0.1]
13 WP_082057246.… 1 2 1 23 LysR_substrate LysR_subs… (0,0.1] 196 (0.3,0.4]
14 WP_079078020.… 1 2 1 25 LysR_substrate LysR_subs… NA 68 (0,0.1]
15 WP_043081767.… 1 2 1 31 LysR_substrate LysR_subs… NA 3 NA
16 WP_085760186.… 1 2 1 37 LysR_substrate LysR_subs… NA 26 (0,0.1]
17 WP_052427986.… 1 2 1 38 LysR_substrate LysR_subs… (0,0.1] 56 (0,0.1]
18 WP_071039302.… 1 2 1 42 LysR_substrate LysR_subs… NA 512 (0,0.1]
19 WP_012939355.… 1 2 1 44 LysR_substrate LysR_subs… (0,0.1] 254 (0.2,0.3]
20 WP_012630775.… 1 2 1 45 LysR_substrate LysR_subs… NA 245 (0.2,0.3]
然后我尝试通过更改 `cut` 函数中的断点范围来解决此问题:
# Second attempt: start the breaks below zero (-0.9, -0.8, ..., 1) so that
# rates equal to 0 land in an interval; they end up labelled (-0.1,0].
data %>%
  mutate(
    cache_rate = cut(
      cache_rate,
      breaks = seq(from = -0.9, to = 1, by = 0.1)
    ),
    `periprate1e-3` = cut(
      `periprate1e-3`,
      breaks = seq(from = -0.9, to = 1, by = 0.1)
    )
  )
但是这样会出现负数区间(例如 `(-0.1,0]`),结果不太合理:
# A tibble: 20 x 10
label full e15 e20 id_0cov_0.8evalue_0… `archConsensus1e… hhArch cache_rate groupsize `periprate1e-3`
<chr> <int> <int> <int> <int> <chr> <chr> <fct> <int> <fct>
1 WP_078201646.… 1 2 1 1 LysR_substrate LysR_subs… (0,0.1] 261 (0,0.1]
2 WP_077753210.… 1 2 1 2 LysR_substrate LysR_subs… (-0.1,0] 28 (0.2,0.3]
3 WP_044287879.… 1 2 1 4 LysR_substrate LysR_subs… (-0.1,0] 351 (0.2,0.3]
4 WP_046711496.… 1 2 1 5 LysR_substrate LysR_subs… (0,0.1] 2749 (0.1,0.2]
5 WP_069060785.… 1 2 1 6 LysR_substrate LysR_subs… (0,0.1] 195 (0,0.1]
6 WP_011394873.… 1 2 1 9 LysR_substrate LysR_subs… (0,0.1] 275 (0.2,0.3]
7 WP_015146987.… 1 2 1 11 PBP_like PBP_like (0,0.1] 638 (0,0.1]
8 WP_085748967.… 1 2 1 13 LysR_substrate LysR_subs… (-0.1,0] 55 (-0.1,0]
9 NP_696283.1..… 1 2 1 14 LysR_substrate LysR_subs… (0,0.1] 525 (0.1,0.2]
10 WP_011925568.… 1 2 1 17 LysR_substrate LysR_subs… (0,0.1] 577 (0.1,0.2]
11 WP_013040867.… 1 2 1 19 LysR_substrate LysR_subs… (-0.1,0] 16 (0,0.1]
12 WP_062116680.… 1 2 1 22 LysR_substrate LysR_subs… (0,0.1] 275 (0,0.1]
13 WP_082057246.… 1 2 1 23 LysR_substrate LysR_subs… (0,0.1] 196 (0.3,0.4]
14 WP_079078020.… 1 2 1 25 LysR_substrate LysR_subs… (-0.1,0] 68 (0,0.1]
15 WP_043081767.… 1 2 1 31 LysR_substrate LysR_subs… (-0.1,0] 3 (-0.1,0]
16 WP_085760186.… 1 2 1 37 LysR_substrate LysR_subs… (-0.1,0] 26 (0,0.1]
17 WP_052427986.… 1 2 1 38 LysR_substrate LysR_subs… (0,0.1] 56 (0,0.1]
18 WP_071039302.… 1 2 1 42 LysR_substrate LysR_subs… (-0.1,0] 512 (0,0.1]
19 WP_012939355.… 1 2 1 44 LysR_substrate LysR_subs… (0,0.1] 254 (0.2,0.3]
20 WP_012630775.… 1 2 1 45 LysR_substrate LysR_subs… (-0.1,0] 245 (0.2,0.3]
# Scatter plot of the two raw rates. The binned copies (`cache_rate2`,
# `periprate1e-3_2`) are added as new columns and mapped to colour and shape,
# while the original continuous columns stay on the axes.
data %>%
  mutate(
    cache_rate2 = cut(
      cache_rate,
      breaks = seq(from = -0.9, to = 1, by = 0.1)
    ),
    `periprate1e-3_2` = cut(
      `periprate1e-3`,
      breaks = seq(from = -0.9, to = 1, by = 0.1)
    )
  ) %>%
  ggplot(
    aes(
      x = cache_rate,
      y = `periprate1e-3`,
      color = cache_rate2,
      shape = `periprate1e-3_2`
    )
  ) +
  geom_point()
如何才能在不使用塞满 `case_when` 的冗长 `mutate` 的情况下离散化此向量?
提前致谢。
答案 0 :(得分:3)
之所以会得到 `NA`,是因为 `cut` 函数默认使用左开右闭区间,恰好等于最小断点(这里是 0)的值不属于任何区间。只要添加 `include.lowest = TRUE`,问题就会消失:
# Bin both rate columns using the breaks 0, 0.1, ..., 1.
# include.lowest = TRUE makes the first interval closed on the left ([0,0.1]),
# so rates exactly equal to 0 are binned instead of becoming NA.
data %>%
  mutate(
    cache_rate = cut(
      cache_rate,
      breaks = 0:10 / 10,
      include.lowest = TRUE
    ),
    `periprate1e-3` = cut(
      `periprate1e-3`,
      breaks = 0:10 / 10,
      include.lowest = TRUE
    )
  )
#> # A tibble: 20 x 10
#> label full e15 e20 id_0cov_0.8eval~ `archConsensus1~ hhArch cache_rate
#> <chr> <int> <int> <int> <int> <chr> <chr> <fct>
#> 1 WP_0~ 1 2 1 1 LysR_substrate LysR_~ [0,0.1]
#> 2 WP_0~ 1 2 1 2 LysR_substrate LysR_~ [0,0.1]
#> 3 WP_0~ 1 2 1 4 LysR_substrate LysR_~ [0,0.1]
#> 4 WP_0~ 1 2 1 5 LysR_substrate LysR_~ [0,0.1]
#> 5 WP_0~ 1 2 1 6 LysR_substrate LysR_~ [0,0.1]
#> 6 WP_0~ 1 2 1 9 LysR_substrate LysR_~ [0,0.1]
#> 7 WP_0~ 1 2 1 11 PBP_like PBP_l~ [0,0.1]
#> 8 WP_0~ 1 2 1 13 LysR_substrate LysR_~ [0,0.1]
#> 9 NP_6~ 1 2 1 14 LysR_substrate LysR_~ [0,0.1]
#> 10 WP_0~ 1 2 1 17 LysR_substrate LysR_~ [0,0.1]
#> 11 WP_0~ 1 2 1 19 LysR_substrate LysR_~ [0,0.1]
#> 12 WP_0~ 1 2 1 22 LysR_substrate LysR_~ [0,0.1]
#> 13 WP_0~ 1 2 1 23 LysR_substrate LysR_~ [0,0.1]
#> 14 WP_0~ 1 2 1 25 LysR_substrate LysR_~ [0,0.1]
#> 15 WP_0~ 1 2 1 31 LysR_substrate LysR_~ [0,0.1]
#> 16 WP_0~ 1 2 1 37 LysR_substrate LysR_~ [0,0.1]
#> 17 WP_0~ 1 2 1 38 LysR_substrate LysR_~ [0,0.1]
#> 18 WP_0~ 1 2 1 42 LysR_substrate LysR_~ [0,0.1]
#> 19 WP_0~ 1 2 1 44 LysR_substrate LysR_~ [0,0.1]
#> 20 WP_0~ 1 2 1 45 LysR_substrate LysR_~ [0,0.1]
#> # ... with 2 more variables: groupsize <int>, `periprate1e-3` <fct>