R:避免 cut() 产生 NA

时间:2020-07-29 22:42:03

标签: r ggplot2 tidyverse cut factors

我想把一个连续向量当作离散值来绘图。为此,我尝试把它按区间转换成因子,从而将连续向量离散化。

我想把一个取值在 0 到 1 之间的双精度(double)向量转换为因子,并尝试使用 cut 函数来实现。

数据:

# Example data: 20 rows, 10 columns. `cache_rate` and `periprate1e-3` are the
# continuous rates in [0, 1] that the snippets below discretise with cut().
# The result is assigned to `data` because every later snippet refers to it,
# and the session-specific `.internal.selfref` external pointer emitted by
# dput() is dropped because it cannot be parsed back by R.
data <- structure(list(label = c("WP_078201646.1..87-312", "WP_077753210.1..91-300", 
"WP_044287879.1..90-306", "WP_046711496.1..56-299", "WP_069060785.1..87-301", 
"WP_011394873.1..91-301", "WP_015146987.1..159-358", "WP_085748967.1..86-314", 
"NP_696283.1..85-318", "WP_011925568.1..89-315", "WP_013040867.1..89-307", 
"WP_062116680.1..85-302", "WP_082057246.1..88-313", "WP_079078020.1..79-301", 
"WP_043081767.1..100-292", "WP_085760186.1..96-309", "WP_052427986.1..92-305", 
"WP_071039302.1..84-306", "WP_012939355.1..84-312", "WP_012630775.1..85-305"
), full = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), e15 = c(2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), e20 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), id_0cov_0.8evalue_0.001 = c(1L, 2L, 4L, 5L, 6L, 
9L, 11L, 13L, 14L, 17L, 19L, 22L, 23L, 25L, 31L, 37L, 38L, 42L, 
44L, 45L), `archConsensus1e-3` = c("LysR_substrate", "LysR_substrate", 
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate", 
"PBP_like", "LysR_substrate", "LysR_substrate", "LysR_substrate", 
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate", 
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate", 
"LysR_substrate", "LysR_substrate"), hhArch = c("LysR_substrate", 
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate", 
"LysR_substrate", "PBP_like", "LysR_substrate", "LysR_substrate", 
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate", 
"LysR_substrate", "LysR_substrate", "LysR_substrate", "LysR_substrate", 
"LysR_substrate", "LysR_substrate", "LysR_substrate"), cache_rate = c(0.00383141762452107, 
0, 0, 0.0123681338668607, 0.00512820512820513, 0.0254545454545455, 
0.00940438871473354, 0, 0.0571428571428571, 0.00519930675909879, 
0, 0.00363636363636364, 0.0357142857142857, 0, 0, 0, 0.0535714285714286, 
0, 0.00393700787401575, 0), groupsize = c(261L, 28L, 351L, 2749L, 
195L, 275L, 638L, 55L, 525L, 577L, 16L, 275L, 196L, 68L, 3L, 
26L, 56L, 512L, 254L, 245L), `periprate1e-3` = c(0.0613026819923372, 
0.285714285714286, 0.247863247863248, 0.182975627500909, 0.0358974358974359, 
0.254545454545455, 0.0125391849529781, 0, 0.157794676806084, 
0.131715771230503, 0.0625, 0.0654545454545455, 0.38265306122449, 
0.0735294117647059, 0, 0.0384615384615385, 0.0535714285714286, 
0.09765625, 0.259842519685039, 0.257142857142857)), row.names = c(NA, 
-20L), class = c("tbl_df", "tbl", "data.frame"))

我首先尝试的代码是:

library(tidyverse)

# First attempt: bin both rate columns into 0.1-wide intervals between 0 and 1.
# As the output below shows, the first bin is labelled (0,0.1], i.e. open on
# the left, so rows whose rate is exactly 0 get no bin and come back as NA.
data %>%
mutate(
    cache_rate      = cut(cache_rate,      breaks = seq(0 , 1, by = 0.1)),
    `periprate1e-3` = cut(`periprate1e-3`, breaks = seq(0 , 1, by = 0.1))
  )

但是它带给我一些NA值:

# A tibble: 20 x 10
   label           full   e15   e20 id_0cov_0.8evalue_0… `archConsensus1e… hhArch     cache_rate groupsize `periprate1e-3`
   <chr>          <int> <int> <int>                <int> <chr>             <chr>      <fct>          <int> <fct>          
 1 WP_078201646.…     1     2     1                    1 LysR_substrate    LysR_subs… (0,0.1]          261 (0,0.1]        
 2 WP_077753210.…     1     2     1                    2 LysR_substrate    LysR_subs… NA                28 (0.2,0.3]      
 3 WP_044287879.…     1     2     1                    4 LysR_substrate    LysR_subs… NA               351 (0.2,0.3]      
 4 WP_046711496.…     1     2     1                    5 LysR_substrate    LysR_subs… (0,0.1]         2749 (0.1,0.2]      
 5 WP_069060785.…     1     2     1                    6 LysR_substrate    LysR_subs… (0,0.1]          195 (0,0.1]        
 6 WP_011394873.…     1     2     1                    9 LysR_substrate    LysR_subs… (0,0.1]          275 (0.2,0.3]      
 7 WP_015146987.…     1     2     1                   11 PBP_like          PBP_like   (0,0.1]          638 (0,0.1]        
 8 WP_085748967.…     1     2     1                   13 LysR_substrate    LysR_subs… NA                55 NA             
 9 NP_696283.1..…     1     2     1                   14 LysR_substrate    LysR_subs… (0,0.1]          525 (0.1,0.2]      
10 WP_011925568.…     1     2     1                   17 LysR_substrate    LysR_subs… (0,0.1]          577 (0.1,0.2]      
11 WP_013040867.…     1     2     1                   19 LysR_substrate    LysR_subs… NA                16 (0,0.1]        
12 WP_062116680.…     1     2     1                   22 LysR_substrate    LysR_subs… (0,0.1]          275 (0,0.1]        
13 WP_082057246.…     1     2     1                   23 LysR_substrate    LysR_subs… (0,0.1]          196 (0.3,0.4]      
14 WP_079078020.…     1     2     1                   25 LysR_substrate    LysR_subs… NA                68 (0,0.1]        
15 WP_043081767.…     1     2     1                   31 LysR_substrate    LysR_subs… NA                 3 NA             
16 WP_085760186.…     1     2     1                   37 LysR_substrate    LysR_subs… NA                26 (0,0.1]        
17 WP_052427986.…     1     2     1                   38 LysR_substrate    LysR_subs… (0,0.1]           56 (0,0.1]        
18 WP_071039302.…     1     2     1                   42 LysR_substrate    LysR_subs… NA               512 (0,0.1]        
19 WP_012939355.…     1     2     1                   44 LysR_substrate    LysR_subs… (0,0.1]          254 (0.2,0.3]      
20 WP_012630775.…     1     2     1                   45 LysR_substrate    LysR_subs… NA               245 (0.2,0.3]      

然后我尝试通过更改cut函数中的范围来解决此问题:

# Second attempt: start the breaks below zero so that a rate of 0 falls inside
# a bin. The NAs disappear, but the zeros now land in a bin labelled (-0.1,0],
# which is misleading for rates that are never negative.
data %>% 
  mutate(
    cache_rate      = cut(cache_rate,      breaks = seq(-0.9 , 1, by = 0.1)),
    `periprate1e-3` = cut(`periprate1e-3`, breaks = seq(-0.9 , 1, by = 0.1))
  )

但这样会出现带负数端点的区间(例如 (-0.1,0]),结果不够直观:

# A tibble: 20 x 10
   label           full   e15   e20 id_0cov_0.8evalue_0… `archConsensus1e… hhArch     cache_rate groupsize `periprate1e-3`
   <chr>          <int> <int> <int>                <int> <chr>             <chr>      <fct>          <int> <fct>          
 1 WP_078201646.…     1     2     1                    1 LysR_substrate    LysR_subs… (0,0.1]          261 (0,0.1]        
 2 WP_077753210.…     1     2     1                    2 LysR_substrate    LysR_subs… (-0.1,0]          28 (0.2,0.3]      
 3 WP_044287879.…     1     2     1                    4 LysR_substrate    LysR_subs… (-0.1,0]         351 (0.2,0.3]      
 4 WP_046711496.…     1     2     1                    5 LysR_substrate    LysR_subs… (0,0.1]         2749 (0.1,0.2]      
 5 WP_069060785.…     1     2     1                    6 LysR_substrate    LysR_subs… (0,0.1]          195 (0,0.1]        
 6 WP_011394873.…     1     2     1                    9 LysR_substrate    LysR_subs… (0,0.1]          275 (0.2,0.3]      
 7 WP_015146987.…     1     2     1                   11 PBP_like          PBP_like   (0,0.1]          638 (0,0.1]        
 8 WP_085748967.…     1     2     1                   13 LysR_substrate    LysR_subs… (-0.1,0]          55 (-0.1,0]       
 9 NP_696283.1..…     1     2     1                   14 LysR_substrate    LysR_subs… (0,0.1]          525 (0.1,0.2]      
10 WP_011925568.…     1     2     1                   17 LysR_substrate    LysR_subs… (0,0.1]          577 (0.1,0.2]      
11 WP_013040867.…     1     2     1                   19 LysR_substrate    LysR_subs… (-0.1,0]          16 (0,0.1]        
12 WP_062116680.…     1     2     1                   22 LysR_substrate    LysR_subs… (0,0.1]          275 (0,0.1]        
13 WP_082057246.…     1     2     1                   23 LysR_substrate    LysR_subs… (0,0.1]          196 (0.3,0.4]      
14 WP_079078020.…     1     2     1                   25 LysR_substrate    LysR_subs… (-0.1,0]          68 (0,0.1]        
15 WP_043081767.…     1     2     1                   31 LysR_substrate    LysR_subs… (-0.1,0]           3 (-0.1,0]       
16 WP_085760186.…     1     2     1                   37 LysR_substrate    LysR_subs… (-0.1,0]          26 (0,0.1]        
17 WP_052427986.…     1     2     1                   38 LysR_substrate    LysR_subs… (0,0.1]           56 (0,0.1]        
18 WP_071039302.…     1     2     1                   42 LysR_substrate    LysR_subs… (-0.1,0]         512 (0,0.1]        
19 WP_012939355.…     1     2     1                   44 LysR_substrate    LysR_subs… (0,0.1]          254 (0.2,0.3]      
20 WP_012630775.…     1     2     1                   45 LysR_substrate    LysR_subs… (-0.1,0]         245 (0.2,0.3] 

enter image description here

# Plot built on the second attempt: the binned factors are stored in new
# columns (cache_rate2, periprate1e-3_2) so the original numeric rates remain
# available for the x and y axes, while the bins drive point colour and shape.
data %>% 
  mutate(
    cache_rate2      = cut(cache_rate,      breaks = seq(-0.9 , 1, by = 0.1)),
    `periprate1e-3_2` = cut(`periprate1e-3`, breaks = seq(-0.9 , 1, by = 0.1))
  ) %>% 
  ggplot(aes(cache_rate, `periprate1e-3`, color = cache_rate2, shape = `periprate1e-3_2`)) +
  geom_point()

如何在不往 mutate 里塞一大堆难看的 case_when 的情况下,把这个向量离散化?

预先感谢

1 个答案:

答案 0 :(得分:3)

您之所以得到 NA,是因为 cut 函数默认生成左开右闭的区间 (a, b],恰好等于最小断点(这里是 0)的值不属于任何区间。只要添加 include.lowest = TRUE,第一个区间就会变成闭区间 [0, 0.1],问题随之消失:

# Bin both rate columns with a single shared rule. include.lowest = TRUE closes
# the first interval on the left ([0, 0.1]), so rates equal to 0 are binned
# instead of becoming NA.
data %>%
  mutate(
    across(
      c(cache_rate, `periprate1e-3`),
      function(rate) cut(rate, breaks = 0:10 / 10, include.lowest = TRUE)
    )
  )

#> # A tibble: 20 x 10
#>    label  full   e15   e20 id_0cov_0.8eval~ `archConsensus1~ hhArch cache_rate
#>    <chr> <int> <int> <int>            <int> <chr>            <chr>  <fct>     
#>  1 WP_0~     1     2     1                1 LysR_substrate   LysR_~ [0,0.1]   
#>  2 WP_0~     1     2     1                2 LysR_substrate   LysR_~ [0,0.1]   
#>  3 WP_0~     1     2     1                4 LysR_substrate   LysR_~ [0,0.1]   
#>  4 WP_0~     1     2     1                5 LysR_substrate   LysR_~ [0,0.1]   
#>  5 WP_0~     1     2     1                6 LysR_substrate   LysR_~ [0,0.1]   
#>  6 WP_0~     1     2     1                9 LysR_substrate   LysR_~ [0,0.1]   
#>  7 WP_0~     1     2     1               11 PBP_like         PBP_l~ [0,0.1]   
#>  8 WP_0~     1     2     1               13 LysR_substrate   LysR_~ [0,0.1]   
#>  9 NP_6~     1     2     1               14 LysR_substrate   LysR_~ [0,0.1]   
#> 10 WP_0~     1     2     1               17 LysR_substrate   LysR_~ [0,0.1]   
#> 11 WP_0~     1     2     1               19 LysR_substrate   LysR_~ [0,0.1]   
#> 12 WP_0~     1     2     1               22 LysR_substrate   LysR_~ [0,0.1]   
#> 13 WP_0~     1     2     1               23 LysR_substrate   LysR_~ [0,0.1]   
#> 14 WP_0~     1     2     1               25 LysR_substrate   LysR_~ [0,0.1]   
#> 15 WP_0~     1     2     1               31 LysR_substrate   LysR_~ [0,0.1]   
#> 16 WP_0~     1     2     1               37 LysR_substrate   LysR_~ [0,0.1]   
#> 17 WP_0~     1     2     1               38 LysR_substrate   LysR_~ [0,0.1]   
#> 18 WP_0~     1     2     1               42 LysR_substrate   LysR_~ [0,0.1]   
#> 19 WP_0~     1     2     1               44 LysR_substrate   LysR_~ [0,0.1]   
#> 20 WP_0~     1     2     1               45 LysR_substrate   LysR_~ [0,0.1]   
#> # ... with 2 more variables: groupsize <int>, `periprate1e-3` <fct>