根据其他列值在df中填充Na值

时间:2020-05-28 22:58:37

标签: r regex dataframe dplyr

我需要 R 方面的帮助:根据某种模式来处理列中的缺失信息。为了便于理解,这里是一个例子:

       Groups                    Names   COL1 COL2 COL3
1      G1 KB640596.1_2-BICs_-__SP1   <NA>   10   30
2      G1  KB640596.1_32-33_-__SP1  YP_98   NA   NA
3      G1  KB640596.1_32-33_-__SP1  YP_99   NA   NA
4      G1  KB640596.1_32-33_-__SP1 YP_100   NA   NA
5      G1  KB640596.1_32-33_-__SP1 YP_101   NA   NA
6      G1 KB640588.1_2-BICs_-__SP1   <NA>   89   28
7      G1  KB640596.1_38-39_-__SP1 YP_102   NA   NA
8      G1  KB640588.1_38-39_-__SP1 YP_103   NA   NA
9      G1  KB640596.1_21-90_-__SP1 YP_102   NA   NA
10     G1  KB640588.1_78-32_-__SP1 YP_102   NA   NA
11     G1  KB640596.1_89-90_-__SP2 YP_104   90   76
12     G2    LO640571_89-90_-__SP3 YP_100   30   90
13     G2    LO640571_89-90_-__SP3 YP_101   40   10
14     G3    LO640571_89-90_-__SP3   YP_2   29   29
15     G3    LO640571_10-20_-__SP3   YP_2   29   29
16     G3    LO640571_09-99_-__SP3   YP_2   29   29

在这个 df 中,对于每个组(Groups)内每个含有 -BICs 模式的 Names,我想取出它的 COL2 和 COL3 值,并把这些值填充到第一个 '_' 之前内容相同的其他 Names 所在的行中。

例如:

在 G1 内,只有 KB640596.1_2-BICs_-__SP1 具有 -BICs 模式(这里以它为例)。我提取第一个 '_' 之前的内容,得到 KB640596.1。KB640596.1 也出现在其他 Names 中,于是把 COL2 和 COL3 的值(分别为 10 和 30)填充到这些行,得到:

Groups Names COL1 COL2 COL3
G1 KB640596.1_2-BICs_-__SP1 NA 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_98 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_99 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_100 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_101 10 30
G1 KB640588.1_2-BICs_-__SP1 NA 89 28
G1 KB640596.1_2-BICs_-__SP1 YP_102 10 30
G1 KB640588.1_2-BICs_-__SP1 YP_103 89 28
G1 KB640596.1_2-BICs_-__SP1 YP_102 10 30
G1 KB640588.1_2-BICs_-__SP1 YP_102 89 28
G1 KB640596.1_2-BICs_-__SP1 YP_104 90 76
G2 LO640571_89-90_-__SP3 YP_100 30 90
G2 LO640571_89-90_-__SP3 YP_101 40 10
G3 LO640571_89-90_-__SP3   YP_2   29   29
G3 LO640571_10-20_-__SP3   YP_2   29   29
G3 LO640571_09-99_-__SP3   YP_2   29   29

如果有人有想法,那就太好了

数据

   structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L), .Label = c("G1", "G2", "G3"
), class = "factor"), Names = structure(c(4L, 6L, 6L, 6L, 6L, 
1L, 7L, 2L, 5L, 3L, 8L, 11L, 11L, 11L, 10L, 9L), .Label = c("KB640588.1_2-BICs_-__SP1", 
"KB640588.1_38-39_-__SP1", "KB640588.1_78-32_-__SP1", "KB640596.1_2-BICs_-__SP1", 
"KB640596.1_21-90_-__SP1", "KB640596.1_32-33_-__SP1", "KB640596.1_38-39_-__SP1", 
"KB640596.1_89-90_-__SP2", "LO640571_09-99_-__SP3", "LO640571_10-20_-__SP3", 
"LO640571_89-90_-__SP3"), class = "factor"), COL1 = structure(c(NA, 
7L, 8L, 1L, 2L, NA, 3L, 4L, 3L, 3L, 5L, 1L, 2L, 6L, 6L, 6L), .Label = c("YP_100", 
"YP_101", "YP_102", "YP_103", "YP_104", "YP_2", "YP_98", "YP_99"
), class = "factor"), COL2 = c(10L, NA, NA, NA, NA, 89L, NA, 
NA, NA, NA, 90L, 30L, 40L, 29L, 29L, 29L), COL3 = c(30L, NA, 
NA, NA, NA, 28L, NA, NA, NA, NA, 76L, 90L, 10L, 29L, 29L, 29L
)), class = "data.frame", row.names = c(NA, -16L))

2 个答案:

答案 0 :(得分:2)

我们可以先添加一个行号列,再用 separate 按分隔符把 Names 列拆分成两列。然后把数据分成两部分:一部分是所在分组中含有 "BIC" 值的行,另一部分是所在分组中不含 "BIC" 值的行。对前一部分按组用 fill 填充 COL2 和 COL3 的值,最后把两部分重新合并,得到最终的数据框。

library(dplyr)
library(tidyr)

# Tag every row with its original position so that the two subsets built
# below can be recombined in the input order.
df1 <- df %>%
  mutate(row = row_number())

# Rows belonging to a (Groups, prefix) group that contains a "BIC" entry.
df2 <- df1 %>%
  # Split Names at the FIRST "_" only: col1 is the prefix used as the grouping
  # key, col2 is everything after it. Splitting on "." or "-" instead would
  # cut the accession ("KB640596.1") in half and change it when re-joined.
  separate(Names, into = c("col1", "col2"), sep = "_", extra = "merge") %>%
  group_by(Groups, col1) %>%
  filter(any(grepl("BIC", col2))) %>%
  # fill() carries the last non-NA value downwards and first() takes the
  # suffix of the first row, so both steps assume that the BICs row comes
  # first within its group (true for the sample data).
  fill(COL2, COL3) %>%
  mutate(col2 = first(col2)) %>%
  # Drop the grouping before col1 is consumed by unite().
  ungroup() %>%
  # Re-join with the same separator that was used for the split.
  unite(Names, col1, col2, sep = "_")

# Add back the rows that were not part of any BIC group and restore the
# original row order.
bind_rows(df2, filter(df1, !row %in% df2$row)) %>%
  arrange(row) %>%
  select(-row)

#   Groups Names                    COL1    COL2  COL3
#   <fct>  <chr>                    <fct>  <int> <int>
# 1 G1     KB640596.1_2-BICs_-__SP1 NA        10    30
# 2 G1     KB640596.1_2-BICs_-__SP1 YP_98     10    30
# 3 G1     KB640596.1_2-BICs_-__SP1 YP_99     10    30
# 4 G1     KB640596.1_2-BICs_-__SP1 YP_100    10    30
# 5 G1     KB640596.1_2-BICs_-__SP1 YP_101    10    30
# 6 G1     KB640588.1_2-BICs_-__SP1 NA        89    28
# 7 G1     KB640596.1_2-BICs_-__SP1 YP_102    10    30
# 8 G1     KB640588.1_2-BICs_-__SP1 YP_103    89    28
# 9 G1     KB640596.1_2-BICs_-__SP1 YP_102    10    30
#10 G1     KB640588.1_2-BICs_-__SP1 YP_102    89    28
#11 G1     KB640596.1_2-BICs_-__SP1 YP_104    90    76
#12 G2     LO640571_89-90_-__SP3    YP_100    30    90
#13 G2     LO640571_89-90_-__SP3    YP_101    40    10
#14 G3     LO640571_89-90_-__SP3    YP_2      29    29
#15 G3     LO640571_10-20_-__SP3    YP_2      29    29
#16 G3     LO640571_09-99_-__SP3    YP_2      29    29

答案 1 :(得分:1)

我们可以按 'Groups' 以及 'Names' 去掉后缀部分后的子字符串进行分组,然后用 mutate 将数值列中的 NA 元素替换为该组内 'Names' 含有 -BICs 的那一行的对应值。

library(dplyr)
library(stringr)

df %>%
  # Group by Groups and by the part of Names before the first "_".
  group_by(Groups, grp = str_remove(Names, "_.*")) %>%
  mutate_if(
    is.numeric,
    # Replace NAs with the value found on the group's "-BICs" row. Taking
    # element [1] yields exactly one replacement value: NA when the group has
    # no "-BICs" row (so nothing changes instead of raising "replacement has
    # length zero"), and the first match when there are several.
    ~ replace(., is.na(.), .[str_detect(Names, "-BICs")][1])
  ) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 16 x 5
#   Groups Names                    COL1    COL2  COL3
#   <fct>  <fct>                    <fct>  <int> <int>
# 1 G1     KB640596.1_2-BICs_-__SP1 <NA>      10    30
# 2 G1     KB640596.1_32-33_-__SP1  YP_98     10    30
# 3 G1     KB640596.1_32-33_-__SP1  YP_99     10    30
# 4 G1     KB640596.1_32-33_-__SP1  YP_100    10    30
# 5 G1     KB640596.1_32-33_-__SP1  YP_101    10    30
# 6 G1     KB640588.1_2-BICs_-__SP1 <NA>      89    28
# 7 G1     KB640596.1_38-39_-__SP1  YP_102    10    30
# 8 G1     KB640588.1_38-39_-__SP1  YP_103    89    28
# 9 G1     KB640596.1_21-90_-__SP1  YP_102    10    30
#10 G1     KB640588.1_78-32_-__SP1  YP_102    89    28
#11 G1     KB640596.1_89-90_-__SP2  YP_104    90    76
#12 G2     LO640571_89-90_-__SP3    YP_100    30    90
#13 G2     LO640571_89-90_-__SP3    YP_101    40    10
#14 G3     LO640571_89-90_-__SP3    YP_2      29    29
#15 G3     LO640571_10-20_-__SP3    YP_2      29    29
#16 G3     LO640571_09-99_-__SP3    YP_2      29    29