我需要R方面的帮助:根据某种模式来填充列中的缺失值。为便于理解,下面是一个例子:
Groups Names COL1 COL2 COL3
1 G1 KB640596.1_2-BICs_-__SP1 <NA> 10 30
2 G1 KB640596.1_32-33_-__SP1 YP_98 NA NA
3 G1 KB640596.1_32-33_-__SP1 YP_99 NA NA
4 G1 KB640596.1_32-33_-__SP1 YP_100 NA NA
5 G1 KB640596.1_32-33_-__SP1 YP_101 NA NA
6 G1 KB640588.1_2-BICs_-__SP1 <NA> 89 28
7 G1 KB640596.1_38-39_-__SP1 YP_102 NA NA
8 G1 KB640588.1_38-39_-__SP1 YP_103 NA NA
9 G1 KB640596.1_21-90_-__SP1 YP_102 NA NA
10 G1 KB640588.1_78-32_-__SP1 YP_102 NA NA
11 G1 KB640596.1_89-90_-__SP2 YP_104 90 76
12 G2 LO640571_89-90_-__SP3 YP_100 30 90
13 G2 LO640571_89-90_-__SP3 YP_101 40 10
14 G3 LO640571_89-90_-__SP3 YP_2 29 29
15 G3 LO640571_10-20_-__SP3 YP_2 29 29
16 G3 LO640571_09-99_-__SP3 YP_2 29 29
在这个 df 中,对于每个组(Groups)里 Names 含有 `-BICs` 模式的行,我想取出它的 COL2 和 COL3 值,并把这些值填充到同组中第一个 `_` 之前的内容相同的其他 Names 行。
例如:在 G1 内,KB640596.1_2-BICs_-__SP1 具有 `-BICs` 模式。我提取第一个 `_` 之前的内容,得到 KB640596.1。KB640596.1 也出现在同组的其他 Names 中,于是把 COL2 和 COL3 的值(分别为 10 和 30)填充到这些行,得到:
Groups Names COL1 COL2 COL3
G1 KB640596.1_2-BICs_-__SP1 NA 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_98 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_99 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_100 10 30
G1 KB640596.1_2-BICs_-__SP1 YP_101 10 30
G1 KB640588.1_2-BICs_-__SP1 NA 89 28
G1 KB640596.1_2-BICs_-__SP1 YP_102 10 30
G1 KB640588.1_2-BICs_-__SP1 YP_103 89 28
G1 KB640596.1_2-BICs_-__SP1 YP_102 10 30
G1 KB640588.1_2-BICs_-__SP1 YP_102 89 28
G1 KB640596.1_2-BICs_-__SP1 YP_104 90 76
G2 LO640571_89-90_-__SP3 YP_100 30 90
G2 LO640571_89-90_-__SP3 YP_101 40 10
G3 LO640571_89-90_-__SP3 YP_2 29 29
G3 LO640571_10-20_-__SP3 YP_2 29 29
G3 LO640571_09-99_-__SP3 YP_2 29 29
如果有人有想法,那就太好了
数据
# Sample data from the question (dput output), bound to `df` because the
# answer snippets read the data frame under that name.
# Columns: Groups, Names and COL1 are factors; COL2 and COL3 are integers
# where NA marks a missing value.
df <- structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L), .Label = c("G1", "G2", "G3"
), class = "factor"), Names = structure(c(4L, 6L, 6L, 6L, 6L,
1L, 7L, 2L, 5L, 3L, 8L, 11L, 11L, 11L, 10L, 9L), .Label = c("KB640588.1_2-BICs_-__SP1",
"KB640588.1_38-39_-__SP1", "KB640588.1_78-32_-__SP1", "KB640596.1_2-BICs_-__SP1",
"KB640596.1_21-90_-__SP1", "KB640596.1_32-33_-__SP1", "KB640596.1_38-39_-__SP1",
"KB640596.1_89-90_-__SP2", "LO640571_09-99_-__SP3", "LO640571_10-20_-__SP3",
"LO640571_89-90_-__SP3"), class = "factor"), COL1 = structure(c(NA,
7L, 8L, 1L, 2L, NA, 3L, 4L, 3L, 3L, 5L, 1L, 2L, 6L, 6L, 6L), .Label = c("YP_100",
"YP_101", "YP_102", "YP_103", "YP_104", "YP_2", "YP_98", "YP_99"
), class = "factor"), COL2 = c(10L, NA, NA, NA, NA, 89L, NA,
NA, NA, NA, 90L, 30L, 40L, 29L, 29L, 29L), COL3 = c(30L, NA,
NA, NA, NA, 28L, NA, NA, NA, NA, 76L, 90L, 10L, 29L, 29L, 29L
)), class = "data.frame", row.names = c(NA, -16L))
答案 0 :(得分:2)
我们可以先创建行号列,再用 `separate` 按分隔符把 `Names` 列拆成两列。然后把数据分为两部分:一部分是所在分组中含有 "BIC" 值的行,另一部分是不含的行。对前者按组用 `fill` 填充 `COL2`、`COL3` 的值,最后把两部分合并,得到最终的数据框。
library(dplyr)
library(tidyr)
# Tag every row with its original position so the two subsets can be
# recombined in the input order at the end.
df1 <- df %>%
  mutate(row = row_number())

# Rows whose (Groups, prefix) family contains a "-BICs" entry. The prefix is
# everything before the first "_" in Names; splitting on "_" (rather than on
# "-" or ".") keeps an accession such as "KB640596.1" intact, so uniting with
# "_" afterwards restores the original spelling of Names.
df2 <- df1 %>%
  separate(Names, c("col1", "col2"), sep = "_", extra = "merge") %>%
  group_by(Groups, col1) %>%
  filter(any(grepl("-BICs", col2, fixed = TRUE))) %>%
  # fill() copies downwards, so this relies on the "-BICs" row coming before
  # the rows with missing COL2/COL3 inside its family, as in the sample data.
  fill(COL2, COL3) %>%
  # Every row of the family takes the suffix of the "-BICs" entry itself,
  # wherever that entry sits inside the family.
  mutate(col2 = col2[grepl("-BICs", col2, fixed = TRUE)][1]) %>%
  ungroup() %>%
  unite(Names, col1, col2, sep = "_")

# Add back the rows that were not part of a "-BICs" family and restore the
# original row order.
bind_rows(df2, filter(df1, !row %in% df2$row)) %>%
  arrange(row) %>%
  select(-row)
#   Groups Names                    COL1    COL2  COL3
#   <fct>  <chr>                    <fct>  <int> <int>
# 1 G1     KB640596.1_2-BICs_-__SP1 NA        10    30
# 2 G1     KB640596.1_2-BICs_-__SP1 YP_98     10    30
# 3 G1     KB640596.1_2-BICs_-__SP1 YP_99     10    30
# 4 G1     KB640596.1_2-BICs_-__SP1 YP_100    10    30
# 5 G1     KB640596.1_2-BICs_-__SP1 YP_101    10    30
# 6 G1     KB640588.1_2-BICs_-__SP1 NA        89    28
# 7 G1     KB640596.1_2-BICs_-__SP1 YP_102    10    30
# 8 G1     KB640588.1_2-BICs_-__SP1 YP_103    89    28
# 9 G1     KB640596.1_2-BICs_-__SP1 YP_102    10    30
#10 G1     KB640588.1_2-BICs_-__SP1 YP_102    89    28
#11 G1     KB640596.1_2-BICs_-__SP1 YP_104    90    76
#12 G2     LO640571_89-90_-__SP3    YP_100    30    90
#13 G2     LO640571_89-90_-__SP3    YP_101    40    10
#14 G3     LO640571_89-90_-__SP3    YP_2      29    29
#15 G3     LO640571_10-20_-__SP3    YP_2      29    29
#16 G3     LO640571_09-99_-__SP3    YP_2      29    29
答案 1 :(得分:1)
我们可以按 'Groups' 以及 'Names' 去掉后缀后得到的子字符串进行分组,然后用 `mutate` 把数值列中的 NA 元素替换为 'Names' 含有 `-BICs` 的那一行的值。
library(dplyr)
library(stringr)
# Fill missing numeric values (COL2, COL3 in the sample data) from the "-BICs"
# row that shares the same Groups value and the same Names prefix, i.e. the
# text before the first "_".
df %>%
  group_by(Groups, grp = str_remove(Names, "_.*")) %>%
  mutate(
    across(
      where(is.numeric),
      # Use the first "-BICs" value of the family. When the family has no such
      # row the index yields NA, so existing values are left untouched instead
      # of failing on a zero-length replacement.
      ~ coalesce(.x, .x[str_detect(Names, fixed("-BICs"))][1])
    )
  ) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 13 x 5
# Groups Names COL1 COL2 COL3
# <fct> <fct> <fct> <int> <int>
# 1 G1 KB640596.1_2-BICs_-__SP1 <NA> 10 30
# 2 G1 KB640596.1_32-33_-__SP1 YP_98 10 30
# 3 G1 KB640596.1_32-33_-__SP1 YP_99 10 30
# 4 G1 KB640596.1_32-33_-__SP1 YP_100 10 30
# 5 G1 KB640596.1_32-33_-__SP1 YP_101 10 30
# 6 G1 KB640588.1_2-BICs_-__SP1 <NA> 89 28
# 7 G1 KB640596.1_38-39_-__SP1 YP_102 10 30
# 8 G1 KB640588.1_38-39_-__SP1 YP_103 89 28
# 9 G1 KB640596.1_21-90_-__SP1 YP_102 10 30
#10 G1 KB640588.1_78-32_-__SP1 YP_102 89 28
#11 G1 KB640596.1_89-90_-__SP2 YP_104 90 76
#12 G2 LO640571_89-90_-__SP3 YP_100 30 90
#13 G2 LO640571_89-90_-__SP3 YP_101 40 10