我有一个具有如下结构的映射表:
# Mapping table shown as what appears to be dput() output: a data frame with
# 10 rows and two factor columns, REF_ID and GeneSymbol. Some GeneSymbol
# levels hold two symbols joined by " /// ".
structure(list(REF_ID = structure(1:10, .Label = c("202533_s_at",
"202534_x_at", "202551_s_at", "202552_s_at", "202555_s_at", "202565_s_at",
"202566_s_at", "202580_x_at", "202581_at", "202589_at"), class = "factor"),
GeneSymbol = structure(c(2L, 2L, 1L, 1L, 5L, 6L, 6L, 3L, 4L, 7L), .Label =
c("CRIM1 /// LOC101929500", "DHFR", "FOXM1", "HSPA1A /// HSPA1B", "MYLK",
"SVIL", "TYMS"), class = "factor")), .Names = c("REF_ID", "GeneSymbol"),
class = "data.frame", row.names = c(NA, -10L))
在第3、4和9行中,有多个GeneSymbol与单个REF_ID匹配(这里 /// 是分隔符)。
例如,在第3行中,有两个基因符号与同一个REF_ID匹配。
我想要一个修改后的表(包含所有现有的映射),其中每个REF_ID按其匹配的基因符号个数重复出现,每行只对应一个基因符号。
因此,我希望第3行拆分为两个单独的行:一行REF_ID == 202551_s_at且GeneSymbol == CRIM1,另一行REF_ID == 202551_s_at且GeneSymbol == LOC101929500。
请你帮帮我。
答案 0 :(得分:2)
作为对Rui Barradas答案的补充,一种tidy(整洁)的做法是使用tidyr包中的separate_rows():
# Attach the tidyverse, which makes tidyr's separate_rows() available.
library(tidyverse)
# Split GeneSymbol on the " /// " separator so that every symbol gets its own
# row; the REF_ID value is repeated for each row produced from the same input.
separate_rows(df, GeneSymbol, sep = " /// ")
#> REF_ID GeneSymbol
#> 1 202533_s_at DHFR
#> 2 202534_x_at DHFR
#> 3 202551_s_at CRIM1
#> 4 202551_s_at LOC101929500
#> 5 202552_s_at CRIM1
#> 6 202552_s_at LOC101929500
#> 7 202555_s_at MYLK
#> 8 202565_s_at SVIL
#> 9 202566_s_at SVIL
#> 10 202580_x_at FOXM1
#> 11 202581_at HSPA1A
#> 12 202581_at HSPA1B
#> 13 202589_at TYMS
数据
# Sample mapping table: ten rows, both columns stored as factors.
# Three rows carry two gene symbols joined by " /// ".
df <- data.frame(
  # One distinct probe ID per row, listed in level order.
  REF_ID = factor(
    1:10,
    labels = c(
      "202533_s_at", "202534_x_at", "202551_s_at", "202552_s_at",
      "202555_s_at", "202565_s_at", "202566_s_at", "202580_x_at",
      "202581_at", "202589_at"
    )
  ),
  # Row values spelled out; levels given explicitly so their order does not
  # depend on locale-specific sorting.
  GeneSymbol = factor(
    c(
      "DHFR", "DHFR", "CRIM1 /// LOC101929500", "CRIM1 /// LOC101929500",
      "MYLK", "SVIL", "SVIL", "FOXM1", "HSPA1A /// HSPA1B", "TYMS"
    ),
    levels = c(
      "CRIM1 /// LOC101929500", "DHFR", "FOXM1", "HSPA1A /// HSPA1B",
      "MYLK", "SVIL", "TYMS"
    )
  )
)
答案 1 :(得分:1)
以下代码可以得到您想要的结果。它仅使用基础R(base R),tidyverse中可能有更简单的解决方案。
# Base R approach: expand every row whose GeneSymbol holds several symbols
# separated by "///" into one row per symbol, repeating its REF_ID.
# `map` is the mapping data frame from the question (columns REF_ID and
# GeneSymbol). It is only read here and never modified.

# Work on a character copy so the factor column in `map` keeps its levels.
symbols <- as.character(map$GeneSymbol)

# Split each entry on the literal separator; an entry without "///" comes
# back as a single piece.
pieces <- strsplit(symbols, "///", fixed = TRUE)

# strsplit() yields character(0) for "", which would silently drop that row;
# keep such rows with an empty symbol instead.
pieces[lengths(pieces) == 0L] <- list("")

# Repeat each REF_ID once per piece and strip the blanks that surrounded the
# separator. A zero-row `map` gives a zero-row result rather than NULL.
out <- data.frame(
  REF_ID = rep(map$REF_ID, lengths(pieces)),
  GeneSymbol = trimws(unlist(pieces))
)
out
# REF_ID GeneSymbol
#1 202533_s_at DHFR
#2 202534_x_at DHFR
#3 202551_s_at CRIM1
#4 202551_s_at LOC101929500
#5 202552_s_at CRIM1
#6 202552_s_at LOC101929500
#7 202555_s_at MYLK
#8 202565_s_at SVIL
#9 202566_s_at SVIL
#10 202580_x_at FOXM1
#11 202581_at HSPA1A
#12 202581_at HSPA1B
#13 202589_at TYMS