我的数据集包含以下部分:
# Domain table (dput output), 10 rows. Columns:
#   domain    - domain identifier
#   chr       - chromosome, stored as character here
#   positions - parenthesised, comma-separated list of start..stop ranges
#   length    - integer length column
structure(list(domain = c("A1BG_-_-_0", "A1BG_-_-_1", "A1BG_-_-_2",
"A1BG_-_-_3", "A1BG_-_-_4", "A1BG_143228_143228_0", "A1BG_143228_143228_1",
"A1BG_143228_143228_2", "A1BG_143228_143228_3", "A1CF_-_-_0"),
chr = c("19", "19", "19", "19", "19", "19", "19", "19", "19",
"10"), positions = c("(58858387..58858395,58858718..58858719)",
"(58858998..58859006,58861735..58862017,58862756..58862766)",
"(58863018..58863053,58863648..58863673)", "(58863913..58863921,58864293..58864303)",
"(58864552..58864563,58864657..58864693,58864769..58864803)",
"(58858719..58858998)", "(58862766..58863018)", "(58863673..58863913)",
"(58864303..58864552)", "(52566488..52566640,52569653..52569717)"
), length = c(11L, 303L, 62L, 20L, 84L, 280L, 253L, 241L,
250L, 218L)), class = "data.frame", row.names = c(NA, -10L
))
positions
列指定由逗号分隔的一个或多个start..stop
位置的序列。
此外,我还有一个位置数据集(显示了一部分):
# Variant table (dput output), 9 rows: VarID, chr (integer here, unlike the
# character chr of the domain table) and a numeric position.
structure(list(VarID = 1:9, chr = c(19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 10L), position = c(58864801, 58863673, 58863673, 58863673,
58863673, 58863673, 58863673, 58863041, 52569689)), class = "data.frame", row.names = c(NA,
-9L))
我想在第二个数据集后面附加一列,以指定domain
所属的VarID
。
我想要的输出是:
# Desired result (dput output): the variant table with one extra character
# column, domain, holding a single domain identifier per VarID.
structure(list(VarID = 1:9, chr = c(19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 10L), position = c(58864801, 58863673, 58863673,
58863673, 58863673, 58863673, 58863673, 58863041, 52569689),
domain = c("A1BG_-_-_4", "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2",
"A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", "A1CF_-_-_0"
)), row.names = c(NA, -9L), class = "data.frame")
特别是,我无法使gsub
正常工作,这最终使我可以查询职位是否在start..stop
范围内。
答案 0 :(得分:0)
尝试foverlaps
中的data.table
:
library(data.table)
# Domain table parsed from inline text: columns domain, chr, positions, length.
# The text between the quotes is the data itself and must not be reformatted.
dtt.domain <- fread(text = ' domain chr positions length
A1BG_-_-_0 19 (58858387..58858395,58858718..58858719) 11
A1BG_-_-_1 19 (58858998..58859006,58861735..58862017,58862756..58862766) 303
A1BG_-_-_2 19 (58863018..58863053,58863648..58863673) 62
A1BG_-_-_3 19 (58863913..58863921,58864293..58864303) 20
A1BG_-_-_4 19 (58864552..58864563,58864657..58864693,58864769..58864803) 84
A1BG_143228_143228_0 19 (58858719..58858998) 280
A1BG_143228_143228_1 19 (58862766..58863018) 253
A1BG_143228_143228_2 19 (58863673..58863913) 241
A1BG_143228_143228_3 19 (58864303..58864552) 250
A1CF_-_-_0 10 (52566488..52566640,52569653..52569717) 218')
# Variant table parsed from inline text: columns VarID, chr, position.
dtt.var <- fread(text = 'VarID chr position
1 19 58864801
2 19 58863673
3 19 58863673
4 19 58863673
5 19 58863673
6 19 58863673
7 19 58863673
8 19 58863041
9 10 52569689')
# Expand each domain into one row per start..stop region: strip the
# parentheses, then split the remaining string on commas.
# unlist() is used instead of [[1]] so that no regions are silently dropped
# if a (domain, chr) group ever contains more than one row.
dtt.domain2 <- dtt.domain[, .(
  region = unlist(
    strsplit(gsub('\\(|\\)', '', positions), ',', fixed = TRUE),
    use.names = FALSE)),
  by = .(domain, chr)]
# Split "start..end" on the literal ".." into two columns; type.convert = TRUE
# converts the pieces from character so they can be used as interval bounds.
dtt.domain2[, c('start', 'end') := tstrsplit(
  region, '..', fixed = TRUE, type.convert = TRUE)]
# foverlaps() needs the lookup table keyed, with the interval start/end
# columns as the last two key columns.
setkeyv(dtt.domain2, c('chr', 'start', 'end'))
# Represent every variant as the zero-width interval [position, position].
# Note: := adds the start/end helper columns to dtt.var by reference.
dtt.var[, `:=`(start = position, end = position)]
# mult = 'first' returns at most one region per variant. A position lying on
# a boundary shared by two regions (e.g. 58863673) overlaps both, and only
# the first match in the keyed (sorted) dtt.domain2 is kept. Variants that
# overlap no region get NA for the domain columns (default nomatch).
res <- foverlaps(dtt.var, dtt.domain2, mult = 'first')
res[, .(VarID, chr, position, domain)]
# VarID chr position domain
# 1: 1 19 58864801 A1BG_-_-_4
# 2: 2 19 58863673 A1BG_-_-_2
# 3: 3 19 58863673 A1BG_-_-_2
# 4: 4 19 58863673 A1BG_-_-_2
# 5: 5 19 58863673 A1BG_-_-_2
# 6: 6 19 58863673 A1BG_-_-_2
# 7: 7 19 58863673 A1BG_-_-_2
# 8: 8 19 58863041 A1BG_-_-_2
# 9: 9 10 52569689 A1CF_-_-_0
答案 1 :(得分:0)
另一个使用dplyr
和tidyr
的选项
library(dplyr)
library(tidyr)
# Inputs are not defined in this snippet. `df` is presumably the domain table
# from the question, and `locations` the variant table with its columns named
# chrno / loc instead of chr / position -- TODO confirm against the caller.
df %>%
  # Drop the surrounding parentheses; make chr integer so it can be joined
  # against the integer chromosome column of `locations`.
  mutate(positions = gsub("[()]", "", positions),
         chr = as.integer(chr)) %>%
  # One row per start..stop region.
  separate_rows(positions, sep = ",") %>%
  # convert = TRUE makes start/end numeric. Without it they stay character and
  # the range filter below compares strings, which is only correct by accident
  # when every number has the same count of digits.
  separate(positions, c("start", "end"), sep = "\\.\\.", convert = TRUE) %>%
  left_join(locations, by = c("chr" = "chrno")) %>%
  # Half-open interval (start, end]: a location equal to a region's start is
  # NOT matched, so a location on a boundary shared by two adjacent regions is
  # assigned only to the region that ends there.
  filter(loc > start & loc <= end) %>%
  arrange(VarID) %>%
  dplyr::select(VarID, chr, loc, domain)
# VarID chr loc domain
#1 1 19 58864801 A1BG_-_-_4
#2 2 19 58863673 A1BG_-_-_2
#3 3 19 58863673 A1BG_-_-_2
#4 4 19 58863673 A1BG_-_-_2
#5 5 19 58863673 A1BG_-_-_2
#6 6 19 58863673 A1BG_-_-_2
#7 7 19 58863673 A1BG_-_-_2
#8 8 19 58863041 A1BG_-_-_2
#9 9 10 52569689 A1CF_-_-_0
答案 2 :(得分:0)
{{1}}