我有一个包含遗传信息的数据集。
sub()
数据如下所示:
正如你所看到的,有两行,在第一行中,我有一个我想要修改的基因序列(字符序列)。该字符串是从较长的DNA序列中分组的(第一个序列的原始长度是16983252)。
SNP.position正在给我原始字符串中的字符位置。 SNP.pos.in.subset给了我相同的位置但仅在子集内(就像我在我的子集中以1开始计数一样)。因此对于第一序列:5480045和5480464指的是子集序列中的201ad219。
我想在201和219位置放置一些大括号,以便轻松找到这些位置的字符。
我创建了一个脚本来执行此操作。
structure(list(GenBank.Accession.version = structure(1:2, .Label = c("JH739893",
"JH751134"), class = "factor"), set = c(17L, 116L), snp.po.200.low = c(5480045,
-102), snp.po.200.up = c(5480464, 340), SNP.position = list(c(5480245L,
5480263L), c(98L, 139L)), seq2 = c("TTACATGGCAAGCACTCAATCTGGCTGCAGGGTGTCTGGCCACATACAAAACAAATGCCAAGTCACCTCTTGTCCCAAGGATCAAGACAAATTTGGACAACAAACCACACTGGCAGCCCCCTAGAAGCTTTCAGATATTTTAATGCCATTGAGATGTAGCATCCAGTGTAGACATTATTAGAAGCACAGCAGTTGCACTCGCACCTCCAGGGTGTCCAACATATGCTGGATTCTGGCATTGCTCATGGCAAGTGAGTTGGTGAATTCACAACTAGCCAGGTCATGTCTTCATTGCAGCAGAAAACTCATCAGCATGTCAGGATGAGAAAAGTCAATACAAAGGAAATGTGGGGATGGGATGGGATGGGATGGGATGGGATGGGATGGGATGGGATGGGATGGGATGGGATAGGGGGGTAA",
"AAAAAAAAAAAAAGAAAAGGGAATTTAAGGAGTCCCAGAGACAGGAGAATTCAGGACAATTTGCACCAATCACTTGCTCCTGGAAAGGAAGGTTGGGCTGATTTGGGGTTGGTAAGCACAGACCTTTCATCCGTTCGTAGAAAGAAGGAAAATTAAATCTCATGGCCTGTTTGTGAAAGGAAATTGCCCAGAATAGCTCTGACAGAATAAGCTATTCCACAATAGCTCCCCATGCGGACACTCCAGCCACTTTGTTCCAGGCTAATTAGTGTGCTTCCAAGCGCAGTAATTATCCTGGAAGGGAAATCTCTCCTCTCCCACAAAGAGTGTTTGCATGGAG"
), seq.length = c(16983252L, 753L), pos.list = list(5480045:5480464,
1:340), SNP.pos.in.subset = list(c(201L, 219L), c(98L, 139L
))), .Names = c("GenBank.Accession.version", "set", "snp.po.200.low",
"snp.po.200.up", "SNP.position", "seq2", "seq.length", "pos.list",
"SNP.pos.in.subset"), row.names = c(17L, 116L), class = "data.frame")
但是这个脚本给我发了这个错误:
add.target.snp = function(sequences,
pos.start = 200,
pos.end.added = 3,
character.start = "{/",
character.end = "}") {
old = as.character(sequences)
for(i in 1:length(old)){
up.else = SNP.position[i]+pos.end.added
old[i] = gsub(paste0('^(.{',pos.start,'})(.*)$'), paste0('\\1',character.start,'\\2'), old[i])
old[i] = gsub(paste0('^(.{',up.else, '})(.*)$'), paste0('\\1',character.end,'\\2'), old[i])
}
return(old)
}
output.target = add.target.snp(sequences = df$seq2,
pos.start = df$SNP.pos.in.subset,
pos.end.added = 3,
character.start = "{/",
character.end = "}")
有没有办法可以运行我的脚本,但是包含多个值来围绕“{/ my_value_at_position_201}”和“{/ my_value_at_position_219}”?
最终结果(对于我显示的数据中的第二行)应为
Error in gsub(paste0("^(.{", pos.start, "})(.*)$"), paste0("\\1", character.start, :
invalid regular expression '^(.{c(201, 219)})(.*)$', reason 'Invalid contents of {}' In addition: Warning message:
In gsub(paste0("^(.{", pos.start, "})(.*)$"), paste0("\\1", character.start, :
argument 'pattern' has length > 1 and only the first element will be used
我的脚本的另一个问题是,如果我在向量中添加一些字符(在我的情况下为3个字符:“{/}”),它将移动第二个数字的位置(201,219 + 3) ...有没有办法一次添加括号,以便数字不会改变?
答案 0 :(得分:4)
正则表达式是错误的工具。您想使用子串替换。基础substr
不允许您替换零长度字符串,但类似的应该起作用:
library(stringi)
library(purrr)
add_bits <- function(sequences,
pos.start = 200,
pos.end.added = 3,
character.start = "{/",
character.end = "}"
) {
# this row allows for the fact that your string is growing.
pos.start <- pos.start + c(0, cumsum(rep(nchar(character.start) +
nchar(character.end), length(pos.start) -1)))
for (ps in pos.start) {
stringi::stri_sub(sequences, ps, length = 0) <- character.start
stringi::stri_sub(sequences, ps + pos.end.added, length = 0) <- character.end
}
sequences
}
tmp <- c("abcde", "123456789")
purrr::map2(tmp, list(c(2,5), 3), add_bits)
## [[1]]
## [1] "a{/b}cd{/e}fg"
##
## [[2]]
## [1] "12{/3}4567"
答案 1 :(得分:1)
这是我尝试使用基础包:
add.target.snp = function(sequences, pos.start = NA,
character.start = "{/", character.end = "}"){
# check input
pos.start <- sort(pos.start[ pos.start <= nchar(sequences)])
# split on SNP positions
snps <- substring(
sequences, c(1, pos.start), c(pos.start - 1, nchar(sequences)))
# exclude "" SNP strings
snps <- snps[ snps != "" ]
# take 1st char and wrap, then paste the rest as is
x0 <- ""
if(!1 %in% pos.start){
x0 <- snps[1]
snps <- snps[2:length(snps)]}
res <- sapply(snps, function(snp){
x1 <- substr(snp, 1, 1)
x2 <- substr(snp, 2, max(2, nchar(snp)))
paste0(paste0(character.start, x1, character.end), x2)})
# return
paste(c(x0, res), collapse = "")
}
tmp <- c("abcde", "123456789")
purrr::map2(tmp, list(c(2,5), 3), add.target.snp)
# [[1]]
# [1] "a{/b}cd{/e}"
#
# [[2]]
# [1] "12{/3}456789"