我想在蛋白质序列中每10个字符插入一个换行符:
seq="MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"
在Perl中,它非常简单:
$seq=~s/(.{10})/$1\n/g ; # does the job!
perl -e '$seq="MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"; $seq=~s/(.{10})/$1\n/g; print $seq'
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
而在 Julia 中,
replace(seq, r"(.{10})" , "\n")
不起作用,因为我不知道如何引用捕获组 (.{10}) 并把它替换为它自身加上 "\n"
julia> replace(seq, r"(.{10})" , "\n")
"\n\n\n\n\n\n"
为此,我需要两个步骤:
julia> a=matchall(r"(.{1,10})" ,seq)
6-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"
"RFFPYIALFQ"
julia> b=join(a, "\n")
"MSKNKSPLLN\nESEKMMSEML\nPMKVSQSKLN\nYEEKVYIPTT\nIRNRKQHCFR\nRFFPYIALFQ"
julia> println(b)
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
# Caution :
a=matchall(r"(.{10})" ,seq) # wrong if seq is not exactly a multiple of 10 !
julia> seq
"MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIAL"
julia> matchall(r"(.{10})" ,seq)
5-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"
julia> matchall(r"(.{1,10})" ,seq)
6-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"
"RFFPYIAL"
是否有一步到位的解决方案,或者更好(更快)的方式?
下面给所有这些有趣的答案做一个基准测试!(已用 Julia 0.5 更新)
"""
    loop([io::IO = stdout,] a)

Print `a` to `io`, broken into lines of at most 10 characters and followed by a
final newline. Returns `nothing`.

The text is walked character by character rather than by byte index, so the
result is correct for non-ASCII input, for the empty string, and for lengths
that are not a multiple of 10. No separator is inserted after the last chunk.
"""
function loop(io::IO, a)
    salt = 10  # line width: a break goes after every `salt` characters
    wrapped = IOBuffer()
    for (k, c) in enumerate(a)
        # Break *before* the first character of every line except the first, so
        # the output never contains a stray separator inside or after the last
        # chunk.
        k > 1 && (k - 1) % salt == 0 && print(wrapped, '\n')
        print(wrapped, c)
    end
    println(io, String(take!(wrapped)))
    return nothing
end

loop(a) = loop(stdout, a)
"""
    regex1([io::IO = stdout,] seq)

Print `seq` to `io` in lines of at most 10 characters, followed by a final
newline. Returns `nothing`.

The chunks are the successive regex matches of `.{1,10}`, joined with newlines.
"""
function regex1(io::IO, seq)
    # `.{1,10}` (rather than `.{10}`) keeps a final chunk shorter than 10.
    # `eachmatch` is used because `matchall` no longer exists in Julia 1.x.
    chunks = (m.match for m in eachmatch(r".{1,10}", seq))
    println(io, join(chunks, "\n"))
    return nothing
end

regex1(seq) = regex1(stdout, seq)
"""
    regex2([io::IO = stdout,] seq)

Print `seq` to `io` in lines of at most 10 characters, followed by a final
newline. Returns `nothing`.

A space is appended after every run of 10 characters, the result is split on
whitespace, and the pieces are joined with newlines. Because whitespace is the
temporary separator, any whitespace already present in `seq` also becomes a
line break.
"""
function regex2(io::IO, seq)
    # The `pattern => replacement` pair is the Julia 1.x form of `replace`.
    spaced = replace(seq, r"(.{10})" => s"\1 ")
    println(io, join(split(spaced), "\n"))
    return nothing
end

regex2(seq) = regex2(stdout, seq)
"""
    regex3([io::IO = stdout,] seq)

Print `seq` to `io` in lines of at most 10 characters, followed by a final
newline. Returns `nothing`.

A single regex substitution appends a newline to every run of 10 characters.
"""
function regex3(io::IO, seq)
    # The s"..." literal cannot express a newline, so the substitution string is
    # built explicitly; the backslash of the group reference must be escaped.
    wrapped = replace(seq, r"(.{10})" => Base.SubstitutionString("\\1\n"))
    # When the length is a multiple of 10 the substitution leaves a trailing
    # newline; drop it so `println` does not emit an empty last line.
    println(io, chomp(wrapped))
    return nothing
end

regex3(seq) = regex3(stdout, seq)
"""
    intrapad([io::IO = stdout,] seq::AbstractString)

Print `seq` to `io` in lines of at most 10 characters, each line (including the
last one) terminated by a newline. Prints nothing for an empty `seq`. Returns
`nothing`.

The chunks are written as `SubString`s into one `IOBuffer`, so no intermediate
copies of the chunks are made.
"""
function intrapad(io::IO, seq::AbstractString)
    # Room for the text plus one newline per line. This is only a size hint; the
    # buffer still grows if needed, so output is never truncated.
    buf = IOBuffer(sizehint = ncodeunits(seq) + cld(ncodeunits(seq), 10))
    stop = lastindex(seq)
    i = firstindex(seq)
    while i <= stop
        # Index of the 10th character of this line, clamped to the last
        # character so a short final chunk does not run past the string.
        j = min(nextind(seq, i, 9), stop)
        print(buf, SubString(seq, i, j), '\n')
        i = nextind(seq, j)
    end
    # Every line already ends in a newline, hence `print` and not `println`.
    print(io, String(take!(buf)))
    return nothing
end

intrapad(seq::AbstractString) = intrapad(stdout, seq)
"""
    join_substring([io::IO = stdout,] seq)

Print `seq` to `io` in lines of at most 10 characters, followed by a final
newline. Returns `nothing`.

The chunks are `SubString`s of `seq` joined with newlines.
"""
function join_substring(io::IO, seq)
    # Start index of every character; indexing through this table keeps the
    # chunk boundaries valid for multi-byte characters.
    starts = collect(eachindex(seq))
    n = length(starts)
    # Clamp the end of the last chunk so a length that is not a multiple of 10
    # does not index past the string.
    chunks = (SubString(seq, starts[k], starts[min(k + 9, n)]) for k in 1:10:n)
    println(io, join(chunks, '\n'))
    return nothing
end

join_substring(seq) = join_substring(stdout, seq)
seq = "MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"

# Time every candidate five times in a row, always in the same order. Each
# entry pairs the label printed before the measurement with the function timed.
for _ in 1:5
    for (label, candidate) in (
        ("loop :", loop),
        ("regex1 :", regex1),
        ("regex2 :", regex2),
        ("regex3 :", regex3),
        ("intrapad :", intrapad),
        ("join substring :", join_substring),
    )
        println(label)
        @time candidate(seq)
    end
end
我把基准测试改成用 @time 连续执行 5 次,下面贴出的是第 5 次执行的结果:
loop :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIA
LFQ
0.000013 seconds (53 allocations: 3.359 KB)
regex1 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000013 seconds (49 allocations: 1.344 KB)
regex2 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000017 seconds (47 allocations: 1.703 KB)
regex3 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000013 seconds (31 allocations: 976 bytes)
intrapad :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000007 seconds (9 allocations: 608 bytes)
join substring :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000012 seconds (21 allocations: 800 bytes)
Intrapad现在是第一个;)
答案 0 :(得分:10)
正如 @daycaster 所建议的,您可以使用 s"\1" 作为替换字符串来引用捕获组。问题在于特殊的 s"" 字符串语法不支持 \n 之类的转义字符。您可以通过手动构造 SubstitutionString 对象来解决这个问题,但这时需要对 \1 中的 \ 进行转义:
julia> replace(seq, r"(.{10})", Base.SubstitutionString("\\1\n"))
"MSKNKSPLLN\nESEKMMSEML\nPMKVSQSKLN\nYEEKVYIPTT\nIRNRKQHCFR\nRFFPYIALFQ\n"
答案 1 :(得分:6)
如果速度是一个问题,最好避免使用较重的工具,例如正则表达式,并尽量让工作低级完成,如下所示:
"""
    intrapad(seq::AbstractString) -> String

Return `seq` with a newline appended after every run of 10 characters; a
shorter final run is newline-terminated as well. An empty `seq` gives an empty
string.

The chunks are written as `SubString`s into a single `IOBuffer`, which keeps
the number of allocations low.
"""
function intrapad(seq::AbstractString)
    # Size hint: the text plus one newline per line. The buffer grows on demand,
    # so this never truncates the result.
    out = IOBuffer(sizehint = ncodeunits(seq) + cld(ncodeunits(seq), 10))
    last_char = lastindex(seq)
    from = firstindex(seq)
    while from <= last_char
        # End the chunk on its 10th character, or on the last character of
        # `seq` when fewer than 10 remain (avoids indexing past the end).
        upto = min(nextind(seq, from, 9), last_char)
        print(out, SubString(seq, from, upto), '\n')
        from = nextind(seq, upto)
    end
    return String(take!(out))
end
速度来自于使用IOBuffer和SubStrings最小化分配。使用BenchmarkTools包我们有:
julia> @benchmark intrapad(seq)
BenchmarkTools.Trial:
memory estimate: 624.00 bytes
allocs estimate: 10
minimum time: 729.00 ns (0.00% GC)
median time: 767.00 ns (0.00% GC)
mean time: 862.99 ns (7.84% GC)
maximum time: 26.86 μs (96.21% GC)
julia> @benchmark replace(seq, r"(.{10})", Base.SubstitutionString("\\1\n"))
BenchmarkTools.Trial:
memory estimate: 720.00 bytes
allocs estimate: 26
minimum time: 2.18 μs (0.00% GC)
median time: 2.29 μs (0.00% GC)
mean time: 2.43 μs (3.85% GC)
maximum time: 531.31 μs (98.95% GC)
只快了约 2.5 倍,可见 replace 函数的实现已经相当高效!
没有正则表达式的另一种方法是
# The end index is clamped so that a final chunk shorter than 10 does not run
# past the string (assumes one byte per character, as in a protein sequence).
join((SubString(seq, i, min(i + 9, length(seq))) for i in 1:10:length(seq)), '\n')
这种写法速度不算快(在我的机器上大约慢 10 倍,但没有额外的内存分配代价),不过可读性很好。
答案 2 :(得分:3)
类似的东西:
julia> split(replace(seq, r"(.{10})", s"\1 "))
6-element Array{SubString{String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"
"RFFPYIALFQ"
如果您想得到一个字符串,请使用 join():
julia> join(split(replace(seq, r"(.{10})", s"\1 ")), "\n")
"MSKNKSPLLN\nESEKMMSEML\nPMKVSQSKLN\nYEEKVYIPTT\nIRNRKQHCFR\nRFFPYIALFQ"
julia> println(ans)
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
答案 3 :(得分:3)
我不知道如何用正则表达式来做,但我认为下面的代码可以解决您的问题:
a = "oiaoueaoeuaoeuaoeuaoeuaoteuhasonetuhaonetuahounsaothunsaotuaosu"
# Line width: a newline goes after every `salt` characters (10 in your case).
salt = 10
# Build the result in a buffer. The loop only mutates the buffer and never
# reassigns a global, so the script behaves the same in the REPL and in a file.
wrapped = IOBuffer()
for (k, c) in enumerate(a)
    # Start a new line before characters 11, 21, 31, ... Nothing is inserted
    # after the last chunk, so the tail of the string is left intact.
    k > 1 && (k - 1) % salt == 0 && print(wrapped, '\n')
    print(wrapped, c)
end
a = String(take!(wrapped))
println(a)