我有一个600 Mb FASTA文件,其中包含12种物种的许多对齐块,我想将它们拆分为较小的FASTA文件,其中包含一个块,每个块都有相应的对齐
我有一个看起来像这样的sed脚本:
#!/bin/bash
echo
for i in {0..Nblocks}; do
sed -n "/block_index=$i|/,/^$/p" genome12species.fasta > bloque$i.fasta
done
这适用于小规模,但对于600Mb的大文件,它需要太长时间,大约2天。我不认为这是我正在运行的计算机的问题。
有谁知道如何加快速度?
输入的Fasta文件如下所示:
dm3.chr3R( - ):17092630-17092781 | sequence_index = 0 | block_index = 4 | =物种DM3 | dm3_4_0 GGCGGAGATCAAGAATCGCGTCGGGCCGCCGTCCAGCGCCACTGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAACACCAAATCCGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta droGri2.scaffold_15074( - ):2610183-2610334 | sequence_index = 0 | block_index = 4个| =物种droGri2 | droGri2_4_0 GGCGGAGATCAAGAATCGTGTTGGGCCGCCGTCGAGCGCCACCGATAACGCTAGCAAAGTGAAAATCGATCAGGGACGCCCAGTGGAAAACAATAGATCTGGTTGCTGCTAAATAA-CTCTGATTGTGAATCATTATTTTATTATACAATTa droMoj3.scaffold_6540(+):33866311-33866462 | sequence_index = 0 | block_index = 4个| =物种droMoj3 | droMoj3_4_0 TGCCGAGATTAAGAATCGTGTCGGTCCGCCGTCCAGCGCAACCGACAATGCAAGCAAAGTGAAAATCGATCAGGGACGTCCAGTGGAGAACACCAGATCTGGTTGCTGCTGAATAA-CTCTGATTGTGAATCATTATTTTATTatacaatta droVir3.scaffold_12822(+):1248119-1248270 | sequence_index = 0 | block_index = 4 | =物种droVir3 | droVir3_4_0 GGCCGAGATTAAGAATCGCGTCGGGCCGCCGTCCAGCGCCACCGATAATGCTAGCAAAGTGAAAATCGATCAGGGTCGTCCAGTGGAGAACACCAAATCTGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta droWil1.scaffold_181130( - ):16071336-16071488 | sequence_index = 0 | block_index = 4个| =物种droWil1 | droWil1_4_0 GGCCGAGATTAAGAATCGTGTTGGGCCGCCGTCCAGCGCCACTGATAATGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAATACCAAATCCGGTTGCTGCTGAATAAACTCTGATTGTGAATCATTATTTTATTATACAATTA droPer1.super_19( - ):1310088-1310239 | sequence_index = 0 | block_index = 4 | =物种droPer1 | droPer1_4_0 GGCTGAGATCAAGAATCGCGTCGGACCGCCGTCCAGCGCCACCGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAAACCCAATTCTGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta dp4.chr2( - ):5593491-5593642 | sequence_index = 0 | block_index = 4 | =物种DP4 | dp4_4_0 GGCTGAGATCAAGAATCGCGTCGGACCGCCGTCCAGCGCCACCGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAAGCCCAATTCTGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta droAna3.scaffold_13340( - ):3754154-3754305 | sequence_index = 0 | block_index = 4 | =物种droAna3 | droAna3_4_0 GGCCGAGATCAAGAATCGCGTCGGGCCACCGTCCAGCGCCACCGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAACACCAGATCCGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattataaaatta droEre2.scaffold_4770(+):4567591-4567742 | sequence_index = 0 | block_index = 4 | =物种droEre2 | droEre2_4_0 GGCCGAGATCAAGAATCGCGTCGGGCCGCCGTCCAGCGCCACCGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAACACCAAATCCGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta droYak2.chr3R( - ):5883047-5883198 | sequence_index = 0 | block_index = 4个| =物种droYak2 | droYak2_4_0 GGCCGAGATCAAGAATCGCGTCGGGCCGCCATCCAGCGCCACCGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAACACCAAATCCGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta droSec1.super_38(+):36432-36583 | sequence_index = 0 | block_index = 4个| =物种droSec1 | droSec1_4_0 GGCGGAGATCAAGAATCGCGTCGGTCCGCCGTCCAGCGCCACTGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAACACCAAATCCGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta droSim1.chr3R(+):4366350-4366501 | sequence_index = 0 | block_index = 4 | =物种droSim1 | droSim1_4_0 GGCGGAGATCAAGAATCGCGTCGGGCCGCCGTCCAGCGCCACTGACAACGCTAGCAAAGTGAAAATCGATCAAGGACGTCCAGTGGAAAACACCAAATCCGGTTGCTGCTGAATAA-CTCTGATTGTGAATCattattttattatacaatta
dm3.chr3R( - ):17092781-17092867 | sequence_index = 0 | block_index = 5 |物种=分米3 | dm3_5_0 GAGTACGCCGCCCAGTTAGGCATTCCATTCCTTGAAACTTCGGCCAAGAGCGCCACCAACGTTGAGCAGGCCTTCATGACGATGGC droSim1.chr3R(+):4366264-4366350 | sequence_index = 0 | block_index = 5 |物种= droSim1 | droSim1_5_0 GAGTACGCCGCCCAGTTAGGCATTCCATTCCTTGAAACTTCGGCCAAGAGCGCCACCAACGTTGAGCAGGCCTTTATGACGATGGC droSec1.super_38(+):36346-36432 | sequence_index = 0 | block_index = 5个|物种= droSec1 | droSec1_5_0 GAGTACGCCGCCCAGTTAGGCATTCCATTCCTTGAAACTTCGGCCAAGAGCGCCACCAACGTTGAGCAGGCCTTCATGACGATGGC droYak2.chr3R( - ):5883198-5883284 | sequence_index = 0 | block_index = 5个|物种= droYak2 | droYak2_5_0 GAGTACGCCGCCCAGTTAGGCATTCCATTCCTTGAAACATCGGCCAAGAGCGCCACCAACGTGGAGCAGGCCTTCATGACGATGGC droEre2.scaffold_4770(+):4567505-4567591 | sequence_index = 0 | block_index = 5 |物种= droEre2 | droEre2_5_0 GAGTACGCCGCCCAGTTAGGCATTCCATTCCTTGAAACTTCGGCCAAGAGCGCCACCAACGTGGAGCAGGCCTTCATGACGATGGC droAna3.scaffold_13340(+):20375068-20375148 | sequence_index = 0 | block_index = 5 |物种= droAna3 | droAna3_5_0 ------ GCCGAAAACTTCGACATGCCCTTCTTCGAGGTCTCTTGCAAGTCAAACATCAATATTGAAGATGCGTTTCTTTCCCTGGC dp4.chr2( - ):5593642-5593728 | sequence_index = 0 | block_index = 5 |物种= DP4 | dp4_5_0 GAGTATGCAGCTCAGTTAGGCATTCCATTTCTTGAAACTTCGGCCAAGAGCGCCACGAACGTGGAGCAGGCCTTCATGACGATGGC droPer1.super_19( - ):1310239-1310325 | sequence_index = 0 | block_index = 5 |物种= droPer1 | droPer1_5_0 GAGTATGCAGCTCAGTTAGGCATTCCATTTCTTGAAACTTCGGCCAAGAGCGCCACGAACGTGGAGCAGGCCTTCATGACGATGGC droWil1.scaffold_181130( - ):16071488-16071574 | sequence_index = 0 | block_index = 5个|物种= droWil1 | droWil1_5_0 GAATATGCGGCTCAGTTAGGCATTCCATTCCTTGAAACTTCGGCAAAGAGTGCCACCAATGTGGAGCAGGCCTTTATGACGATGGC droVir3.scaffold_12822(+):1248033-1248119 | sequence_index = 0 | block_index = 5 |物种= droVir3 | droVir3_5_0 GAGTACGCACATCAGTTAGGCATTCCATTCCTTGAAACTTCGGCCAAGAGCGCCACCAACGTGGAGCAGGCATTTATGACGATGGC droMoj3.scaffold_6540(+):33866225-33866311 | sequence_index = 0 | block_index = 5个|物种= droMoj3 | droMoj3_5_0 GAGTATGCACATCAGTTAGGCATTCCATTCCTTGAAACTTCGGCCAAGAGCGCCACCAATGTAGAGCAGGCATTCATGACGATGGC droGri2.scaffold_15074( - ):2610334-2610420 | sequence_index = 0 | block_index = 5个|物种= droGri2 | droGri2_5_0 GAGTACGCAAATCAGTTAGGCATTCCATTCCTTGAAACTTCGGCGAAGAGTGCCACCAATGTGGAACAGGCATTCATGACGATGGC
答案 0 :(得分:1)
这是一个让你开始的awk oneliner - 它使用与你的sed相同的正则表达式范围 - 匹配的block_index是m [1] - 600MB应该只需几分钟
awk 'match($0, /block_index=([0-9]+)\|/, m),/^$/ {print >"bloque"m[1]".fasta"}'