我有一个FASTA文件。我需要删除含有“N”的序列,以及所含不同碱基少于3种的序列。到目前为止的代码如下。另外,对于被删除的序列,我该如何同时删除其对应的序列ID行?
#!/usr/bin/perl
# Filter a FASTA file whose records are a ">" header line followed by a
# single sequence line.  A record is kept only when its sequence contains
# no "N" and at least three of the four bases A, T, G and C.  The header
# is written together with its sequence, so a rejected sequence takes its
# ID line with it.
#
# Usage: pass the input FASTA path as the first argument; the surviving
# records are written to "<input>_trimmed.fasta".
use strict;
use warnings;

my $input_path = $ARGV[0];
die qq{Usage: $0 <input.fasta>\n} if !defined $input_path;

# Three-argument open with lexical handles: the mode can never be injected
# through the file name, and the handles are scoped to this script.
open my $in_fh, '<', $input_path
    or die qq{Failed to open "$input_path" for input: $!\n};
open my $out_fh, '>', "${input_path}_trimmed.fasta"
    or die qq{Failed to open for output: $!\n};

while ( my $header = <$in_fh> ) {

    # Only header lines start a record; ignore anything else.
    next if $header !~ m/^>/;

    # The sequence is the line right after the header.  A header on the
    # last line of the file has no sequence, so stop there.
    my $sequence = <$in_fh>;
    last if !defined $sequence;

    # Reject sequences containing an ambiguous base.
    next if $sequence =~ /N/;

    # grep in scalar context counts how many of the four bases occur.
    my $distinct_bases = grep { index( $sequence, $_ ) >= 0 } qw(A T G C);
    next if $distinct_bases < 3;

    # Decide first, print afterwards: header and sequence go out together.
    print {$out_fh} $header, $sequence;
}

close $in_fh;

# Buffered write errors only surface when the output handle is closed.
close $out_fh or die qq{Failed to close output: $!\n};
INPUT
>seq1
ATGCGGGATGATCCGAACGTTTAATCTCGTATGCCGTCTTCTATCTCNNN
>seq2
GATGAGCTTGACTCTAGTCCATCTCGTATGCCGTCTTCTGCTATCTCGTA
>seq3
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTC
>seq4
TGGTACTGTAAGCATGAGAGTAATCTCGTATGCCGTCTTCTGCTTGAAAA
OUTPUT
>seq2
GATGAGCTTGACTCTAGTCCATCTCGTATGCCGTCTTCTGCTATCTCGTA
>seq4
TGGTACTGTAAGCATGAGAGTAATCTCGTATGCCGTCTTCTGCTTGAAAA
答案 0 :(得分:0)
# Copy each FASTA record (a ">" header line plus the single sequence line
# after it) from FILE to match_fh, keeping only sequences that contain no
# "N" and at least three of the four bases A, T, G and C.
while (my $head = <FILE>) {
    # Skip anything that is not a record header.
    next if $head !~ /^>/;

    # Read the sequence into a lexical instead of clobbering the global $_.
    my $seq = <FILE>;

    # A header on the last line of the file has no sequence after it.
    last if !defined $seq;

    # grep in scalar context counts how many of the four bases occur.
    my $distinct_bases = grep { index($seq, $_) >= 0 } qw(A T G C);

    if ($seq !~ /N/ && $distinct_bases >= 3) {
        print match_fh $head, $seq;
    }
}