我使用下面的脚本对 FASTQ 文件进行配对,但它报错了。有人可以帮我配对这些文件吗?我的文件如下所示:
@MexD1ASRR1561197.13.1/1
TCAAAAGGAGAACTCAATAGGCTGAACAAGTTATCTTCTGGGATTGTAATGAGAGTTGCTTCACTGCTTTGGAAGAAGAAAGCTCAT
+SRR1561197.13.1/1
JJJJJIJJIIJJIJJJIJIIJJJJJJJJJJIIIIIJJJJJJJHIJJIGJJJJJJJGIJJJJJGIIHHHHHFFEFFDEEDEDCACDDD
@MexD1ASRR1561197.17.1/1
TATACAAAGCTGTCAACTTGATCTTCATACTTCTCATAAAGGACTGGTAATGTGTGGGCAGCAACGAAACCAACATATAAAACAGTC
+SRR1561197.17.1/1
HHGJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJJIJJJJJJIJJJJIIIIIIJIIHCHHFFFDDEDDDDDDEEEDCDDCCA
@MexD1ASRR1561197.19.1/1
GATCAACAGTACTGGAATGGCCATCCATCACAAGTTCAGCTAAAGCAGCTCCTGTTGCAGGACCGTTTAGAATACCCCAGCAACTGT
+SRR1561197.19.1/1
IJJJJJJJJHIIJJJIIIIJIJJJJJJJJJJJJJIIIJIJIIJJGHIIFIJJIHHHGHHFFBDCD>BDBDCCDCCACBD??CDBCCC
@MexD1ASRR1561197.23.1/1
CAATCTTAAATAAACTGCCCGTTGCTCAAGGGCATTCCTTCTCATACACATCATTGGCATACTTCCAGTTGATCACTTCCCAAATGT
+SRR1561197.23.1/1
JJJJIJJIJJJIJJJIHJJJJJJJJJJJJJJJJJJJIJJJJJJJJIJGIGIIIIJIGHHHHFFFFFFF>CECEEEDDDDDDDCBCCD
@MexD1ASRR1561197.24.1/1
AATTGAAGCTTGAGACTTTGATGGGTCAATGAGAAACCAAATTTTAGTGTATATATTGTGAGGTTTCTGATGTTTTATGGCATATAT
+SRR1561197.24.1/1
JJJJJJJJJJIJJJJJJJJJIJJJJHHJJJJJJJJJJJJJIJJJIIJGHFHHJJJJJJIIGHH?AEFFFFFFFEEEEECCBDCDDEF
@MexD1ASRR1561197.32.1/1
TTGGCTTCTATCTTCTTCTTGTGCTCCTCATCCTCAGACTTGTACTTCTCTGCCTCCTGAACCATCTTTTCAATCTCGTCCTTTGAC
+SRR1561197.32.1/1
JJJJJJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJIJJJJJIHIIJJJJJJIJJJJJJIHIJHIJHHHHHHHFFFFFFDDDBCCAAC
@MexD1ASRR1561197.34.1/1
CCTGTTTGATGCGATCCATTTCATCCTTCACCTGCTTCTTCTTTGCTTTTCTCACAACAGGTTGCACTTTATTACATGCCATTTTAT
+SRR1561197.34.1/1
JJJJJIJJIJJJJJJJJJJJJJJJJJJJJJJJIJJJJJJJJJJJIIJIJJIJIGIIJJIGI==CEHHFFFFFFDEEECEEDDDEDFD
@MexD1ASRR1561197.36.1/1
TGAATCAAAAAGGTCTAACAATCTGAGAACAAAAGAGTGATCAACATACCTCTTAGCCAATTTTGCATCTGTCTCTGGTGATGCCAC
+SRR1561197.36.1/1
JJJJJJJJJJJJJHHIJJJJJJJJJJJJJJJJIJJJJEHIJJJJJJJJJJJJJJIJJJJIJHEHHHHFFFFFFFEEEDEEDDDDDDD
@MexD1ASRR1561197.38.1/1
AACCATGCTCTTTACTCTTATTCACGCAAGTCAATTTAGCCTCCCCACTTAGCATAAGATCCACAAACCACCACCCACCAACCATAT
+SRR1561197.38.1/1
JJJJJJIJJJJJJJJIJJJJJJJJJIJIJJHIJIJIIJJJJIJJJJJIJJIIJJHHHHHHFFCEFDEDDD@DDDDDDDDBDDD?CCD
@MexD1ASRR1561197.39.1/1
GGCCGCATCTGCATCCTCTGTGCCAGCAACTGCTGATGAGCCAGACGTGTCATGGGTTCAGTCCTTGGTGAAGGATACCCCTACTGT
+SRR1561197.39.1/1
JJJJJJJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJJJJJJIJEHHHGFFDEFEEECCEDCCCDDDDDDCDDCCDDDDCDD
@MexD1ASRR1561197.41.1/1
TGGCAGAAAAGTTGATCTCACTGCCTATGACAGCTATGAAAAACTCTCAACTGCTGTTGATGAACTCTTCAGAGGCCTTCTTGCAGC
+SRR1561197.41.1/1
JJJJJJJJIJJIJJJJJJJIJJJIJJJJJGIJIJJIIJIJJJJJJJJIEHIJIJJJJJJIIJHHHHHHHFFFFFFDDEEDDDDDDDD
我的代码
#!/usr/bin/perl
# DESCRIPTION:
# This script is designed to compare two fastq files.
# It produces 4 output files
# - 2 files (one per input) listing the sequences common to both inputs
# - 2 files (one per input) listing the sequences unique to that input
############################################### ###############################
use strict;
use warnings;
use Getopt::Std;
use File::Basename;
## CONSTANTS ##
# Boolean stand-ins shared by the whole script.
my ($TRUE, $FALSE) = (1, 0);
# Debug output is off unless GetOptions() sees -d.
my $DEBUG = $FALSE;
# Exit code used for every successful exit.
my $EXITSTATUS = 0;

# Default umask: clear group-write and every permission for "other"
# on the files this script creates.
umask 027;

# Consume the -d / -h switches so that only the positional
# arguments are left in @ARGV.
GetOptions();

##############################
# M A I N P R O G R A M #
##############################
# Exactly two input files are required.
unless(@ARGV == 2)
{
    print STDERR "Incorrect number of arguments\n";
    Usage();
    exit(1);
}

# Report every missing input before giving up, not just the first one.
my $fail = $FALSE;
for my $file (grep { !-e $_ } @ARGV)
{
    print STDERR "File $file didn't exist\n";
    $fail = $TRUE;
}
exit(1) if($fail);

# Take both file names off the argument list.
my ($file1, $file2) = splice(@ARGV, 0, 2);

# Index the first file (sequence ID => byte offset of its record) ...
my %fastqIndex1 = %{ IndexFastq($file1) };

# ... then stream the second file against that index.
CompareFastq($file1, $file2, \%fastqIndex1);

exit($EXITSTATUS);
# Subroutines
sub Usage
{
    # Print the command-line synopsis to the currently selected output
    # handle (STDOUT unless the caller changed it).  The program name
    # shown is the base name of $0, i.e. without any directory part.
    my $script = basename($0);

    my @helpLines = (
        "Usage: $script [dh] file1 file2\n",
        "\td:\tDebug mode on (default off)\n",
        "\th:\tPrint this usage\n",
    );

    # One print per line, in order.
    print $_ for @helpLines;
    return;
}
sub GetOptions
{
    # Parse the single-letter switches accepted by this script out of
    # @ARGV (getopts removes the switches it recognises):
    #   -d  turn debug mode on
    #   -h  print the usage text and exit with $EXITSTATUS
    my %switch;
    getopts("dh", \%switch);

    # Debug mode is requested.
    $DEBUG = $TRUE if(defined($switch{'d'}));

    # Nothing more to do unless help was requested.
    return unless(defined($switch{'h'}));

    Usage();
    exit($EXITSTATUS);
}
sub IndexFastq
{
    # Build an index of a FASTQ file.
    #
    # Parameter:
    #   $file - path of the FASTQ file to index.
    # Returns:
    #   A reference to a hash that maps each sequence ID (the header line
    #   without its leading '@') to the byte offset at which that header
    #   line starts.  If an ID occurs more than once, the last offset wins.
    # Dies:
    #   If the file cannot be opened.
    my $file = shift;
    my %fastqIndex;

    # Three-argument open with a lexical handle: the name is used exactly
    # as given (no mode/pipe characters are interpreted and surrounding
    # whitespace is kept), and the handle cannot clash with a global one.
    open(my $in, '<', $file) or die("Could not open $file: $!\n");

    # Offset of the line about to be read.
    my $pos = tell($in);
    my $lineCounter = 1;
    while(my $line = <$in>)
    {
        chomp($line);
        # Each block is going to be of 4 lines
        # Let's get the seq ID from the sequence name
        if($line =~ m/^@(.*)/)
        {
            $fastqIndex{$1} = $pos;
            # Skip the sequence, separator and quality lines.  Skipping
            # them here also stops a quality string that happens to start
            # with '@' from being taken for a header.
            for(my $i=0; $i<3; $i++)
            {
                <$in>;
                $lineCounter++;
            }
        }
        elsif($line =~ m/^#/)
        {
            print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG);
        }
        elsif($line =~ m/^$/)
        {
            print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG);
        }
        else
        {
            print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG);
        }
        # Remember where the next line starts before reading it.
        $pos = tell($in);
        $lineCounter++;
    }
    close($in);
    return \%fastqIndex;
}
sub CompareFastq
{
    # Split two FASTQ files into "common" and "unique" records by
    # comparing their sequence IDs (header line without the leading '@').
    #
    # Parameters:
    #   $file1          - path of the first FASTQ file (already indexed).
    #   $file2          - path of the second FASTQ file (streamed here).
    #   $fastqIndex1Ref - hash reference for $file1 mapping each sequence
    #                     ID to the byte offset of its header line.
    # Output files (created or truncated):
    #   "$file1-common.out" / "$file2-common.out" - records whose ID is in
    #       both files, written in the order they appear in $file2.
    #   "$file2-unique.out" - records of $file2 whose ID is not in the index.
    #   "$file1-unique.out" - records of $file1 never seen in $file2,
    #       written in the order they appear in $file1.
    # Dies:
    #   If any file cannot be opened, or an output file cannot be closed.
    my ($file1, $file2, $fastqIndex1Ref) = @_;

    # IDs of file 1 that were also seen in file 2.
    my %found1;

    # We don't want to open/close file handles for every record, so open
    # them all once.  Three-argument open keeps the file names from being
    # interpreted as modes or pipes.
    open(my $f1cout, '>', "$file1-common.out") or die("Could not write to file: $file1-common.out: $!\n");
    open(my $f2cout, '>', "$file2-common.out") or die("Could not write to file: $file2-common.out: $!\n");
    open(my $f1uout, '>', "$file1-unique.out") or die("Could not write to file: $file1-unique.out: $!\n");
    open(my $f2uout, '>', "$file2-unique.out") or die("Could not write to file: $file2-unique.out: $!\n");
    open(my $f1in, '<', $file1) or die("Could not open $file1: $!\n");
    open(my $f2in, '<', $file2) or die("Could not open $file2: $!\n");

    while(my $line = <$f2in>)
    {
        chomp($line);

        # Skip empty lines or comments
        next if($line =~ m/^$/ or $line =~ m/^\s*#/);

        # Each block is going to be of 4 lines
        # Let's get the seq ID from the sequence name
        my ($seqId) = $line =~ m/^@(.*)/;
        if(!defined($seqId))
        {
            print STDERR "Could not match the sequence ID from the name: $line\n";
            next;
        }

        # Where the file 2 record goes depends on whether file 1 has the ID.
        my $f2out = $f2uout;
        if(defined($fastqIndex1Ref->{$seqId}))
        {
            $found1{$seqId} = 1;
            $f2out = $f2cout;

            # Print out the matching record from file 1
            seek($f1in, $fastqIndex1Ref->{$seqId}, 0);
            _CopyFastqLines($f1in, $f1cout, 4);
        }

        # Print out from file 2: the header already read, then the
        # remaining three lines of the record.
        print {$f2out} $line . "\n";
        _CopyFastqLines($f2in, $f2out, 3);
    }
    close($f1cout) or die("Could not close file: $file1-common.out: $!\n");
    close($f2cout) or die("Could not close file: $file2-common.out: $!\n");
    close($f2uout) or die("Could not close file: $file2-unique.out: $!\n");
    close($f2in);

    # Now the records of file 1 that were never matched.  Sorting by
    # offset writes them in file order instead of random hash order.
    my @unmatched = sort { $fastqIndex1Ref->{$a} <=> $fastqIndex1Ref->{$b} }
                    grep { !exists($found1{$_}) }
                    keys(%{$fastqIndex1Ref});
    foreach my $seqId (@unmatched)
    {
        seek($f1in, $fastqIndex1Ref->{$seqId}, 0);
        _CopyFastqLines($f1in, $f1uout, 4);
    }
    close($f1uout) or die("Could not close file: $file1-unique.out: $!\n");
    close($f1in);
    return;
}

# Copy up to $count lines from handle $in to handle $out.  Stops quietly
# at end of file so a truncated final record never prints undef.
sub _CopyFastqLines
{
    my ($in, $out, $count) = @_;
    for(my $i=0; $i<$count; $i++)
    {
        my $tmpLine = <$in>;
        last if(!defined($tmpLine));
        print {$out} $tmpLine;
    }
    return;
}
答案 0 :(得分:1)
您可以使用 Pairfq 执行此任务。如果修剪后的文件名为 seqs_1_trim.fq 和 seqs_2_trim.fq,则用法为:
curl -sL git.io/pairfq_lite | perl - makepairs -f seqs_1_trim.fq \
-r seqs_2_trim.fq \
-fp seqs_1_trim_p.fq \
-rp seqs_2_trim_p.fq \
-fs seqs_1_trim_s.fq \
-rs seqs_2_trim_s.fq
有关该命令的更多信息,请参阅其 wiki。顺便说一句,如果您有大量读段而内存不多,我建议安装该程序并使用索引方法。添加 --stats 选项会打印一些关于文件的有用信息,如该命令的 wiki 所示。