我运行了以下脚本:
#!/usr/bin/env python
#USAGE: python3 shuffle_pairs_fastq.py input_1.fastq.gz input2.fastq.gz > input12.fasta
from __future__ import print_function
import sys
import gzip
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def parse_file(file_name, read_set):
line_id = 0
name = ''
data = ''
qual = ''
valid = False
with (gzip.open(file_name)) as f:
for line in f:
if (line_id == 0):
if (valid):
if (len(name) == 0 or len(data) == 0 or len(data) != len(qual)):
eprint('File is not in FASTQ format')
sys.exit(1)
valid = False
if (name in read_set):
print(name + '2')
else:
read_set.add(name)
print(name + '1')
print(data)
name = line.rstrip().split(' ')[0].replace('@','>',1)
data = ''
qual = ''
line_id = 1
elif (line_id == 1):
if (line[0] == '+'):
line_id = 2
else:
data += line.rstrip()
elif (line_id == 2):
qual += line.rstrip()
if (len(qual) >= len(data)):
valid = True
line_id = 0
if (valid):
if (len(name) == 0 or len(data) == 0 or len(data) != len(qual)):
eprint(len(name), len(data), len(qual))
eprint('File is not in FASTQ format')
sys.exit(1)
if (name in read_set):
print(name + '2')
else:
read_set.add(name)
print(name + '1')
print(data)
if __name__ == '__main__':
read_set = set()
if (len(sys.argv) > 1):
parse_file(sys.argv[1], read_set)
if (len(sys.argv) > 2):
parse_file(sys.argv[2], read_set)
不幸的是,我遇到了以下错误:
> python3 shuffle_pair_end_reads.py R1.fq.gz R2.fq.gz
Traceback (most recent call last):
File "shuffle_pair_end_reads.py", line 65, in <module>
parse_file(sys.argv[1], read_set)
File "shuffle_pair_end_reads.py", line 33, in parse_file
name = line.rstrip().split(' ')[0].replace('@','>',1)
TypeError: 'str' does not support the buffer interface
第一个输入文件的内容:
zcat R1.fq.gz
@E00526:39:HNMN5CCXY:6:1101:3792:993 1:N:0:NTTCAGAA+NTTCGCCT
NTCATAACCATAGAATATGTAAATCTCTAAGTAGTAGTCTAAGACACCATATCCCTTTCGGGATCCGCCTATTCATTTATGCCCTACTACTTGAATACTTCTAATTATATTCCATTACTTTGCCTATCCAGTTCCGCTTGCATCGATATTT
+
#A<AFFFJJJ7FF7-A-J<FA-AF<FFFJA<JAF-A7-<-JFJJJ<<<JJJ--F7F<AJF-FFJFFFFF-J---7----<----A--77JFJA-----77-7---A----7----7--7--7----7---7---------7--7-------
@E00526:39:HNMN5CCXY:6:1101:5010:993 1:N:0:NTTCAGAA+NTTCGCCT
NACTCCTGTACATGGTCTAGATAGGGAGTATCTTGGAATTCTTGGTTGCCCATTATTCTCAAACCTGATGTATGAGTAAGTCCAACTCTCCGATCACTAGTTGTTCCTGAATATTCATGTAAACGGCCAGCTCGAGCCCTAAGATGCAGGT
+
#AAAF<FJJFFFJJJJJJFFJJJJJF7FAF<JFJJJA-FJFFF<FJAFFJJJJFAJJF-FJJJFJFJJFFJJFJF7F7-<---<FAFAJFFFA<F-<F--A--7A-77AJ7AFA<--7----7FF---77-7-7<-A7JJJ77-A---A7-
第二个输入文件的内容:
zcat R2.fq.gz
@E00526:39:HNMN5CCXY:6:1101:3792:993 2:N:0:NTTCAGAA+NTTCGCCT
NTGGCGCTATTATTATCTCTAAGGGAAGTTAAGGATGAACTAGAAGCAGATAGATTGGTTAGACATGGTTGAACCGGCATAGCGATGGCGATGCATGCGGGGCTCGATGGCCGTCGTAATTGATTCTATTTTGAAGTATTCAAGTATTAGG
+
#<<A<A---FAFA<-FA7FFJ7AJFJ<F-FF-F<<-F-<FAF<-FJ7--<A-<-7<FJ-7-F-A-<AAFJJJ<-JFA--FF<-77AFF7<-AFJAF<F7AAA--7-7A-F-AF77A7-7-<<-7F--7<<J-A-7-AAJF7A<-<-7<A-<
@E00526:39:HNMN5CCXY:6:1101:5010:993 2:N:0:NTTCAGAA+NTTCGCCT
NTTGGAGTATGTTTTTCGAAGGAGCCGCAAACTTCAAAGGAGTGAGTATAGGAGCGGTTTTGGTATCAGAAAATGGTTAGTATTAATCGCTGTCCGCCAAGCTCAGATTCCCCTGCACCAACAATATGGCCGAGTACGCAGCCTGCATCTT
+
#AAAAFJJFJJJJJJJJJ<<FJFF-FFJJJJJFJJA<AFF-FJFAJAFJFFJ<AJJJAAFJFFJFJ<<FF<F--J7AAAJJJFFJJJJF<AJFJJJJJF7AJFJJJ-FAJ7FJFJJ<FJ7<FFFAA<FFJJF-A)-7A)7<JFFF-7-A--
我想念什么?
先谢谢您