gzip文件导致TypeError:“ str”不支持缓冲区接口

时间:2019-03-15 01:59:23

标签: python

我运行了以下脚本:

#!/usr/bin/env python

#USAGE: python3 shuffle_pairs_fastq.py input_1.fastq.gz input2.fastq.gz > input12.fasta


from __future__ import print_function
import sys
import gzip

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

def parse_file(file_name, read_set):
    line_id = 0
    name = ''
    data = ''
    qual = ''
    valid = False
    with (gzip.open(file_name)) as f:
        for line in f:
            if (line_id == 0):
                if (valid):
                    if (len(name) == 0 or len(data) == 0 or len(data) != len(qual)):
                        eprint('File is not in FASTQ format')
                        sys.exit(1)
                    valid = False
                    if (name in read_set):
                        print(name + '2')
                    else:
                        read_set.add(name)
                        print(name + '1')
                    print(data)
                name = line.rstrip().split(' ')[0].replace('@','>',1)
                data = ''
                qual = ''
                line_id = 1
            elif (line_id == 1):
                if (line[0] == '+'):
                    line_id = 2
                else:
                    data += line.rstrip()
            elif (line_id == 2):
                qual += line.rstrip()
                if (len(qual) >= len(data)):
                    valid = True
                    line_id = 0

    if (valid):
        if (len(name) == 0 or len(data) == 0 or len(data) != len(qual)):
            eprint(len(name), len(data), len(qual))
            eprint('File is not in FASTQ format')
            sys.exit(1)
        if (name in read_set):
           print(name + '2')
        else:
           read_set.add(name)
           print(name + '1')
        print(data)

if __name__ == '__main__':

    read_set = set()

    if (len(sys.argv) > 1):
        parse_file(sys.argv[1], read_set)
    if (len(sys.argv) > 2):
        parse_file(sys.argv[2], read_set)

不幸的是,我遇到了以下错误:

> python3 shuffle_pair_end_reads.py R1.fq.gz R2.fq.gz 
Traceback (most recent call last):
  File "shuffle_pair_end_reads.py", line 65, in <module>
    parse_file(sys.argv[1], read_set)
  File "shuffle_pair_end_reads.py", line 33, in parse_file
    name = line.rstrip().split(' ')[0].replace('@','>',1)


TypeError: 'str' does not support the buffer interface

第一个输入文件的内容:

zcat R1.fq.gz 
@E00526:39:HNMN5CCXY:6:1101:3792:993 1:N:0:NTTCAGAA+NTTCGCCT
NTCATAACCATAGAATATGTAAATCTCTAAGTAGTAGTCTAAGACACCATATCCCTTTCGGGATCCGCCTATTCATTTATGCCCTACTACTTGAATACTTCTAATTATATTCCATTACTTTGCCTATCCAGTTCCGCTTGCATCGATATTT
+
#A<AFFFJJJ7FF7-A-J<FA-AF<FFFJA<JAF-A7-<-JFJJJ<<<JJJ--F7F<AJF-FFJFFFFF-J---7----<----A--77JFJA-----77-7---A----7----7--7--7----7---7---------7--7-------
@E00526:39:HNMN5CCXY:6:1101:5010:993 1:N:0:NTTCAGAA+NTTCGCCT
NACTCCTGTACATGGTCTAGATAGGGAGTATCTTGGAATTCTTGGTTGCCCATTATTCTCAAACCTGATGTATGAGTAAGTCCAACTCTCCGATCACTAGTTGTTCCTGAATATTCATGTAAACGGCCAGCTCGAGCCCTAAGATGCAGGT
+
#AAAF<FJJFFFJJJJJJFFJJJJJF7FAF<JFJJJA-FJFFF<FJAFFJJJJFAJJF-FJJJFJFJJFFJJFJF7F7-<---<FAFAJFFFA<F-<F--A--7A-77AJ7AFA<--7----7FF---77-7-7<-A7JJJ77-A---A7-

第二个输入文件的内容:

zcat R2.fq.gz 
@E00526:39:HNMN5CCXY:6:1101:3792:993 2:N:0:NTTCAGAA+NTTCGCCT
NTGGCGCTATTATTATCTCTAAGGGAAGTTAAGGATGAACTAGAAGCAGATAGATTGGTTAGACATGGTTGAACCGGCATAGCGATGGCGATGCATGCGGGGCTCGATGGCCGTCGTAATTGATTCTATTTTGAAGTATTCAAGTATTAGG
+
#<<A<A---FAFA<-FA7FFJ7AJFJ<F-FF-F<<-F-<FAF<-FJ7--<A-<-7<FJ-7-F-A-<AAFJJJ<-JFA--FF<-77AFF7<-AFJAF<F7AAA--7-7A-F-AF77A7-7-<<-7F--7<<J-A-7-AAJF7A<-<-7<A-<
@E00526:39:HNMN5CCXY:6:1101:5010:993 2:N:0:NTTCAGAA+NTTCGCCT
NTTGGAGTATGTTTTTCGAAGGAGCCGCAAACTTCAAAGGAGTGAGTATAGGAGCGGTTTTGGTATCAGAAAATGGTTAGTATTAATCGCTGTCCGCCAAGCTCAGATTCCCCTGCACCAACAATATGGCCGAGTACGCAGCCTGCATCTT
+
#AAAAFJJFJJJJJJJJJ<<FJFF-FFJJJJJFJJA<AFF-FJFAJAFJFFJ<AJJJAAFJFFJFJ<<FF<F--J7AAAJJJFFJJJJF<AJFJJJJJF7AJFJJJ-FAJ7FJFJJ<FJ7<FFFAA<FFJJF-A)-7A)7<JFFF-7-A--

我想念什么?

先谢谢您

0 个答案:

没有答案