I have a raw, unaligned fastq.gz file that I am trying to preprocess using Biopython before alignment. I would ultimately like to remove low quality reads, trim polyA tails, trim adapters using fuzzy matching, and finally remove reads that do not satisfy a length requirement after all said preprocessing. It would also be neat to specify how many reads satisfy the filtering criteria at each step. I have been playing around with this Biopython script but have had little success.
# coding: utf-8
'''
process_reads.py
March 31, 2016
Convert FQ files to unaligned and tagged BAMs
I: compressed FQ
O: compressed, filtered FQ
'''
import gzip, statistics
from Bio import SeqIO, pairwise2
def get_stats(reads):
sizes = [len(r) for r in reads]
print("Total reads: %i" % len(sizes))
print("Mean read length: %i" % statistics.mean(sizes))
print("Max. read length: %i" % max(sizes))
print("Min. read length: %i" % min(sizes))
def quality_filter(reads, qual):
return (r for r in reads if min(r.letter_annotations["phred_quality"]) >= qual)
def trim_polyA(records, numA, minLen):
for record in records:
if len(record) < minLen: continue
record = record.seq.split("A"*numA, 1)[0]
yield record
def _remove_adaptor(seq, region, right_side=True):
if right_side:
try:
pos = seq.find(region)
# handle Biopython SeqRecords
except AttributeError:
pos = seq.seq.find(region)
return seq[:pos]
else:
try:
pos = seq.rfind(region)
# handle Biopython SeqRecords
except AttributeError:
pos = seq.seq.rfind(region)
return seq[pos+len(region):]
def trim_adaptor(seq, adaptor, num_errors, right_side=True):
gap_char = '-'
exact_pos = str(seq).find(adaptor)
if exact_pos >= 0:
seq_region = str(seq[exact_pos:exact_pos+len(adaptor)])
adapt_region = adaptor
else:
seq_a, adaptor_a, score, start, end = pairwise2.align.localms(str(seq),
str(adaptor),
5.0, -4.0, -9.0, -0.5,
one_alignment_only=True,
gap_char=gap_char)[0]
adapt_region = adaptor_a[start:end]
seq_region = seq_a[start:end]
matches = sum((1 if s == adapt_region[i] else 0) for i, s in enumerate(seq_region))
# too many errors -- no trimming
if (len(adaptor) - matches) > num_errors:
return seq
# remove the adaptor sequence and return the result
else:
return _remove_adaptor(seq, seq_region.replace(gap_char, ""),
right_side)
def process_reads(fq, qual, adapt, numA, minLen):
with gzip.open(fq) as f:
rawReads = SeqIO.parse(f, "fastq")
# get_stats(rawReads) # When I run this, everything downstream fails..
qualFil = quality_filter(rawReads, qual) # I think this work fine.
trimmedPoly = trim_polyA(qualFil, numA, minLen)
trimmedAdap = trim_adaptor(trimmedPoly, adapt, 2)
# count = SeqIO.write(trimmedAdap, "good_quality.fastq", "fastq")
# print(count)
# TEST PROCESSING
fq = "test/TAAGGCGA_2.fq.gz"
process_reads(fq, qual=50, adapt="AAGCAGTGGTATCAACGCAGAGTGAATGGG", numA=6, minLen=20)
I believe the quality filter and polyA trimming works correctly but I cannot seem to get the adapters to cut. I have also written a function called get_stats
that is supposed to return the average length and total reads. I would appreciate any help!
答案 0 :(得分:0)
我想知道right_side
始终设置为True
。因此,您的_remove_adaptor
方法只会尝试从右侧删除适配器:
... if right_side: try: pos = seq.find(region) # handle Biopython SeqRecords except AttributeError: pos = seq.seq.find(region) return seq[:pos]
但是如果你从右边切掉polyA(保留序列的左边部分),你还会在右边有一个适配器序列吗?我的猜测是你要从左侧切割适配器。
一些示例读取会有所帮助。