我已经将错误指向了make_barcode函数和gc_cont函数,但无法弄明白。此外,我不确定我是否正在使用类实例正确...我似乎无法生成具有45%到55%gc内容的随机条形码。
import argparse
import random
import numpy as np
import time
nucl_list = ['A', 'C', 'G', 'T']
barcode_list = []
tested = []
tests = 0
def SLdistance(s1, s2):
"""Calculates the hamming distance between s1 and s2"""
# Initiate np array
matrix = np.zeros((len(s1) + 1, len(s2) + 1), dtype=np.int)
matrix[:, 0] = np.array([i for i in xrange(len(s1) + 1)])
matrix[0, :] = np.array([i for i in xrange(len(s2) + 1)])
# // Classical Levenshtein part
for i in xrange(1, len(s1) + 1):
for j in xrange(1, len(s2) + 1):
cost = 0
if s1[i - 1] != s2[j - 1]:
cost = 1
matrix[i, j] = min(matrix[(i - 1), (j - 1)] + cost,
matrix[i, (j - 1)] + 1,
matrix[(i - 1), j] + 1)
min_distance = matrix[len(s1)][len(s2)]
# New Sequence-Levenshtein part
# Truncating
for i in xrange(0, len(s1) + 1):
min_distance = min(min_distance, matrix[i, len(s2)])
# Elongating
for j in xrange(0, len(s2) + 1):
min_distance = min(min_distance, matrix[len(s1), j])
return min_distance
def complement(barcode):
"""returns the complement of the barcode"""
complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
complement = ''
for base in barcode:
complement += complement_dict[base]
return complement
def compare_complements(new_barcode):
"""Returns a count > 0 if generated barcode is a complement of
any in current list"""
complement_count = 0
global barcode_list
for barcode in barcode_list:
if complement(barcode) == complement(new_barcode):
complement_count += 1
return complement_count
class BarcodeGenerator(object):
def __init__(self, **kwargs):
for k, v in kwargs.iteritems():
setattr(self, k, v)
def gc_cont(self, barcode):
"""Returns the GC content of a barcode"""
gc = 0.0
for base in range(self.length):
if barcode[base] == 'C' or barcode[base] == 'G':
gc += 1
else:
gc += 0
cont = gc / self.length
return cont
def make_barcode(self):
"""Generates a random barcode from nucl_list"""
barcode = ''
while barcode == '':
for i in range(self.length):
barcode += random.choice(nucl_list)
if self.maxgc >= self.gc_cont(barcode) >= self.mingc:
bar_code = barcode
else:
barcode = ''
return bar_code
def compare_distances(self, new_barcode):
"""Compares the sequence-Levenshtein distance between
new barcode and old barcodes
Uses the S-L distance depending on # errors
to correct (2 * k + 1) k = errors"""
# Count number of barcodes with 'bad' distances
count = 0
global barcode_list
if self.errors == 1:
for barcode in barcode_list:
if SLdistance(new_barcode, barcode) < 3:
count += 1
elif self.errors == 2:
for barcode in barcode_list:
if SLdistance(new_barcode, barcode) < 5:
count += 1
return count
def compare_repeat(self, barcode):
"""Returns a count > 0 if 2 consecutive bases in a barcode are the same"""
count = 0
for i in range(self.length - 1):
if barcode[i] == barcode[i + 1]:
count += 1
return count
def compare_barcodes(self):
"""Main, monster function...
Compares a barcode list, which can correct up to 'num_errors'.
Also does an ongoing comparison of new generated barcodes and checks for:
1. The desired S-L distance between each barcode
2. Excludes self-complements
3. Excludes any barcodes that contain 2 duplicate consecutive bases"""
global tests
new_barcode = self.make_barcode()
tests += 1
if new_barcode not in tested:
tested.append(new_barcode)
if new_barcode not in barcode_list:
distance_count = self.compare_distances(new_barcode)
complement_count = compare_complements(new_barcode)
repeat_count = self.compare_repeat(new_barcode)
if distance_count > 0 or complement_count > 0 or repeat_count > 0:
pass
else:
barcode_list.append(new_barcode)
else:
pass
def generate(self):
now = time.time()
future = now + 5
while len(barcode_list) < self.number_barcodes:
self.compare_barcodes()
if time.time() >= future:
print "This is taking too much time...goodbye"
break
elif tests >= 200000:
print "Generated ONLY {0} barcodes before termination".format(len(barcode_list))
break
barcode_list.sort()
print "Correcting up to {0} error(s)".format(self.errors)
if 1 == self.errors:
min_dist = 3
elif 2 == self.errors:
min_dist = 5
print "Created {0} barcodes of length {1}, with a S-L distance of at least {2} " \
"and a gc content range between {3}% and {4}%.\n{5}"\
.format(len(barcode_list), self.length, min_dist, self.mingc*100, self.maxgc*100, barcode_list)
def genArgParser():
"""
Generate a command line argument parser for this script.
"""
parser = argparse.ArgumentParser()
parser.add_argument('length', nargs='?', default=6, type=int,
help='takes the number nucleotides for each barcode')
parser.add_argument('number_barcodes', nargs='?', default=10, type=int,
help='takes the number of barcodes to generate')
parser.add_argument('errors', nargs='?', default=1, type=int, choices=[1, 2],
help='gives the number of mismatches')
parser.add_argument('mingc', nargs='?', default=45.0, type=float,
help='enter the minimum gc count')
parser.add_argument('maxgc', nargs='?', default=55.0, type=float,
help='enter the maximum gc count')
parser.add_argument('-v', '--verbose', action='store_true')
return parser
def main():
parser = genArgParser()
args = parser.parse_args()
mingc = args.mingc / 100
maxgc = args.maxgc / 100
if args.verbose:
print 'Took the following arguments:\nlength: {0}\nnumber_barcodes: {1}\nerrors: {2}\n\
mingc: {3}\nmaxgc: {4}'.format(args.length, args.number_barcodes, args.errors, args.mingc, args.maxgc)
generator = BarcodeGenerator(length=args.length, number_barcodes=args.number_barcodes, errors=args.errors,
mingc=mingc, maxgc=maxgc)
generator.generate()
if __name__ == '__main__':
main()
感谢所有帮助人员!
答案 0 :(得分:2)
您的make_barcode
方法随机生成条形码,直到生成条形码为“GC含量”介于0.45和0.55之间的条形码。条形码的“GC内容”是它包含的“G”或“C”字符的数量,除以条形码的长度。
问题在于,不可能生成长度为7或9的条形码,GC含量在0.45之间。例如,取一个长度为7的字符串。如果它的GC含量为3/7 = .428,或GC含量为4/7 = .571,而不是GC含量在.45和.55之间。
因此make_barcode
会随机生成条形码,但永远不会找到满足条件的条形码,因为实际上有 no 条形码满足条件。
您必须重新考虑mingc
和maxgc
条件才能使此代码生效。