为什么我的程序不能使用整数7和9,而是所有其他程序?

时间:2015-06-03 16:30:06

标签: python class bioinformatics

我已经将错误指向了make_barcode函数和gc_cont函数,但无法弄明白。此外,我不确定我是否正在使用类实例正确...我似乎无法生成具有45%到55%gc内容的随机条形码。

import argparse
import random
import numpy as np
import time

nucl_list = ['A', 'C', 'G', 'T']
barcode_list = []
tested = []
tests = 0


def SLdistance(s1, s2):
    """Calculates the hamming distance between s1 and s2"""

    # Initiate np array
    matrix = np.zeros((len(s1) + 1, len(s2) + 1), dtype=np.int)
    matrix[:, 0] = np.array([i for i in xrange(len(s1) + 1)])
    matrix[0, :] = np.array([i for i in xrange(len(s2) + 1)])

    # // Classical Levenshtein part
    for i in xrange(1, len(s1) + 1):
        for j in xrange(1, len(s2) + 1):
            cost = 0
            if s1[i - 1] != s2[j - 1]:
                cost = 1
            matrix[i, j] = min(matrix[(i - 1), (j - 1)] + cost,
                               matrix[i, (j - 1)] + 1,
                               matrix[(i - 1), j] + 1)
    min_distance = matrix[len(s1)][len(s2)]

    # New Sequence-Levenshtein part

    # Truncating
    for i in xrange(0, len(s1) + 1):
        min_distance = min(min_distance, matrix[i, len(s2)])
    # Elongating
    for j in xrange(0, len(s2) + 1):
        min_distance = min(min_distance, matrix[len(s1), j])
    return min_distance


def complement(barcode):
    """returns the complement of the barcode"""
    complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    complement = ''
    for base in barcode:
        complement += complement_dict[base]
    return complement


def compare_complements(new_barcode):
    """Returns a count > 0 if generated barcode is a complement of
    any in current list"""
    complement_count = 0
    global barcode_list
    for barcode in barcode_list:
        if complement(barcode) == complement(new_barcode):
            complement_count += 1
    return complement_count


class BarcodeGenerator(object):


    def __init__(self, **kwargs):
        for k, v in kwargs.iteritems():
            setattr(self, k, v)

    def gc_cont(self, barcode):
        """Returns the GC content of a barcode"""
        gc = 0.0
        for base in range(self.length):
            if barcode[base] == 'C' or barcode[base] == 'G':
                gc += 1
            else:
                gc += 0
        cont = gc / self.length
        return cont

    def make_barcode(self):
        """Generates a random barcode from nucl_list"""
        barcode = ''
        while barcode == '':
            for i in range(self.length):
                barcode += random.choice(nucl_list)
            if self.maxgc >= self.gc_cont(barcode) >= self.mingc:
                bar_code = barcode
            else:
                barcode = ''
        return bar_code

    def compare_distances(self, new_barcode):
        """Compares the sequence-Levenshtein distance between
        new barcode and old barcodes
        Uses the S-L distance depending on # errors
        to correct (2 * k + 1) k = errors"""
        # Count number of barcodes with 'bad' distances
        count = 0
        global barcode_list
        if self.errors == 1:
            for barcode in barcode_list:
                if SLdistance(new_barcode, barcode) < 3:
                    count += 1
        elif self.errors == 2:
            for barcode in barcode_list:
                if SLdistance(new_barcode, barcode) < 5:
                    count += 1
        return count

    def compare_repeat(self, barcode):
        """Returns a count > 0 if 2 consecutive bases in a barcode are the same"""
        count = 0
        for i in range(self.length - 1):
            if barcode[i] == barcode[i + 1]:
                count += 1
        return count

    def compare_barcodes(self):
        """Main, monster function...
        Compares a barcode list, which can correct up to 'num_errors'.
        Also does an ongoing comparison of new generated barcodes and checks for:
        1. The desired S-L distance between each barcode
        2. Excludes self-complements
        3. Excludes any barcodes that contain 2 duplicate consecutive bases"""
        global tests
        new_barcode = self.make_barcode()

        tests += 1
        if new_barcode not in tested:
            tested.append(new_barcode)
        if new_barcode not in barcode_list:
            distance_count = self.compare_distances(new_barcode)
            complement_count = compare_complements(new_barcode)
            repeat_count = self.compare_repeat(new_barcode)
            if distance_count > 0 or complement_count > 0 or repeat_count > 0:
                pass
            else:
                barcode_list.append(new_barcode)
        else:
            pass

    def generate(self):
        now = time.time()
        future = now + 5
        while len(barcode_list) < self.number_barcodes:
            self.compare_barcodes()
            if time.time() >= future:
                print "This is taking too much time...goodbye"
                break
            elif tests >= 200000:
                print "Generated ONLY {0} barcodes before termination".format(len(barcode_list))
                break

        barcode_list.sort()
        print "Correcting up to {0} error(s)".format(self.errors)
        if 1 == self.errors:
            min_dist = 3
        elif 2 == self.errors:
            min_dist = 5
        print "Created {0} barcodes of length {1}, with a S-L distance of at least {2} " \
              "and a gc content range between {3}% and {4}%.\n{5}"\
            .format(len(barcode_list), self.length, min_dist, self.mingc*100, self.maxgc*100, barcode_list)


def genArgParser():
    """
    Generate a command line argument parser for this script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('length', nargs='?', default=6, type=int,
                        help='takes the number nucleotides for each barcode')
    parser.add_argument('number_barcodes', nargs='?', default=10, type=int,
                        help='takes the number of barcodes to generate')
    parser.add_argument('errors', nargs='?', default=1, type=int, choices=[1, 2],
                        help='gives the number of mismatches')
    parser.add_argument('mingc', nargs='?', default=45.0, type=float,
                        help='enter the minimum gc count')
    parser.add_argument('maxgc', nargs='?', default=55.0, type=float,
                        help='enter the maximum gc count')
    parser.add_argument('-v', '--verbose', action='store_true')
    return parser


def main():
    parser = genArgParser()
    args = parser.parse_args()
    mingc = args.mingc / 100
    maxgc = args.maxgc / 100
    if args.verbose:
        print 'Took the following arguments:\nlength: {0}\nnumber_barcodes: {1}\nerrors: {2}\n\
mingc: {3}\nmaxgc: {4}'.format(args.length, args.number_barcodes, args.errors, args.mingc, args.maxgc)

    generator = BarcodeGenerator(length=args.length, number_barcodes=args.number_barcodes, errors=args.errors,
                                 mingc=mingc, maxgc=maxgc)
    generator.generate()


if __name__ == '__main__':
    main()

感谢所有帮助人员!

1 个答案:

答案 0 :(得分:2)

您的make_barcode方法随机生成条形码,直到生成条形码为“GC含量”介于0.45和0.55之间的条形码。条形码的“GC内容”是它包含的“G”或“C”字符的数量,除以条形码的长度。

问题在于,不可能生成长度为7或9的条形码,GC含量在0.45之间。例如,取一个长度为7的字符串。如果它的GC含量为3/7 = .428,或GC含量为4/7 = .571,而不是GC含量在.45和.55之间。

因此make_barcode会随机生成条形码,但永远不会找到满足条件的条形码,因为实际上有 no 条形码满足条件。

您必须重新考虑mingcmaxgc条件才能使此代码生效。