Question

这是我提出的上一个问题的后续问题：Processing a sub-list of variable size within a larger list。

我设法使用itertools来获取DNA片段组，但现在我遇到了一个不同的问题。

我需要根据这些DNA片段组设计引物。通过包括来自不同DNA片段的重叠来设计引物。假设我在列表中有三个DNA片段，片段A，B和C.我需要提取：

C 的最后 20 个核苷酸（n.t.）与 A 的前 40 n.t.（按此顺序）连接，
B 的前 20 n.t. 的反向互补序列（RC）与 A 的最后 40 n.t. 的 RC（按此顺序）连接，
A 的最后 20 n.t. 与 B 的前 40 n.t. 连接，
C 的前 20 n.t. 的 RC 与 B 的最后 40 n.t. 的 RC 连接，
B 的最后 20 n.t. 与 C 的前 40 n.t. 连接，
A 的前 20 n.t. 的 RC 与 C 的最后 40 n.t. 的 RC 连接。

我似乎无法解决这个问题，也不确定从哪里入手最好......

到目前为止我已编写的代码仅输出“组1”（故意，因此我可以最小化我正在处理的视觉输出量）。这是：

#import BioPython Tools
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

#import csv tools
import csv
import sys
import os
import itertools

# Read the construct table and, for every Gibson row of construct '1',
# print the construct number followed by four slices of its sequence.
with open('constructs-to-make.csv', 'rU') as constructs:
    construct_list = csv.DictReader(constructs)

    def get_construct_number(row):
        """Return the row's "Construct" field (the groupby key below)."""
        return row["Construct"]

    def get_strategy(row):
        """Return the row's "Strategy" field."""
        return row["Strategy"]

    primer_list = []
    groups = []

    for key, items in itertools.groupby(construct_list, key=get_construct_number):
        for subitems in items:
            # Output is deliberately limited to Gibson rows of construct '1'.
            if subitems['Strategy'] != 'Gibson' or subitems['Construct'] != '1':
                continue

            print(subitems['Construct'])

            # Annealing portion, forward: first 40 nt, as written.
            fw_anneal = Seq(subitems['Sequence'][:40], IUPAC.unambiguous_dna)
            print(fw_anneal)

            # Annealing portion, reverse: last 40 nt, reverse-complemented.
            last_forty = Seq(subitems['Sequence'][-40:], IUPAC.unambiguous_dna)
            re_anneal = last_forty.reverse_complement()
            print(re_anneal)

            # Overhang taken from the start: first 20 nt, reverse-complemented.
            first_twenty = Seq(subitems['Sequence'][:20], IUPAC.unambiguous_dna)
            fw_overhang = first_twenty.reverse_complement()
            print(fw_overhang)

            # Overhang taken from the end: last 20 nt, as written.
            re_overhang = Seq(subitems['Sequence'][-20:], IUPAC.unambiguous_dna)
            print(re_overhang)

非常感谢任何帮助！

Answer 1

Martineau可能拥有使用特定领域语言的正确理念。我对此没有任何经验，但这是我半小时内提出来的。

我没有运行、调试或测试过这段代码，如果您有任何疑问，请告诉我。此代码还假定片段不会长到无法全部保存在内存中；如果这个假设不成立，这种方法的效率就不高。

我在片段字典的设计上也偷了懒：它其实不应该是一个全局变量，整段代码应该放在一个类里。

def head(seq, count):
    """Return the first ``count`` items of ``seq``.

    ``seq`` can be any sliceable sequence (for example a string). If it
    holds fewer than ``count`` items, the whole sequence is returned.
    """
    # The slice must stop at ``count``; ``seq[count:]`` would instead drop
    # the head and return everything after it.
    return seq[:count]

def tail(seq, count):
    """Return the last ``count`` items of ``seq``.

    ``seq`` can be any sliceable sequence with a length (for example a
    string). If it holds fewer than ``count`` items, the whole sequence is
    returned; a ``count`` of 0 gives an empty slice.
    """
    # Compute the start index explicitly: ``seq[-count:]`` would return the
    # entire sequence when count is 0, and ``seq[:count]`` is the head.
    start = max(len(seq) - count, 0)
    return seq[start:]

def reverse(nt):
    """Return the complementary base for a single nucleotide.

    Accepts 'G', 'C', 'A' or 'T' in either case and always returns the
    partner base in upper case (G<->C, A<->T).

    Raises:
        Exception: with the message 'invalid nucleotide' when ``nt`` is not
            exactly one of the eight accepted characters.
    """
    # An exact-match table is used instead of substring tests such as
    # ``nt in 'Gg'``, which also accept '' and 'Gg'.
    partners = {
        'G': 'C', 'g': 'C',
        'C': 'G', 'c': 'G',
        'A': 'T', 'a': 'T',
        'T': 'A', 't': 'A',
    }
    if nt not in partners:
        raise Exception('invalid nucleotide')
    return partners[nt]

def complement(seq):
    """Return the base-by-base complement of ``seq`` as a string.

    Every item of ``seq`` is passed through ``reverse`` and the results are
    joined in the original order; the sequence itself is NOT reversed.
    Whatever ``reverse`` raises for an unrecognised base propagates.
    """
    # Join eagerly: a bare generator expression here would be turned into
    # its repr ('<generator object ...>') by callers that apply str() to
    # the result. A string is still iterable character by character.
    return ''.join(reverse(nt) for nt in seq)

def identity(seq):
    """Return ``seq`` itself, untouched.

    Serves as the do-nothing choice wherever a transformation function is
    expected.
    """
    return seq

# Module-level registry mapping a fragment name to its sequence.
fragments = {}


def register(fragment, name):
    """Store ``fragment`` in the ``fragments`` registry under ``name``.

    Raises:
        Exception: if ``name`` is already present; the existing entry is
            left unchanged.
    """
    if name in fragments:
        raise Exception('a fragment of that name is already registered!')
    fragments[name] = fragment

def make_combination(portions):
    """Build one sequence string from pieces of registered fragments.

    Each entry in portions should be a tuple consisting of:
    (sequence name, count, accessor function, complement function)

    For every entry the fragment registered under the name is looked up in
    ``fragments``, the accessor is called as ``accessor(fragment, count)``,
    and the complement function is applied to that result. The pieces are
    concatenated in the order given.

    Returns:
        A single string (empty if ``portions`` is empty).

    Raises:
        KeyError: if a sequence name is not present in ``fragments``.
    """
    output = []
    for name, count, select, order in portions:
        section = order(select(fragments[name], count))
        # The transform may return a string or any iterable of
        # single-character strings (e.g. a generator). ''.join handles
        # both, whereas str() on a generator yields its repr.
        output.append(''.join(section))
    # return a string, not a list
    return ''.join(output)

def example():
    """Register three toy fragments and build two sample portion specs.

    The fragments are registered under the names 'a', 'b' and 'c'. The two
    specs use the tuple layout that ``make_combination`` documents; they are
    only constructed here, not passed on or returned.

    Raises:
        Exception: from ``register`` if any of the three names is already
            registered (for instance when this function is called twice).
    """
    a = 'gattaca'
    b = 'tagacat'
    c = 'gagatac'
    register(a, 'a')
    register(b, 'b')
    register(c, 'c')
    # the last 20 nucleotides (n.t.) of C with the first 40 n.t. of A
    first = (('c', 20, tail, identity),
             ('a', 40, head, identity))
    # RC of the first 20 n.t. of B with the RC of the last n.t. of A,
    # NOTE(review): the 'a' entry below uses identity and a count of 1,
    # which does not match the "RC of ..." wording above - confirm the
    # intended count and transform before relying on this spec.
    second = (('b', 20, head, complement),
              ('a', 1, tail, identity))

Answer 2

我最终使用一堆条件来解决这个问题。

代码不够优雅，并且有大量重复，但对于一个我会反复使用的“快而糙”（quick and dirty）脚本来说，我认为已经足够了。

##here, i process all the gibson primers to get the final list of primers##
##=======================================================================##
## Walks gibson_primer_temp_list once and appends
## [primer sequence, construct number, part number, 'fw primer' / 're primer']
## entries to primer_list.
##
## Row fields as they are used below (inferred from usage only - confirm
## against the code that builds gibson_primer_temp_list):
##   row[0]  sequence text that is concatenated into primers
##   row[1]  construct number (compared through int())
##   row[2]  part number (compared through int())
##   row[3]  role label: 'fp_anneal', 'tp_anneal', 'fp_overhang', 'tp_overhang'
##   row[4]  sequence number (compared through int() against max_seq_num)
    construct_num = 1
    temp = []
    part_num = 1
    # temp_row_num starts at 1 and is bumped with every temp.append() in this
    # fragment, so it stays equal to len(temp) + 1.
    temp_row_num = 1
    max_seq_num = 0

    for row in gibson_primer_temp_list:

        # Recount, on every outer iteration, how many rows belong to the
        # construct currently tracked by construct_num.
        max_seq_num = 0

        for x in gibson_primer_temp_list:
            # This first test has no effect (its body is only `pass`).
            if int(x[1]) > construct_num:
                pass
            if int(x[1]) == construct_num:
                max_seq_num += 1
##        print('Const. number counter is at ' + str(construct_num) + ' and current maximum known number of sequences is ' + str(max_seq_num))

##        print(row[1])

##        if int(row[1]) < construct_num:
##            while construct_num < int(row[1]):
##        print(max_seq_num)
##        for row in gibson_primer_temp_list:
##            if int(row[1]) == construct_num:
##                max_seq_num += 1
##            if int(row[1]) > construct_num:
##                break

        #print('Construct number is ' + str(row[1]) + ' and seq. number is ' + str(row[4]))
        #print('Const. number counter is ' + str(construct_num) + ' and max. seq. number is ' + str(max_seq_num) + '.')

        # The row belongs to a later construct: restart the part counter and
        # advance construct_num until it matches the row.
        # NOTE(review): max_seq_num was counted above with the old
        # construct_num, so for this row it still describes the previous
        # construct - confirm that is intended.
        if int(row[1]) > construct_num:
            part_num = 1
            while construct_num < int(row[1]):
                #print('Construct number is ' + str(construct_num))
                construct_num += 1
##                temp_row_num += 1 #do not uncomment
            #continue - not to be added back again!

        if int(row[1]) == construct_num:

            # Row whose sequence number equals the row count of the construct:
            # build the primers that join the last part back to the first.
            if int(row[4]) == max_seq_num:

                #print(row)
                temp.append(row)
                temp_row_num += 1
                #print('We are going to make primers that join the first and last part in construct ' + str(construct_num))
                #print('Grabbing overhang portion from part ' + str(part_num) + ', which is sequence ' + str(row[4]) + '. It has the sequence ' + str(row[0]))
                overhang = row
                #print('Grabbing the first sequence...')
                # Forward primer: this row's sequence followed by the sequence
                # of the first same-construct row whose x[4] equals 1.
                # NOTE(review): x[4] is compared to the int 1 without int(),
                # unlike row[4] above; if the fields are strings this never
                # matches - confirm the field type.
                for x in gibson_primer_temp_list:
                    #print(row[1] == x[1] and x[4] == 1)
                    if row[1] == x[1] and x[4] == 1:
                        #print(x[0])
                        anneal = x
                        #print('The first sequence is ' + str(anneal))
                        fw_primer = overhang[0] + anneal [0]
                        #print('The forward primer on the first part is: ' + str(fw_primer))
                        primer_list.append([fw_primer, construct_num, x[2], 'fw primer'])
                        break

                #print('Grabbing the third sequence...')
                # Reverse primer, overhang half: same-construct row with
                # y[4] == 3. If none matches, overhang keeps the value set
                # above (this row).
                for y in gibson_primer_temp_list:
                    #print(row[1] == y[1] and y[4] == 3)
                    if row[1] == y[1] and y[4] == 3:
                        #print(y[0])
                        overhang = y
                        #print('The third sequence is ' + str(overhang))
                        break

                #print('Grabbing the (n-2)th sequence...')
                # Reverse primer, annealing half: same-construct row whose
                # z[4] equals max_seq_num - 2. If none matches, anneal keeps
                # whatever it was bound to earlier.
                steps_backward = 2
                target_seq_num = max_seq_num - steps_backward
                for z in gibson_primer_temp_list:
                    #print(row[1] == z[1] and z[4] == target_seq_num)
                    if row[1] == z[1] and z[4] == target_seq_num:
                        #print(z[0])
                        anneal = z
                        #print('The n-2th sequence is ' + str(anneal))
                        break

                re_primer = overhang[0] + anneal[0]
                # z is the loop variable left over from the search above: the
                # matching row if the loop broke, otherwise the last row of
                # gibson_primer_temp_list.
                primer_list.append([re_primer, construct_num, z[2], 're primer'])
                continue

            # Rows of part 1 are only stored in temp; no primer is built here.
            if part_num == int(row[2]) and part_num == 1: #if the part number counter = part number
                #print(row)
                temp.append(row)
                temp_row_num += 1
                continue #do NOT delete this continue

            # The row belongs to a later part than the counter: step the
            # counter by one and fall through to the role checks below.
            if part_num < int(row[2]):
                #print('Current part is: ' + str(part_num) + '. Upping part number.' + '\n')
                part_num += 1
                #do NOT add in a "continue" here


            # 'fp_anneal' row: forward primer = sequence of the row stored just
            # before this one in temp, followed by this row's sequence.
            if part_num == int(row[2]) and row[3] == 'fp_anneal':
                #print(row)
                temp.append(row)
                temp_row_num += 1
                #print('Current part is: ' + str(part_num))
                #print('Grabbing tp_overhang from part ' + str(part_num - 1) + '...')
                # x runs 2, 3, ... over temp, so x == temp_row_num - 1 selects
                # the second-to-last element (temp[-2]).
                # NOTE: this inner loop rebinds the outer loop variable `row`;
                # when it finishes, `row` is the last element of temp, which is
                # the row appended just above.
                x = 1
                for row in temp:
                    x += 1
                    if x == temp_row_num - 1:
                        prev_tp_overhang = row
                #print('Sequence of tp_overhang from part ' + str(part_num - 1) + ' is: ' + prev_tp_overhang[0])
                fw_primer_current = prev_tp_overhang[0] + row[0]
                #print('Appending to master primer list...')
                primer_list.append([fw_primer_current, construct_num, part_num, 'fw primer'])
                #print('Forward primer is: ' + str(fw_primer_current) + '\n')
                continue

            # 'tp_anneal' row: stored only; it is picked up later by the
            # 'fp_overhang' branch.
            if part_num == int(row[2]) and row[3] == 'tp_anneal':
                #print(row)
                temp.append(row)
                temp_row_num += 1
                continue


            # 'fp_overhang' row: reverse primer for the previous part = this
            # row's sequence followed by the sequence of an earlier temp entry.
            if part_num == int(row[2]) and row[3] == 'fp_overhang':
                #print(row)
                temp.append(row)
                temp_row_num += 1
                #print('Current temp_row_num is ' + str(temp_row_num))
                #print('Current part is: ' + str(part_num))
                #print('Grabbing tp_anneal from part ' + str(part_num - 1) + '...')
                # With the same counting as in the 'fp_anneal' branch,
                # x == temp_row_num - 5 selects the sixth-from-last element
                # (temp[-6]). `row` is rebound here as well and ends up as the
                # row appended just above.
                x = 1
                for row in temp:
                    x += 1
                    if x == temp_row_num - 5:
                        prev_tp_anneal = row
                        #print(row)
                        pass
                #print('Sequence of tp_anneal from part ' + str(part_num - 1) + ' is: ' + prev_tp_anneal[0])
                re_primer_prev = row[0] + prev_tp_anneal[0]
                #print('Appending to master primer list...')
                primer_list.append([re_primer_prev, construct_num, part_num - 1, 're primer'])
                #print('Reverse primer for previous part is: ' + str(re_primer_prev) + '\n')
                part_num += 1
                continue

            # 'tp_overhang' row: stored only; the next 'fp_anneal' row reads it
            # back as temp[-2].
            if part_num == int(row[2]) and row[3] == 'tp_overhang':
                #print(row)
                temp.append(row)
                temp_row_num += 1
                continue

            continue

感谢大家的帮助！

迭代一个列表，我必须从第一个项目中获取数据，以便在最后一个项目中使用

2 个答案: