这是我提出的上一个问题的后续问题:Processing a sub-list of variable size within a larger list。
我设法使用itertools来获取DNA片段组,但现在我遇到了一个不同的问题。
我需要根据这些DNA片段组来设计引物。引物的设计方式是让它包含来自不同DNA片段的重叠区域。假设我的列表中有三个DNA片段:片段A、B和C。我需要提取:
我似乎无法解决这个问题,而且我不确定从哪里着手才最合适......
到目前为止我已编写的代码仅输出“组1”(故意,因此我可以最小化我正在处理的视觉输出量)。这是:
#import BioPython Tools
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
#import csv tools
import csv
import sys
import os
import itertools
def get_construct_number(row):
    """Return the row's "Construct" column; used as the groupby key."""
    return row["Construct"]


def get_strategy(row):
    """Return the row's "Strategy" column."""
    return row["Strategy"]


primer_list = []
groups = []

# The 'rU' mode is rejected by Python 3.11+ (ValueError); the csv module
# documentation asks for newline='' so the reader handles line endings itself.
with open('constructs-to-make.csv', newline='') as constructs:
    construct_list = csv.DictReader(constructs)

    # DictReader is lazy, so the grouping loop has to run while the file is
    # still open.  groupby only merges *adjacent* rows with the same key, so
    # this assumes the CSV is already ordered by construct number.
    for key, items in itertools.groupby(construct_list,
                                        key=get_construct_number):
        for subitems in items:
            # Only Gibson rows of construct '1' are printed, to keep the
            # amount of debug output small while developing.
            if (subitems['Strategy'] == 'Gibson'
                    and subitems['Construct'] == '1'):
                print(subitems['Construct'])
                # NOTE(review): Bio.Alphabet was removed in Biopython 1.78;
                # on newer releases the IUPAC argument has to be dropped
                # together with the file-level import.
                # First 40 characters of the sequence, as given.
                fw_anneal = Seq(subitems['Sequence'][0:40],
                                IUPAC.unambiguous_dna)
                print(fw_anneal)
                # Last 40 characters, reverse-complemented.
                re_anneal = Seq(subitems['Sequence'][-40:],
                                IUPAC.unambiguous_dna).reverse_complement()
                print(re_anneal)
                # First 20 characters, reverse-complemented.
                fw_overhang = Seq(subitems['Sequence'][0:20],
                                  IUPAC.unambiguous_dna).reverse_complement()
                print(fw_overhang)
                # Last 20 characters, as given.
                re_overhang = Seq(subitems['Sequence'][-20:],
                                  IUPAC.unambiguous_dna)
                print(re_overhang)
非常感谢任何帮助!
答案 0 :(得分:0)
Martineau 提出的使用领域特定语言(DSL)的思路可能是对的。我在这方面没有任何经验,但下面是我用半小时写出来的方案。
我没有运行、调试或测试过这段代码,如果您有任何疑问,请告诉我。此代码还假定片段不会长到无法全部保存在内存中;如果这一假设不成立,这种方法的效率就不高。
我在片段词典的设计中也很懒。它实际上不应该是一个全局变量,整个代码应该在一个类中。
def head(seq, count):
    """Return the first ``count`` items of ``seq``.

    The original sliced ``seq[count:]``, which drops the leading items
    instead of selecting them.

    Args:
        seq: Any sliceable sequence (e.g. a string of nucleotides).
        count: Number of leading items to keep.

    Returns:
        A slice of ``seq`` holding at most ``count`` leading items; the whole
        sequence when it is shorter than ``count``.
    """
    return seq[:count]
def tail(seq, count):
    """Return the last ``count`` items of ``seq``.

    The original sliced ``seq[:count]``, which selects the leading items
    rather than the trailing ones.

    Args:
        seq: Any sliceable sequence (e.g. a string of nucleotides).
        count: Number of trailing items to keep.

    Returns:
        A slice of ``seq`` holding at most ``count`` trailing items; an empty
        slice when ``count`` is zero or negative.
    """
    if count <= 0:
        # seq[-0:] would be the whole sequence, so handle "nothing" explicitly.
        return seq[:0]
    return seq[-count:]
def reverse(nt):
    """Return the complementary base for a single nucleotide.

    Despite its name this does not reverse anything: it maps one base to its
    pairing partner (G<->C, A<->T).  Input may be upper or lower case; the
    result is always upper case.

    Args:
        nt: A one-character string, one of ``GCATgcat``.

    Returns:
        The complementary base as an upper-case one-character string.

    Raises:
        ValueError: If ``nt`` is not one of the eight accepted characters.
            (``ValueError`` is an ``Exception`` subclass, so callers catching
            ``Exception`` keep working.)
    """
    # A dict lookup only matches exact single characters; the substring tests
    # used previously (``nt in 'Gg'``) also accepted '' and 'Gg'.
    partner = {
        'G': 'C', 'g': 'C',
        'C': 'G', 'c': 'G',
        'A': 'T', 'a': 'T',
        'T': 'A', 't': 'A',
    }
    if nt not in partner:
        raise ValueError('invalid nucleotide')
    return partner[nt]
def complement(seq):
    """Return the base-by-base complement of ``seq`` as a string.

    Each item of ``seq`` is passed through ``reverse`` and the results are
    joined in the same order (the sequence itself is not reversed).

    A string is returned rather than a lazy generator so that the result can
    be passed to ``str()`` or concatenated directly; ``str()`` of a generator
    is its repr, not its contents.

    Args:
        seq: An iterable of single-character nucleotides.

    Returns:
        A ``str`` with one complementary base per input item.

    Raises:
        Exception: Whatever ``reverse`` raises for an item it does not accept.
    """
    return ''.join(reverse(nt) for nt in seq)
def identity(seq):
    """Hand back ``seq`` exactly as received (a no-op transform)."""
    unchanged = seq
    return unchanged
# Module-level registry: fragment name -> fragment, filled by register().
fragments = {}


def register(fragment, name):
    """Store ``fragment`` in the ``fragments`` registry under ``name``.

    Args:
        fragment: The object to store.
        name: Key to store it under; must not be in use already.

    Raises:
        Exception: If ``name`` is already present in ``fragments``.
    """
    # Guard clause: refuse to overwrite an existing entry.
    if name in fragments:
        raise Exception('a fragment of that name is already registered!')
    fragments[name] = fragment
def make_combination(portions):
    """Build one string out of pieces of registered fragments.

    Each entry in portions should be a tuple consisting of:
    (sequence name, count, accessor function, complement function)

    For every entry the fragment stored under the sequence name is looked up
    in ``fragments``, the accessor is called as ``accessor(fragment, count)``
    and the complement function is applied to that result.  The pieces are
    concatenated in the order given.

    Args:
        portions: Iterable of 4-tuples as described above.

    Returns:
        The concatenated pieces as a single ``str`` ('' for no entries).

    Raises:
        KeyError: If a sequence name has no entry in ``fragments``.
    """
    output = []
    for name, count, select, order in portions:
        section = order(select(fragments[name], count))
        if not isinstance(section, str):
            # str() of a generator or other iterable gives its repr rather
            # than its contents, so join the items instead.
            section = ''.join(str(piece) for piece in section)
        output.append(section)
    # return a string, not a list
    return ''.join(output)
def example():
    """Register three toy fragments and assemble two sample combinations.

    The ``def`` line was missing its colon, and the two specifications were
    built but never used; they are now passed to ``make_combination``.

    The toy fragments are only 7 characters long, so the requested counts of
    20 and 40 are larger than the data available.

    Calling this a second time raises, because ``register`` rejects names
    that are already registered.

    Returns:
        A 2-tuple with the results of ``make_combination`` for the two
        specifications, in order.
    """
    a = 'gattaca'
    b = 'tagacat'
    c = 'gagatac'
    register(a, 'a')
    register(b, 'b')
    register(c, 'c')
    # the last 20 nucleotides (n.t.) of C with the first 40 n.t. of A
    first = (('c', 20, tail, identity),
             ('a', 40, head, identity))
    # RC of the first 20 n.t. of B with the RC of the last n.t. of A,
    # NOTE(review): the entry for A uses ``identity`` and a count of 1, which
    # does not obviously match the "RC" wording above - confirm the intent.
    second = (('b', 20, head, complement),
              ('a', 1, tail, identity))
    return make_combination(first), make_combination(second)
答案 1 :(得分:0)
我最终使用一堆条件来解决这个问题。
代码不够优雅,并且有大量重复,但作为一个我会反复使用的临时应急(quick and dirty)脚本,我认为它已经足够了。
##here, i process all the gibson primers to get the final list of primers##
##=======================================================================##
# Purpose: walk gibson_primer_temp_list and append entries of the form
# [sequence, construct number, part number, 'fw primer' | 're primer'] to
# primer_list.  Both lists are defined outside this section.
#
# Row fields as they are used below:
#   row[0] - concatenated with other row[0] values to form a primer
#   row[1] - compared (through int()) with construct_num
#   row[2] - compared (through int()) with part_num
#   row[3] - matched against 'fp_anneal', 'tp_anneal', 'fp_overhang',
#            'tp_overhang'
#   row[4] - compared with max_seq_num / fixed sequence numbers
#
# NOTE(review): the indentation of this section is not visible here, so the
# nesting of the loops, ifs and `continue` statements must be confirmed
# against the author's copy before running it.
construct_num = 1
# Rows seen so far; later lookups index into it through temp_row_num.
temp = []
part_num = 1
# Incremented every time a row is appended to temp (starts at 1, not 0).
temp_row_num = 1
max_seq_num = 0
for row in gibson_primer_temp_list:
# Recount, for every row, how many rows carry the current construct number.
max_seq_num = 0
for x in gibson_primer_temp_list:
# This branch does nothing (`pass`).
if int(x[1]) > construct_num:
pass
if int(x[1]) == construct_num:
max_seq_num += 1
## print('Const. number counter is at ' + str(construct_num) + ' and current maximum known number of sequences is ' + str(max_seq_num))
## print(row[1])
## if int(row[1]) < construct_num:
## while construct_num < int(row[1]):
## print(max_seq_num)
## for row in gibson_primer_temp_list:
## if int(row[1]) == construct_num:
## max_seq_num += 1
## if int(row[1]) > construct_num:
## break
#print('Construct number is ' + str(row[1]) + ' and seq. number is ' + str(row[4]))
#print('Const. number counter is ' + str(construct_num) + ' and max. seq. number is ' + str(max_seq_num) + '.')
# The row belongs to a later construct: restart part numbering and advance
# construct_num until it matches the row.
if int(row[1]) > construct_num:
part_num = 1
while construct_num < int(row[1]):
#print('Construct number is ' + str(construct_num))
construct_num += 1
## temp_row_num += 1 #do not uncomment
#continue - not to be added back again!
if int(row[1]) == construct_num:
# The row's sequence number equals the row count of this construct.
if int(row[4]) == max_seq_num:
#print(row)
temp.append(row)
temp_row_num += 1
#print('We are going to make primers that join the first and last part in construct ' + str(construct_num))
#print('Grabbing overhang portion from part ' + str(part_num) + ', which is sequence ' + str(row[4]) + '. It has the sequence ' + str(row[0]))
overhang = row
#print('Grabbing the first sequence...')
for x in gibson_primer_temp_list:
#print(row[1] == x[1] and x[4] == 1)
# NOTE(review): x[4] is compared with the int 1 without int(), unlike
# int(row[4]) above - confirm x[4] is not a string, or this never matches.
if row[1] == x[1] and x[4] == 1:
#print(x[0])
anneal = x
#print('The first sequence is ' + str(anneal))
fw_primer = overhang[0] + anneal [0]
#print('The forward primer on the first part is: ' + str(fw_primer))
primer_list.append([fw_primer, construct_num, x[2], 'fw primer'])
break
#print('Grabbing the third sequence...')
for y in gibson_primer_temp_list:
#print(row[1] == y[1] and y[4] == 3)
# NOTE(review): same un-converted comparison as for x[4] above.
if row[1] == y[1] and y[4] == 3:
#print(y[0])
overhang = y
#print('The third sequence is ' + str(overhang))
break
#print('Grabbing the (n-2)th sequence...')
steps_backward = 2
target_seq_num = max_seq_num - steps_backward
for z in gibson_primer_temp_list:
#print(row[1] == z[1] and z[4] == target_seq_num)
# NOTE(review): same un-converted comparison as for x[4] above.
if row[1] == z[1] and z[4] == target_seq_num:
#print(z[0])
anneal = z
#print('The n-2th sequence is ' + str(anneal))
break
# NOTE(review): if the loop over z found no match, z is simply the last
# row iterated and overhang/anneal keep their earlier values - confirm.
re_primer = overhang[0] + anneal[0]
primer_list.append([re_primer, construct_num, z[2], 're primer'])
continue
# Rows of part 1 are only recorded in temp; no primer is built from them here.
if part_num == int(row[2]) and part_num == 1: #if the part number counter = part number
#print(row)
temp.append(row)
temp_row_num += 1
continue #do NOT delete this continue
if part_num < int(row[2]):
#print('Current part is: ' + str(part_num) + '. Upping part number.' + '\n')
part_num += 1
#do NOT add in a "continue" here
if part_num == int(row[2]) and row[3] == 'fp_anneal':
#print(row)
temp.append(row)
temp_row_num += 1
#print('Current part is: ' + str(part_num))
#print('Grabbing tp_overhang from part ' + str(part_num - 1) + '...')
# x is reused here as a position counter; it is incremented before each test.
x = 1
# NOTE(review): this loop rebinds `row`, the outer loop variable; after it
# finishes `row` is the last element of temp.
for row in temp:
x += 1
if x == temp_row_num - 1:
prev_tp_overhang = row
#print('Sequence of tp_overhang from part ' + str(part_num - 1) + ' is: ' + prev_tp_overhang[0])
fw_primer_current = prev_tp_overhang[0] + row[0]
#print('Appending to master primer list...')
primer_list.append([fw_primer_current, construct_num, part_num, 'fw primer'])
#print('Forward primer is: ' + str(fw_primer_current) + '\n')
continue
# 'tp_anneal' rows are only recorded in temp.
if part_num == int(row[2]) and row[3] == 'tp_anneal':
#print(row)
temp.append(row)
temp_row_num += 1
continue
if part_num == int(row[2]) and row[3] == 'fp_overhang':
#print(row)
temp.append(row)
temp_row_num += 1
#print('Current temp_row_num is ' + str(temp_row_num))
#print('Current part is: ' + str(part_num))
#print('Grabbing tp_anneal from part ' + str(part_num - 1) + '...')
# Same counter pattern as above, this time looking 5 positions back.
x = 1
# NOTE(review): rebinds the outer loop variable `row` again.
for row in temp:
x += 1
if x == temp_row_num - 5:
prev_tp_anneal = row
#print(row)
pass
#print('Sequence of tp_anneal from part ' + str(part_num - 1) + ' is: ' + prev_tp_anneal[0])
re_primer_prev = row[0] + prev_tp_anneal[0]
#print('Appending to master primer list...')
# The reverse primer is filed under the previous part (part_num - 1).
primer_list.append([re_primer_prev, construct_num, part_num - 1, 're primer'])
#print('Reverse primer for previous part is: ' + str(re_primer_prev) + '\n')
part_num += 1
continue
# 'tp_overhang' rows are only recorded in temp.
if part_num == int(row[2]) and row[3] == 'tp_overhang':
#print(row)
temp.append(row)
temp_row_num += 1
continue
continue
感谢大家的帮助!