我正在尝试按如下方式修改以下程序:
第一行包含蛋白质的名称和该蛋白质的后续输出行数(比如N)
接下来的N行中的每一行都包含一个匹配信息:GBox的位置和实际的匹配(记住有扰动和X' s,即允许的外卡)。
脚本:
import csv
import matplotlib.pyplot as plt
import numpy as np
# all G boxes
def match(x,y):
mismatch = 0
for i in range(len(x)):
if (x[i] == 'X' or x[i] == y[i]):
pass
else:
mismatch += 1
if(mismatch <= 1):
return True
else:
return False
def H(protein,x1,x2,x3,x4):
pL1=[]
pL2=[]
pL3=[]
pL4=[]
L1=[]
L2=[]
L3=[]
L4=[]
for i in range(len(protein)-len(x1)):
if(match(x1, protein[i:i+len(x1)]) == True):
# global L1
pL1=pL1 + [i]
L1 = L1+[protein[i:i+len(x1)]]
for j in range(len(protein)-len(x2)):
if (match(x2, protein[j:j+len(x2)]) == True):
# global L2
pL2=pL2+[j]
L2 = L2+[protein[j:j+len(x2)]]
for k in range(len(protein)-len(x3)):
if (match(x3, protein[k:k+len(x3)]) == True):
# global L3
pL3=pL3+[k]
L3 = L3+[protein[k:k+len(x3)]]
for l in range(len(protein)-len(x4)):
if (match(x4, protein[l:l+len(x4)]) == True):
# global L3
pL4=pL4+[l]
L4 = L4+[protein[l:l+len(x4)]]
candidates = []
for i in range(len(pL1)):
for j in range(len(pL2)):
for k in range(len(pL3)):
for l in range(len(pL4)):
if 40 <=pL2[j]-pL1[i] <= 80 and 40 <=pL3[k]- pL2[j] <= 80 and 20 <=pL4[l]- pL3[k] <= 40:
a = L1[i],pL1[i]
b = L2[j],pL2[j]
c = L3[k],pL3[k]
d = L4[l],pL4[l]
print a,b,c,d
candidates.append((a,b,c,d))
offset = 5
for i in np.arange((np.array(candidates).transpose()).shape[1]):
barchartdata = np.unique(np.array(candidates).transpose()[:,i])
barchartdata = barchartdata.reshape(2, len(barchartdata)/2)
print barchartdata
x_pos = np.arange(barchartdata.size/2)
print x_pos
print barchartdata[0,:]
plt.bar(x_pos + 5 * i, barchartdata[0,:])
plt.show()
plt.xticks(x_pos, ('g1','g2','g3','g4'))
plt.yticks('Count')
plt.show()
x1 = 'GXXXXGK'
x2 = 'DXXG'
x3 = 'NKXD'
x4 = 'EXSAX'
#input sequence
protein = 'MAKGEFIRTKPHVNVGTIGHVDHGKTTLTAALTYVAAAENPNVEVKDYGEIDKAPEERARGITINTAHVEYETAKRHYSHVDCPGHADYIKNMITGAAQMDGAILVVSAADGPMPQTREHILLARQVGVPYIVVFMNKVDMVDDPELLDLVEMEVRDLLNQYEFPGDEVPVIRGSALLALEQMHRNPKTRRGENEWVDKIWELLDAIDEYIPTPVRDVDKPFLMPVEDVFTITGRGTVATGRIERGKVKVGDEVEIVGLAPETRKTVVTGVEMHRKTLQEGIAGDNVGVLLRGVSREEVERGQVLAKPGSITPHTKFEASVYVLKKEEGGRHTGFFSGYRPQFYFRTTDVTGVVQLPPGVEMVMPGDNVTFTVELIKPVALEEGLRFAIREGGRTVGAGVVTKILE'
H(protein,x1,x2,x3,x4)
修改
上一个输出(我的脚本) - 正确:
('GAGGVGK', 9) ('DILD', 53) ('NKCD', 115) ('ETSAK', 142)
('GAGGVGK', 9) ('DTAG', 56) ('NKCD', 115) ('ETSAK', 142)
('GAGGVGK', 9) ('DQYM', 68) ('NKCD', 115) ('ETSAK', 142)
('GAGGVGK', 9) ('MRTG', 71) ('NKCD', 115) ('ETSAK', 142)
('GAGGVGK', 9) ('TGEG', 73) ('NKCD', 115) ('ETSAK', 142)
在脚本中获取输出:
((17, 'GAGGVGK'), (61, 'DILD'), (123, 'NKCD'), (150, 'ETSAK'))
((17, 'GAGGVGK'), (64, 'DTAG'), (123, 'NKCD'), (150, 'ETSAK'))
((17, 'GAGGVGK'), (76, 'DQYM'), (123, 'NKCD'), (150, 'ETSAK'))
((17, 'GAGGVGK'), (79, 'MRTG'), (123, 'NKCD'), (150, 'ETSAK'))
((17, 'GAGGVGK'), (81, 'TGEG'), (123, 'NKCD'), (150, 'ETSAK'))
这是长度不正确。 我需要运行多个序列,但它只运行一个序列。请指导我
我也试图绘制图表,但它无法获得预期的输出。
预期图表:
在此图片中是预期图表 - 我们需要逐列计算百分比 - 请检查“以前的输出(我的脚本) - 正确:请查看下面的图片”。
编辑1
输入文件是CSV文件,格式如下(多行):
PDB ID Macromolecule Name Sequence Source
121P H-RAS P21 PROTEIN MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQH Homo sapiens
1A12 REGULATOR OF CHROMOSOME CONDENSATION 1 RRSPPADAIPKSKKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENVMERKKPALVSIPEDVVQAEAGGMHTVCLSKSGQVYSFGCNDEGALGRDTSVEGSEMVPGKVELQEKVVQVSAGDSHTAALTDDGRVFLWGSFRDNNGVIGLLEPMKKSMVPVQVQLDVPVVKVASGNDHLVMLTADGDLYTLGCGEQGQLGRVPELFANRGGRQGLERLLVPKCVMLKSRGSRGHVRFQDAFCGAYFTFAISHEGHVYGFGLSNYHQLGTPGTESCFIPQNLTSFKNSTKSWVGFSGGQHHTVCMDSEGKAYSLGRAEYGRLGLGEGAEEKSIPTLISRLPAVSSVACGASVGYAVTKDGRVFAWGMGTNYQLGTGQDEDAWSPVEMMGKQLENRVVLSVSSGGQHTVLLVKDKEQS Homo sapiens
1A2B TRANSFORMING PROTEIN RHOA SMAAIRKKLVIVGDVACGKTCLLIVFSKDQFPEVYVPTVFENYVADIEVDGKQVELALWDTAGQEDYDRLRPLSYPDTDVILMCFSIDSPDSLENIPEKWTPEVKHFCPNVPIILVGNKKDLRNDEHTRRELAKMKQEPVKPEEGRDMANRIGAFGYMECSAKTKDGVREVFEMATRAALQA Homo sapiens
1A2K NUCLEAR TRANSPORT FACTOR 2 MGDKPIWEQIGSSFIQHYYQLFDNDRTQLGAIYIDASCLTWEGQQFQGKAAIVEKLSSLPFQKIQHSITAQDHQPTPDSCIISMVVGQLKADEDPIMGFHQMFLLKNINDAWVCTNDMFRLALHNFG Rattus norvegicus
答案 0 :(得分:1)
你的代码非常笨拙。我已经重写了它以便清理一下。我已经删除了有关我选择更改的内容和原因的所有背景信息,因为它的内容太多了。但是,这里有一些我修复过的东西:
您倾向于以风格
编写unpythonic循环for i in range(len(x)):
do_something_with(x[i])
在Python中,编写
要好得多for element in x:
do_something_with(element)
if
附近有很多多余的括号 - 条件
for
- 循环四次,而不是编写一个包含循环的函数并重复使用它。不幸的是20
打破了{4}}循环中40
s所建立的模式,将其替换为for
上的单个循环a比它值得多麻烦。
无论如何,这是代码的清理版本。
itertools.product
当我执行上面的代码时,我得到输出
import csv
import matplotlib.pyplot as plt
import numpy as np
def match(X,Y):
mismatch = 0
for x,y in zip(X,Y):
if not (x == 'X' or x == y):
mismatch += 1
if mismatch > 1:
return False
return True
def H(protein,x1,x2,x3,x4):
def find_matches(x):
match_positions = []
matches = []
for i in range(len(protein) - len(x)):
candidate = protein[i : i + len(x)]
if match(x, candidate):
match_positions.append(i)
matches .append(candidate)
return matches, match_positions
L1, pL1 = find_matches(x1)
L2, pL2 = find_matches(x2)
L3, pL3 = find_matches(x3)
L4, pL4 = find_matches(x4)
candidates = []
for a in zip(pL1, L1):
for b in zip(pL2, L2):
for c in zip(pL3, L3):
for d in zip(pL4, L4):
if (40 <= b[0] - a[0] <= 80 and
40 <= c[0] - b[0] <= 80 and
20 <= d[0] - c[0] <= 80 ):
print(a,b,c,d)
candidates.append((a,b,c,d))
for i in np.arange((np.array(candidates).transpose()).shape[1]):
barchartdata = np.unique(np.array(candidates).transpose()[:,i])
barchartdata = barchartdata.reshape(2, len(barchartdata)//2)
print (barchartdata)
x_pos = np.arange(barchartdata.size/2)
print (x_pos)
print (barchartdata[0,:])
plt.bar(x_pos + 5 * i, barchartdata[0,:])
plt.show()
plt.xticks(x_pos, ('g1','g2','g3','g4'))
plt.yticks('Count')
plt.show()
x1 = 'GXXXXGK'
x2 = 'DXXG'
x3 = 'NKXD'
x4 = 'EXSAX'
protein = 'MAKGEFIRTKPHVNVGTIGHVDHGKTTLTAALTYVAAAENPNVEVKDYGEIDKAPEERARGITINTAHVEYETAKRHYSHVDCPGHADYIKNMITGAAQMDGAILVVSAADGPMPQTREHILLARQVGVPYIVVFMNKVDMVDDPELLDLVEMEVRDLLNQYEFPGDEVPVIRGSALLALEQMHRNPKTRRGENEWVDKIWELLDAIDEYIPTPVRDVDKPFLMPVEDVFTITGRGTVATGRIERGKVKVGDEVEIVGLAPETRKTVVTGVEMHRKTLQEGIAGDNVGVLLRGVSREEVERGQVLAKPGSITPHTKFEASVYVLKKEEGGRHTGFFSGYRPQFYFRTTDVTGVVQLPPGVEMVMPGDNVTFTVELIKPVALEEGLRFAIREGGRTVGAGVVTKILE'
H(protein,x1,x2,x3,x4)
当我在撰写本文时执行您问题中可见的代码时,我会得到输出
((3, 'GEFIRTK'), (57, 'RARG'), (136, 'NKVD'), (172, 'RGSAL'))
((3, 'GEFIRTK'), (81, 'DCPG'), (136, 'NKVD'), (172, 'RGSAL'))
((18, 'GHVDHGK'), (81, 'DCPG'), (136, 'NKVD'), (172, 'RGSAL'))
((18, 'GHVDHGK'), (87, 'DYIK'), (136, 'NKVD'), (172, 'RGSAL'))
((18, 'GHVDHGK'), (92, 'MITG'), (136, 'NKVD'), (172, 'RGSAL'))
您是否完全确定您在相同数据上运行了两个代码?请注意,您的代码使用的是('GEFIRTK', 3) ('RARG', 57) ('NKVD', 136) ('RGSAL', 172)
('GEFIRTK', 3) ('DCPG', 81) ('NKVD', 136) ('RGSAL', 172)
('GHVDHGK', 18) ('DCPG', 81) ('NKVD', 136) ('RGSAL', 172)
('GHVDHGK', 18) ('DYIK', 87) ('NKVD', 136) ('RGSAL', 172)
('GHVDHGK', 18) ('MITG', 92) ('NKVD', 136) ('RGSAL', 172)
硬连接到代码中,而我的代码 则使用示例CSV文件内容中的数据。前者以'MAKG'开头,后者以'MTEY'开头。这些数据不一样,所以你会期望它们给出不同的结果!
请注意,我上面显示的代码的编辑版本已恢复使用代码本身的硬连线protein
,会为您提供相同的结果
以下是一个代码示例,演示了如何从CSV文件中读取蛋白质数据:
protein