我正在尝试使用 set 按位置匹配两个字符串,但一直无法让它正常工作。
在这个函数中,我希望在匹配 x3 = 'NKXD' 时允许两处不匹配:
N 的位置可以是 T、A 或 G,
K 的位置可以是 L,
D 的位置可以是 E。
为此我已经定义了 spl_amino,但不确定它是否有效。
有人能建议一下如何解决这个问题吗?
修改后的脚本:
import csv
def match(X, Y, max_mismatch=1):
    """Return True when candidate ``Y`` fits pattern ``X`` position by position.

    ``X`` is a motif pattern in which the letter ``'X'`` is a wildcard that
    accepts any character; every other pattern letter must equal the
    character at the same index of ``Y``.

    Args:
        X: Pattern string, e.g. ``'GXXXXGK'``.
        Y: Candidate string to test against the pattern.
        max_mismatch: Number of non-wildcard positions allowed to disagree.
            Defaults to 1, the budget that used to be hard-coded.

    Returns:
        True if ``X`` and ``Y`` have the same length and at most
        ``max_mismatch`` positions disagree, otherwise False.
    """
    # zip() stops at the shorter input, so without this guard an empty or
    # truncated candidate would be accepted as a match.
    if len(X) != len(Y):
        return False
    mismatches = sum(1 for p, r in zip(X, Y) if p != 'X' and p != r)
    return mismatches <= max_mismatch
# Substitutions tolerated at a given pattern residue: N may appear as T, A or
# G; K may appear as L; D may appear as E.  Keyed by the pattern letter so a
# substitution only counts as "special" at the position it is meant for.
_G4_SUBSTITUTIONS = {
    'N': frozenset('TAG'),
    'K': frozenset('L'),
    'D': frozenset('E'),
}


def g4_match(X, Y, max_mismatch=1, max_special=1):
    """Match candidate ``Y`` against pattern ``X`` with tolerated substitutions.

    Works like a positional pattern match where ``'X'`` in the pattern is a
    wildcard, but disagreements are split into two separately budgeted kinds:

    * *special* mismatches, where the candidate character is one of the
      substitutions listed for that pattern letter in ``_G4_SUBSTITUTIONS``;
    * ordinary mismatches, for every other disagreement.

    Args:
        X: Pattern string, e.g. ``'NKXD'``.
        Y: Candidate string to test against the pattern.
        max_mismatch: Ordinary mismatches allowed (default 1).
        max_special: Special (tolerated-substitution) mismatches allowed
            (default 1).

    Returns:
        True if the strings have equal length and neither budget is
        exceeded, otherwise False.
    """
    # zip() would silently truncate to the shorter string.
    if len(X) != len(Y):
        return False

    no_substitutions = frozenset()
    mismatch, spl_mismatch = 0, 0
    for x, y in zip(X, Y):
        if x == 'X' or x == y:
            continue  # wildcard or exact agreement: nothing to count
        if y in _G4_SUBSTITUTIONS.get(x, no_substitutions):
            spl_mismatch += 1
        else:
            mismatch += 1
        if mismatch > max_mismatch or spl_mismatch > max_special:
            return False
    return True
def mean(arr):
    """Return the midpoint between the largest and smallest values of ``arr``.

    Despite the name this is the mid-range, not the arithmetic average: only
    the two extreme values take part.  An empty ``arr`` makes ``max()`` raise
    ``ValueError``.
    """
    highest = max(arr)
    lowest = min(arr)
    return (highest + lowest) / 2
def H(protein, x1, x2, x3, x4, protein_name, pdb_id, source):
    """Find spaced occurrences of four motifs in ``protein`` and log them.

    Every window of ``protein`` is tested against the patterns ``x1``..``x4``
    (``x1``, ``x2`` and ``x4`` with ``match``; ``x3`` with ``g4_match``).  A
    combination of one hit per pattern is kept when the gaps between the hit
    start positions fall into one of two accepted spacing layouts (see
    ``spacing_ok`` below).

    Side effect: appends to ``output_test.csv`` one row each for ``pdb_id``,
    ``protein_name``, ``protein`` and ``source``, followed by one row per
    accepted combination.

    Args:
        protein: Sequence string to scan.
        x1, x2, x3, x4: Motif pattern strings.
        protein_name, pdb_id, source: Values copied verbatim to the CSV.

    Returns:
        A list of 4-tuples ``(a, b, c, d)``; each element is a
        ``(start_index, matched_substring)`` pair for the corresponding
        pattern, with 0-based start indices into ``protein``.
    """
    def find_matches(x, matcher):
        """Return ``(matches, positions)`` for windows accepted by ``matcher``."""
        width = len(x)
        match_positions = []
        matches = []
        # +1 so the last window, ending on the final character, is tested too.
        for i in range(len(protein) - width + 1):
            candidate = protein[i:i + width]
            # Use the matcher that was passed in; calling the global ``match``
            # here meant ``g4_match`` was never applied to ``x3``.
            if matcher(x, candidate):
                match_positions.append(i)
                matches.append(candidate)
        return matches, match_positions

    def spacing_ok(ab, cd):
        """Check the a-b and c-d gaps against the two accepted layouts."""
        return ((40 <= ab <= 80 and 20 <= cd <= 80) or
                (80 <= ab <= 120 and 120 <= cd <= 180))

    L1, pL1 = find_matches(x1, match)
    L2, pL2 = find_matches(x2, match)
    L3, pL3 = find_matches(x3, g4_match)
    L4, pL4 = find_matches(x4, match)

    hits1 = list(zip(pL1, L1))
    hits2 = list(zip(pL2, L2))
    hits3 = list(zip(pL3, L3))
    hits4 = list(zip(pL4, L4))

    candidates = []
    for a in hits1:
        for b in hits2:
            ab = b[0] - a[0]
            # Both layouts need 40..120 here; skip the inner loops otherwise.
            if not 40 <= ab <= 120:
                continue
            for c in hits3:
                # Both layouts require the same b-c gap.
                if not 40 <= c[0] - b[0] <= 80:
                    continue
                for d in hits4:
                    if spacing_ok(ab, d[0] - c[0]):
                        candidates.append((a, b, c, d))

    with open('output_test.csv', 'a') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow([pdb_id])
        wr.writerow([protein_name])
        wr.writerow([protein])
        wr.writerow([source])
        for i in candidates:
            # The whole combination goes into a single cell; csv stringifies it.
            wr.writerow([i])
    return candidates
# Motif patterns handed to H(); the matchers treat the letter 'X' as a wildcard.
x1 = 'GXXXXGK'
x2 = 'DXXG'
x3 = 'NKXD'
x4 = 'EXSAX'

with open('input_file_1.csv') as infile:
    # NOTE(review): the explicit delimiter=',' overrides the tab delimiter of
    # csv.excel_tab, so rows are split on commas - confirm this matches the
    # real input file, and that the column indices below line up with it.
    lines = csv.reader(infile, delimiter=',', skipinitialspace=True, dialect=csv.excel_tab)
    # Skip the header row; the default stops an empty file raising StopIteration.
    next(lines, None)
    for line in lines:
        if not line:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        protein = line[2]
        protein_name = line[1]
        pdb_id = line[0]
        source = line[3]
        H(protein, x1, x2, x3, x4, protein_name, pdb_id, source)
输入的 CSV 文件内容类似如下:
S No. PDB ID Protein Name Sequence Source
1 121P H-RAS P21 PROTEIN MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQH Homo sapiens
2 1A12 REGULATOR OF CHROMOSOME CONDENSATION 1 RRSPPADAIPKSKKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENVMERKKPALVSIPEDVVQAEAGGMHTVCLSKSGQVYSFGCNDEGALGRDTSVEGSEMVPGKVELQEKVVQVSAGDSHTAALTDDGRVFLWGSFRDNNGVIGLLEPMKKSMVPVQVQLDVPVVKVASGNDHLVMLTADGDLYTLGCGEQGQLGRVPELFANRGGRQGLERLLVPKCVMLKSRGSRGHVRFQDAFCGAYFTFAISHEGHVYGFGLSNYHQLGTPGTESCFIPQNLTSFKNSTKSWVGFSGGQHHTVCMDSEGKAYSLGRAEYGRLGLGEGAEEKSIPTLISRLPAVSSVACGASVGYAVTKDGRVFAWGMGTNYQLGTGQDEDAWSPVEMMGKQLENRVVLSVSSGGQHTVLLVKDKEQS Homo sapiens