我正在编写一个Python脚本,它将从.csv文件中获取字符串并以特定方式剪切它们(“在没有P后跟着R和K后切割”),允许缺少最多两个切割位点和将生成的字符串写入新的.csv文件。这就像一个魅力,但...... ...
然后我需要这些字符串的每一个并搜索另一个.csv文件(大约有725000个条目)并查看字符串是否列在大文件中。如果是,则将它们写入单独的文件。我已成功完成此操作(请参阅下面的代码),但它速度超慢......我将大文件切换到大约2000个条目而不是725000,并且需要15秒(意味着整个文件大约需要90分钟)。这太慢了!如何减少这个计算时间?
import csv
import re
import time
# Input should be a .csv file with 2 columns (PrEST ID, PrEST Sequence)
in_file = open('Tryptic Sequences Input.csv','r')
in_file1 = open('Reference Peptides (ENSG, Björn) TEST.csv','r')
out_file = open('Tryptic Sequences Output.csv','w+')
out_file1 = open('Tryptic Sequences Output (non-unique peptides).csv','w+')
# Reader/Writer iterables
reader = csv.reader(in_file)
reader1 = csv.reader(in_file1)
in_list = list(reader)
in_list1 = list(reader1)
writer = csv.writer(out_file)
writer1 = csv.writer(out_file1)
headers = ('PrEST','Peptide')
writer.writerow(headers)
writer1.writerow(headers)
# Initiate variables
Peptide_list = [] # List for Peptides (resets for each PrEST)
ID_list = [] # List for PrEST IDs (resets for each PrEST)
Copy_list = [] # List for non-unique tryptic peptides
Copy_ID_list = []
Peptide = '' # Current peptide (no missed cleavages)
Peptide_MC1 = '' # Current peptide with 1 missed cleavage
Peptide_MC2 = '' # Current peptide with 2 missed cleavages
MC1 = 'N'
MC2 = 'N'
Unique = 'Y'
t0 = time.clock()
# ------ Main PrEST for-loop -------
for row in range(len(in_list)): # For every PrEST (row)
First = 'Y'
PrEST_seq = in_list[row][1]
# -------- Main AA-reader for-loop --------
for n in range(len(PrEST_seq)): # For every AA in every PrEST
if ((PrEST_seq[n:n+1] == 'R' or
PrEST_seq[n:n+1] == 'K') and
PrEST_seq[n+1:n+2] != 'P'):
if First != 'Y': # Does not count first peptide + MCs (part of ABP)
Peptide += PrEST_seq[n:n+1]
if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA
# KEY PART ---------------------------------------------------------------------
# Searches for non-unique peptides from in_file1
for line in range(len(in_list1)):
if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide,in_list1[line][2]) != None:
Unique = 'N'
Copy_ID_list.append(in_list[row][0])
Copy_list.append(Peptide)
break
if Unique == 'Y':
ID_list.append(in_list[row][0])
Peptide_list.append(Peptide)
# (repeated twice below) --------------------------------------------------------
Unique = 'Y' # Resets variable
# -------- One missed cleavage while-loop --------
Peptide_MC1 = Peptide
m = n
while MC1 == 'N' and m+1 <= len(PrEST_seq):
m += 1
if ((PrEST_seq[m:m+1] == 'R' or
PrEST_seq[m:m+1] == 'K') and
PrEST_seq[m+1:m+2] != 'P'):
Peptide_MC1 += PrEST_seq[m:m+1]
if len(Peptide_MC1) >= 6:
for line in range(len(in_list1)):
if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC1,in_list1[line][2]) != None:
Unique = 'N'
Copy_ID_list.append(in_list[row][0])
Copy_list.append(Peptide_MC1)
break
if Unique == 'Y':
ID_list.append(in_list[row][0])
Peptide_list.append(Peptide_MC1)
Unique = 'Y'
MC1 = 'Y'
else:
Peptide_MC1 += PrEST_seq[m:m+1]
# ------------- End MC1 while-loop ------------
# -------- Two missed cleavages while-loop --------
Peptide_MC2 = Peptide_MC1
k = m
while MC2 == 'N' and k+1 <= len(PrEST_seq):
k += 1
if ((PrEST_seq[k:k+1] == 'R' or
PrEST_seq[k:k+1] == 'K') and
PrEST_seq[k+1:k+2] != 'P'):
Peptide_MC2 += PrEST_seq[k:k+1]
if len(Peptide_MC2) >= 6:
for line in range(len(in_list1)):
if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC2,in_list1[line][2]) != None:
Unique = 'N'
Copy_ID_list.append(in_list[row][0])
Copy_list.append(Peptide_MC2)
break
if Unique == 'Y':
ID_list.append(in_list[row][0])
Peptide_list.append(Peptide_MC2)
Unique = 'Y'
MC2 = 'Y'
else:
Peptide_MC2 += PrEST_seq[k:k+1]
# ------------ End MC2 while-loop -------------
# Resets variables
Peptide = ''
Peptide_MC1 = ''
Peptide_MC2 = ''
MC1 = 'N'
MC2 = 'N'
elif First == 'Y': # Doesn't count first cleavage (contains ABP)
Peptide = ''
First = 'N'
else: # Non-cleavable AAs - Peptide grows
Peptide += PrEST_seq[n:n+1]
# ------- End main AA-reader for-loop --------
Peptide_list.sort(key=len, reverse=True) # Sorts list by length
for j in range(len(Peptide_list)): # Writes current PrEST to file
Collected_list = (ID_list[j],Peptide_list[j])
writer.writerow(Collected_list)
Peptide_list = []
ID_list = []
Copy_list.sort(key=len, reverse=True)
for j in range(len(Copy_list)):
Collected_list = (Copy_ID_list[j],Copy_list[j])
writer1.writerow(Collected_list)
Copy_list = []
Copy_ID_list = []
# ----- End main PrEST-reader for-loop -----
print('------- Finished -------')
print('Total time',time.clock()-t0,'seconds')
out_file.close()
out_file1.close()
我对Python和编程很新,我很确定我的代码缺乏多种方式。如果我不包括搜索大的.csv文件,它会很快,但我需要那部分。我不知道它在搜索部分是否会更快,或者在其他地方也可能更快。