优化BIG .csv文件的搜索

时间:2013-09-19 14:25:30

标签: python regex optimization csv

我正在编写一个Python脚本,它将从.csv文件中获取字符串并以特定方式剪切它们(“在没有P后跟着R和K后切割”),允许缺少最多两个切割位点和将生成的字符串写入新的.csv文件。这就像一个魅力,但...... ...

然后我需要这些字符串的每一个并搜索另一个.csv文件(大约有725000个条目)并查看字符串是否列在大文件中。如果是,则将它们写入单独的文件。我已成功完成此操作(请参阅下面的代码),但它速度超慢......我将大文件切换到大约2000个条目而不是725000,并且需要15秒(意味着整个文件大约需要90分钟)。这太慢了!如何减少这个计算时间?

import csv
import re
import time

# Input should be a .csv file with 2 columns (PrEST ID, PrEST Sequence)
in_file = open('Tryptic Sequences Input.csv','r')
in_file1 = open('Reference Peptides (ENSG, Björn) TEST.csv','r')
out_file = open('Tryptic Sequences Output.csv','w+')
out_file1 = open('Tryptic Sequences Output (non-unique peptides).csv','w+')

# Reader/Writer iterables
reader = csv.reader(in_file)
reader1 = csv.reader(in_file1)
in_list = list(reader)
in_list1 = list(reader1)
writer = csv.writer(out_file)
writer1 = csv.writer(out_file1)
headers = ('PrEST','Peptide')
writer.writerow(headers)
writer1.writerow(headers)

# Initiate variables
Peptide_list = [] # List for Peptides (resets for each PrEST)
ID_list = [] # List for PrEST IDs (resets for each PrEST)
Copy_list = [] # List for non-unique tryptic peptides
Copy_ID_list = []
Peptide = '' # Current peptide (no missed cleavages)
Peptide_MC1 = '' # Current peptide with 1 missed cleavage
Peptide_MC2 = '' # Current peptide with 2 missed cleavages
MC1 = 'N'
MC2 = 'N'
Unique = 'Y'

t0 = time.clock()

# ------ Main PrEST for-loop -------
for row in range(len(in_list)): # For every PrEST (row)
    First = 'Y'
    PrEST_seq = in_list[row][1]

    # -------- Main AA-reader for-loop --------
    for n in range(len(PrEST_seq)): # For every AA in every PrEST

        if ((PrEST_seq[n:n+1] == 'R' or
             PrEST_seq[n:n+1] == 'K') and
             PrEST_seq[n+1:n+2] != 'P'):
            if First != 'Y': # Does not count first peptide + MCs (part of ABP)
                Peptide += PrEST_seq[n:n+1]
                if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA

 # KEY PART ---------------------------------------------------------------------

                    # Searches for non-unique peptides from in_file1
                    for line in range(len(in_list1)):
                        if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide,in_list1[line][2]) != None:
                            Unique = 'N'
                            Copy_ID_list.append(in_list[row][0])
                            Copy_list.append(Peptide)
                            break
                    if Unique == 'Y':
                        ID_list.append(in_list[row][0])
                        Peptide_list.append(Peptide)

 # (repeated twice below) --------------------------------------------------------

                Unique = 'Y' # Resets variable

                # -------- One missed cleavage while-loop --------
                Peptide_MC1 = Peptide
                m = n
                while MC1 == 'N' and m+1 <= len(PrEST_seq):
                    m += 1
                    if ((PrEST_seq[m:m+1] == 'R' or
                         PrEST_seq[m:m+1] == 'K') and
                         PrEST_seq[m+1:m+2] != 'P'):
                        Peptide_MC1 += PrEST_seq[m:m+1]
                        if len(Peptide_MC1) >= 6:

                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC1,in_list1[line][2]) != None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC1)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC1)
                        Unique = 'Y'
                        MC1 = 'Y'
                    else:
                        Peptide_MC1 += PrEST_seq[m:m+1]
                    # ------------- End MC1 while-loop ------------

                # -------- Two missed cleavages while-loop --------
                Peptide_MC2 = Peptide_MC1
                k = m
                while MC2 == 'N' and k+1 <= len(PrEST_seq):
                    k += 1
                    if ((PrEST_seq[k:k+1] == 'R' or
                         PrEST_seq[k:k+1] == 'K') and
                         PrEST_seq[k+1:k+2] != 'P'):
                        Peptide_MC2 += PrEST_seq[k:k+1]
                        if len(Peptide_MC2) >= 6:

                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC2,in_list1[line][2]) != None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC2)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC2)
                        Unique = 'Y'
                        MC2 = 'Y'
                    else:
                        Peptide_MC2 += PrEST_seq[k:k+1]
                    # ------------ End MC2 while-loop -------------

                # Resets variables
                Peptide = ''
                Peptide_MC1 = ''
                Peptide_MC2 = ''
                MC1 = 'N'
                MC2 = 'N'
            elif First == 'Y': # Doesn't count first cleavage (contains ABP)
                Peptide = ''
                First = 'N'
        else: # Non-cleavable AAs - Peptide grows
            Peptide += PrEST_seq[n:n+1]

        # ------- End main AA-reader for-loop --------

    Peptide_list.sort(key=len, reverse=True) # Sorts list by length
    for j in range(len(Peptide_list)): # Writes current PrEST to file
        Collected_list = (ID_list[j],Peptide_list[j])
        writer.writerow(Collected_list)
    Peptide_list = []
    ID_list = []

    Copy_list.sort(key=len, reverse=True)
    for j in range(len(Copy_list)):
        Collected_list = (Copy_ID_list[j],Copy_list[j])
        writer1.writerow(Collected_list)
    Copy_list = []
    Copy_ID_list = []

    # ----- End main PrEST-reader for-loop -----      
print('------- Finished -------')
print('Total time',time.clock()-t0,'seconds')
out_file.close()
out_file1.close()

我对Python和编程很新,我很确定我的代码缺乏多种方式。如果我不包括搜索大的.csv文件,它会很快,但我需要那部分。我不知道它在搜索部分是否会更快,或者在其他地方也可能更快。

0 个答案:

没有答案