python中的奇怪错误,需要帮助调试

时间:2016-08-08 09:08:54

标签: python multiprocessing

我有一个更大的脚本,试图计算一些科学的东西。代码工作(在开始时),但经过一段时间后停止。错误不同。有时我得到Too many opened files错误,这很奇怪,因为我几乎不打开7-8个文件,而且我正在使用with open所以必须关闭它。有时在7-8-9迭代之后,脚本内存耗尽,这很奇怪,因为在之前的转弯中它几乎不使用5 GB,服务器有500 GB的内存。

代码是~500行单独广告,它看起来像这样:

import math
import multiprocessing
import os
import shutil
import subprocess
import sys
import time
import gc
import numpy as np
import pandas as pd
from scipy.optimize import minimize

from biopy import energy


class InnerCycle:
    def __init__(self, work_dir):
        self.CPU = multiprocessing.cpu_count() - 1
        self.ROSETTA_DIR = '/dlab/home/gerdos/bin/rosetta_src_2016.13.58602_bundle/main'
        self.INPUT_UNBOUND = "/dlab/data/analysis/STRUCTURES/LC8/5e0m_no_ligand.pdb"
        self.WORK_DIR = work_dir
        self.ENERGY_ATTRIBUTES = ['fa_atr', 'fa_rep', 'fa_sol', 'fa_elec', 'hbond_sr_bb', 'hbond_lr_bb', 'hbond_bb_sc', 'hbond_sc', 'rama', 'omega', 'fa_dun', 'p_aa_pp']
        self.PEPTIDE_KD = {
            # SPR meresek
            "RDGSTQTE": 1e-6,
            "EDKATQTL": 2e-6,
            "NTKMTQTP": 2e-6,
            "YNKEVQTV": 3e-6,
            "GDASTQTD": 7e-6,
            "QNKECQTE": 1.35e-6,
            # Chica
            "IDAATQTE": 4e-7,
            "SEVGTQTS": 6.3e-6,
            "ACAGTQTA": 1e-2,
            "RSTTTQTD": 5.2e-6,
            # PAK1
            "RDVATSPI": 4.2e-5,
            # Myosin 5
            # "DDKNTMTD": 8.8e-6
            # Tamas talalt
            "GDKSTQTD": 5.6e-7,
            "VDKSTQTD": 7e-8,
            "GSRGTQTE": 1.64e-6,
            "VSRGTQTE": 8e-8,
            "ICAGTQTD": 1.14e-6,
            "STTGTQCD": 12.8e-6,
            # Random peptides
            "EQKSNDAS": 1e-2,
            "GSAEIRAH": 1e-2,
            "RRQVKDNR": 1e-2,
            "IDTGITVS": 1e-2,
            "MELIWNLC": 1e-2,
            "CLDWMTMP": 1e-2,
            "LRQGLLPV": 1e-2,
            "NEAYPCPC": 1e-2,
            "NADNGTTV": 1e-2,
            "QALSQHVW": 1e-2,
            "VHTGCKDF": 1e-2,
            "PGVAPTLV": 1e-2,
            "PSIPISRQ": 1e-2,
            "PPGPEKVP": 1e-2,
            "KDFINFIS": 1e-2,
            "EILMQQFL": 1e-2,
            "ERAELQAG": 1e-2,
            "LGVASEDE": 1e-2,
            "SSVSTSNA": 1e-2,
            "GMDLDLAS": 1e-2,
            "VSMLSAIR": 1e-2,
            "GTYRCEAT": 1e-2,
            "PQNGGSSD": 1e-2,
            "RKAAVLIF": 1e-2,
            "QSTIPEHI": 1e-2,
            "LGLDKRQS": 1e-2,
            "FWDVVKRQ": 1e-2,
            "NHNQKCKE": 1e-2,
            "RCKFFSLT": 1e-2,
            "ASPRTVTI": 1e-2,
            # SPR 2. round
            "LDKQTQTP": 4e-6,
            "VDKETNTE": 8e-6,
            "VPKQTQTP": 5e-6,
            "YSKETQTP": 8e-6,
            "VDKAVQCS": 7e-6,
            "KDTGVQTD": 1e-5,
            "QHKECQTD": 9e-6,
            "RDQAVQTG": 9e-6
        }
        print("Creating the structures ...")
        # start = time.time()
        for _ in p.imap_unordered(self.main_func, self.PEPTIDE_KD.keys()):
            # print("Time elapsed on {} : {:.2f}min)".format(x, int(time.time() - start) / 60))
            pass
        print("Minimizing the domain...")
        proc = subprocess.Popen(
            '{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}/domain_minimized.silent -out:file:silent_struct_type binary -s {} -score:weights {}/weights.wts'.format(
                self.ROSETTA_DIR, self.WORK_DIR, self.INPUT_UNBOUND, self.WORK_DIR), shell=True,
            stdout=subprocess.PIPE)
        out, err = proc.communicate()
        if err:
            sys.exit(err.decode())
        print("Structures are ready!")
        print('Reading energies to calculate the partition functions ...')
        # Reading the energies parallel
        self.COMPLEX_ENERGY = {}
        self.LIGAND_ENERGY = {}
        self.DOMAIN_ENERGY = energy('{}/domain_minimized.silent'.format(self.WORK_DIR))

        for x in p.imap_unordered(self.read_energies, self.PEPTIDE_KD.keys()):
            self.COMPLEX_ENERGY[x[0]] = x[1]
            self.LIGAND_ENERGY[x[0]] = x[2]
        #
        print('Finished reading energies ...')
        print('Minimizing the energy weights ...')
        bound = [(0, None) for _ in self.ENERGY_ATTRIBUTES]
        start_points = [1 for _ in self.ENERGY_ATTRIBUTES]
        # start_points = [0.049454984778022501, 1.0811964401313245e-16, 0.019417178401552562, 0.74966259917739431, -9.8627227576030013e-18, 0.7190273821671308, 0.81862557829921956,
        #                 0.82527639772869144, 0.48343139163199061, 0.73645662699706216, 0.4707545503593048, 1.705196654171699]

        res = minimize(self.minimizing_func, np.array(start_points), method='SLSQP', bounds=bound, options={'disp': True})
        print()
        print('Minimization done!')
        print('Final weights:')
        print("--------")
        print(list(res.x))
        print("--------")
        print('PEPTIDE\tPredicted Kd\tExperimantal Kd\tLogDif')
        self.minimizing_func([1 for _ in self.ENERGY_ATTRIBUTES], prt=True)
        print()
        print('Energy weights:')
        self.ener_dict = {}
        for num, j in enumerate(self.ENERGY_ATTRIBUTES):
            print(j, '\t{:.4f}'.format(res.x[num]))
            self.ener_dict[j] = res.x[num]

    def get_result(self):
        return self.ener_dict

    def mutate_ligand(self, seq):
        # tm = time.time()
        if os.path.isfile("input.pdb"):
            return
        # print("Mutating the ligand to {} ...".format(seq))
        with open("resfile", "w") as file_name:
            file_name.write("NATRO # default command that applies to everything without a non- default setting; do not repack\nSTART\n")
            for num, j in enumerate(seq):
                file_name.write("{} L PIKAA {} # Change 468 pos in A chain to Alanine\n".format(num + 468, j))
            proc = subprocess.Popen(
                '{}/source/bin/fixbb.linuxgccrelease -s /dlab/data/analysis/STRUCTURES/LC8/5e0m_prep.pdb -nstruct 1 -ex1 -ex2 -resfile resfile {}.pdb'.format(self.ROSETTA_DIR, seq,
                                                                                                                                                              seq),
                shell=True, stdout=subprocess.PIPE)
            _, err = proc.communicate()
        if err:
            sys.exit(err)
        os.rename("5e0m_prep_0001.pdb", "input.pdb")
        # print("Mutation of {} done in {:.2f} minutes".format(seq, (time.time() - tm) / 60))
        return

    def prepack(self, seq, overwrite=False):
        # tm = time.time()
        if os.path.isfile("prepacked.pdb") and not overwrite:
            # print("Preparing is done!\n")
            return
        # print("Prepacking the structure ...")
        proc = subprocess.Popen(
            '{1}/source/bin/FlexPepDocking.linuxgccrelease -database {1}/database -s {2} -flexpep_prepack -ex1 -ex2 -unboundrot {3} -score:weights weights.wts'.format(
                self.CPU, self.ROSETTA_DIR, "input.pdb", self.INPUT_UNBOUND), shell=True, stdout=subprocess.PIPE)
        _, err = proc.communicate()
        if err:
            sys.exit(err.decode())
        os.rename("input_0001.pdb", "prepacked.pdb")
        # print("Preparing of {} done in {:.2f} minutes!".format(seq, (time.time() - tm) / 60))
        return

    def dock(self, seq, overwrite=False):
        # tm = time.time()
        if os.path.isfile("docking_results.silent") and not overwrite:
            # print("Docking is done!\n")
            return
        # print("Docking the prepared structure (prepacked.pdb) ...")
        proc = subprocess.Popen(
            '{1}/source/bin/FlexPepDocking.linuxgccrelease -database {1}/database -s {2} -ex1 -ex2 -out:file:silent {3}.silent -out:file:silent_struct_type binary -pep_refine -nstruct 100 -unboundrot {4} -score:weights weights.wts'.format(
                self.CPU, self.ROSETTA_DIR, "prepacked.pdb", "docking_results", self.INPUT_UNBOUND),
            shell=True, stdout=subprocess.PIPE)
        _, err = proc.communicate()
        if err:
            sys.exit(err.decode())
        # print("Docking of {} done in {:.2f} minutes!".format(seq, (time.time() - tm) / 60))
        return

    # def random_ligand_generation(self):
    #     tm = time.time()
    #     if not os.path.isfile("input_L.pdb"):
    #         proc = subprocess.Popen(
    #             'python /dlab/home/gerdos/pywork/STRUCTURE/chain_splitter.py input.pdb L',
    #             shell=True, stdout=subprocess.PIPE)
    #         proc.communicate()
    #     if not os.path.isfile("ligand.silent"):
    #         proc = subprocess.Popen(
    #             '{}/source/bin/FlexPepDocking.linuxgccrelease -s input_L.pdb -pep_fold_only -torsionsMCM -nstruct 100 --out:file:silent ligand.silent'.format(self.ROSETTA_DIR),
    #             shell=True, stdout=subprocess.PIPE)
    #         proc.communicate()
    #     print("Random ligand generation with FlexPepDock is done in {:.2f} minutes!".format((time.time() - tm) / 60))

    def ligand_prep_fm(self, seq):
        tm = time.time()
        # Check if every minimization is done
        trg = False
        for num in range(100):
            if not os.path.isfile("{}/{}/RANDOM_LIGANDS/{}.silent".format(self.WORK_DIR, seq, str(num) + "a_8_minimized")):
                trg = True
        if not trg:
            return

        # Generating the random peptides
        if not os.path.isdir("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}".format(seq)):
            shutil.copytree("/dlab/home/gerdos/bin/flexiblemeccano/simulations/sample", "/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}".format(seq))
            info_text = """Number_of_Amino_Acids                8
    Sequence_File                        /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Input/sequence.txt
    Contact_Restriction                  no
    Contacts_File                        /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Input/contacts.txt

    Number_of_Conformers                 100
    Output_Directory                     /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}
    J_Calculation                        no
    PRE_Calculation                      no
    RDC_Calculation                      yes

    Phi/Psi_Database                     /dlab/home/gerdos/bin/flexiblemeccano/database/database_options/phi_psi.txt

    Print_PDB                            yes

    J_Coupling                           - - -

    Number_of_Cysteines                  -
    Cysteine_Positions                   -
    Dynamic                              -
    Proton                               -
    Proton_Frequency                     - -
    Intensity                            -
    Intrinsic_Linewidth_Proton           -

    Global_Tensor                        yes
    N_HN                                 yes
    CA_HA                                yes
    CA_CO                                yes
    N_CO                                 yes
    HN_CO                                yes""".format(seq)

            sequence_text = ""
            for num, j in enumerate(list(seq)):
                sequence_text += "{}     {}  0     0.00     0.00  0.00    0.00\n".format(num + 1, j)
            with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Input/info.txt".format(seq), "w") as file_name:
                file_name.write(info_text)
            with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Input/sequence.txt".format(seq), "w") as file_name:
                file_name.write(sequence_text)
            proc = subprocess.Popen(
                '/dlab/home/gerdos/bin/flexiblemeccano/applications/flexible_meccano ~/bin/flexiblemeccano/simulations/{}/ /usr/bin/gnuplot'.format(seq), shell=True,
                stdout=subprocess.PIPE)
            res, err = proc.communicate()
            if err:
                sys.exit(err.decode())
            # PDB files generated by FM have a mistake, correcting it
            for num in range(100):
                with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Output/PDB/{}a_8.pdb".format(seq, num), "r") as file_name:
                    pseu_str = file_name.readlines()
                for j, line in enumerate(pseu_str):
                    if line.startswith("ATOM"):
                        if int(line.split()[1]) >= 54:
                            line = list(line)
                            line.insert(11, " ")
                    pseu_str[j] = "".join(line)
                with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Output/PDB/{}a_8.pdb".format(seq, num), "w") as file_name:
                    file_name.write("".join(pseu_str))
        if not os.path.isdir('{}/{}/RANDOM_LIGANDS'.format(self.WORK_DIR, seq)):
            shutil.copytree("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Output/PDB".format(seq), "{}/{}/RANDOM_LIGANDS".format(self.WORK_DIR, seq))
        # Simulation
        for num in range(100):
            if not os.path.isfile("{}/{}/RANDOM_LIGANDS/{}.silent".format(self.WORK_DIR, seq, str(num) + "a_8_minimized")):
                proc = subprocess.Popen(
                    '{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}/{}/RANDOM_LIGANDS/{}.silent -out:file:silent_struct_type binary -s {}/{}/RANDOM_LIGANDS/{} -score:weights weights.wts'.format(
                        self.ROSETTA_DIR, self.WORK_DIR, seq, str(num) + "a_8_minimized", self.WORK_DIR, seq, str(num) + "a_8.pdb"), shell=True, stdout=subprocess.PIPE)
                out, err = proc.communicate()
                if err:
                    sys.exit(err.decode())
        # print("Random ligand generation with FlexibleMeccano is done in {:.2f} minutes!".format((time.time() - tm) / 60))
        return

    def complex_minimization(self):
        # extract the docked PDBs
        if not os.path.isdir('COMPLEX_STRUCTURES'):
            os.mkdir('COMPLEX_STRUCTURES')
        os.chdir('COMPLEX_STRUCTURES')
        shutil.copy('../weights.wts', '.')
        # Check id PDBs have been extracted before
        tgl = False
        for num in range(100):
            if not os.path.isfile('prepacked_{:04d}.pdb'.format(num + 1)):
                tgl = True
        if tgl:
            proc = subprocess.Popen(
                '{}/source/bin/extract_pdbs.linuxgccrelease -in:file:silent ../docking_results.silent'.format(
                    self.ROSETTA_DIR), shell=True, stdout=subprocess.PIPE)
            proc.communicate()
        # Minimizing the complexes
        for num in range(100):
            if not os.path.isfile("{}.silent".format('prepacked_{:04d}_minimized'.format(num + 1))):
                proc = subprocess.Popen(
                    '{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}.silent -out:file:silent_struct_type binary -s {} -score:weights weights.wts'.format(
                        self.ROSETTA_DIR, 'prepacked_{:04d}_minimized'.format(num + 1), 'prepacked_{:04d}.pdb'.format(num + 1)), shell=True, stdout=subprocess.PIPE)
                out, err = proc.communicate()
                if err:
                    sys.exit(err.decode())

    def read_energies(self, motif):
        # tm = time.time()
        os.chdir("{}/{}".format(self.WORK_DIR, motif))
        complx_ener = pd.DataFrame()
        for num in range(100):
            complx_ener = complx_ener.append(energy('COMPLEX_STRUCTURES/prepacked_{:04d}_minimized.silent'.format(num + 1))[self.ENERGY_ATTRIBUTES])
        lig_ener = pd.DataFrame()
        for num in range(100):
            lig_ener = lig_ener.append(energy('RANDOM_LIGANDS/{}.silent'.format(str(num) + "a_8_minimized"))[self.ENERGY_ATTRIBUTES])
        # print("Read energies for {} in {:.2f} minutes!".format(motif, (time.time() - tm) / 60))
        return motif, complx_ener, lig_ener

    @staticmethod
    def sigmoid(x):
        # return x
        # return math.log(x, math.e)
        return -math.tanh(0.15 * (math.log(x, math.e) - math.log(1e-4, math.e)))

    def minimizing_func(self, weights_lst, prt=False):
        # if not prt:
        #     print(weights_lst)
        res = 0
        # Energy DataFarmes should not me modified globally
        domain_ener = self.DOMAIN_ENERGY.copy()

        # Domain partial function calculation, independent of peptide
        domain_partial_function = np.float128()
        for num, j in enumerate(self.ENERGY_ATTRIBUTES):
            domain_ener.loc[:, j] *= weights_lst[num]
        for num, j in domain_ener.iterrows():
            domain_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
        # Ligand and complex partial functions are depending on the peptide
        for inner_key, kd_experimental in self.PEPTIDE_KD.items():
            # Copy the pd object, as it is immutable
            complex_ener = self.COMPLEX_ENERGY[inner_key].copy(deep=True)
            ligand_ener = self.LIGAND_ENERGY[inner_key].copy(deep=True)
            # Weighting the energies
            for num, j in enumerate(self.ENERGY_ATTRIBUTES):
                complex_ener.loc[:, j] *= weights_lst[num]
                ligand_ener.loc[:, j] *= weights_lst[num]
            # Complex partial function
            complex_partial_function = np.float128()
            for num, j in complex_ener.iterrows():
                complex_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
            # Ligand partial function
            ligand_partial_function = np.float128()
            for num, j in ligand_ener.iterrows():
                ligand_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
            try:
                res += (self.sigmoid(ligand_partial_function * domain_partial_function / complex_partial_function) - self.sigmoid(kd_experimental)) ** 2
            except ValueError:
                return np.NaN
            if prt:
                print('{}\t{:.4e}\t{:.4e}\t{:.4e}'.format(inner_key, ligand_partial_function * domain_partial_function / complex_partial_function, kd_experimental,
                                                          abs(math.log(ligand_partial_function * domain_partial_function / complex_partial_function, 10) - math.log(kd_experimental,
                                                                                                                                                                    10))))

        print('\rDifference: {:.4f}'.format(res),end='')
        return res

    def main_func(self, seq):
        # Creating the directory isf not exists, and go in there
        if not os.path.isdir("{}/{}".format(self.WORK_DIR, seq)):
            os.mkdir("{}/{}".format(self.WORK_DIR, seq))
        os.chdir("{}/{}".format(self.WORK_DIR, seq))
        shutil.copy('{}/weights.wts'.format(self.WORK_DIR), '.')
        self.mutate_ligand(seq)
        self.prepack(seq)
        self.dock(seq)

        # Ligand generation with FlexPepDock
        # random_ligand_generation()

        # Ligand generation with FlexibleMeccano
        self.ligand_prep_fm(seq)
        self.complex_minimization()
        return seq

        # def plotData(var, x, y, color1, u, v, color2):
        #     pearR = np.corrcoef(x, y)[1, 0]
        #     # least squares from:
        #     # http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html
        #     A = np.vstack([x, np.ones(len(x))]).T
        #     m, c = np.linalg.lstsq(A, np.array(y))[0]
        #     plt.figure(1)
        #     plt.subplot(211)
        #     # plt.xlim([min(x) * 1.1, max(x) * 0.9])
        #     plt.ylim([-8, -2])
        #     plt.scatter(x, y, label='Regi', color=color1)
        #     plt.legend(loc=2)
        #     plt.xlabel("log10(Kd)")
        #     plt.ylabel(var)
        #     plt.subplot(212)
        #     plt.scatter(u, v, label='Uj', color=color2)
        #     # plt.plot(x, x * m + c, color=color,
        #     #          label="R = %6.2e" % (pearR))
        #     plt.xlabel("log10(Kd)")
        #     plt.ylabel(var)
        #     # plt.xlim([min(x) * 1.1, max(x) * 0.9])
        #     plt.ylim([-8, -2])
        #     plt.legend(loc=2)
        #     plt.show()


INITIAL_WEIGHTS = {
    'fa_atr': 1,
    'fa_rep': 0.55,
    'fa_sol': 0.9375,
    'fa_intra_rep': 0.005,
    'fa_elec': 0.875,
    'pro_close': 0,
    'hbond_sr_bb': 1.17,
    'hbond_lr_bb': 1.17,
    'hbond_bb_sc': 1.17,
    'hbond_sc': 1.1,
    'dslf_fa13': 0,
    'rama': 0.25,
    'omega': 0.625,
    'fa_dun': 0.7,
    'p_aa_pp': 0.4,
    'yhh_planarity': 0,
    'ref': 0
}
new_energies = INITIAL_WEIGHTS
p = multiprocessing.Pool(processes=multiprocessing.cpu_count()-1)
print('Found {} cores to use'.format(multiprocessing.cpu_count() - 1))


START_POINT = 7
if os.path.isfile(('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(START_POINT - 1))):
    with open('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(START_POINT - 1)) as fh:
        for line in fh:
            new_energies[line.split()[0]] = float(line.split()[1].strip())

for i in range(START_POINT, 50):

    try:
        os.mkdir('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}'.format(i))
    except FileExistsError:
        pass
    print('Running the inner cycle with weights:')
    for key, val in sorted(new_energies.items()):
        print('{}\t{}'.format(key, val))
    print('----------------')
    if not os.path.isfile('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(i)):
        with open('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(i), 'w') as fn:
            for key, val in new_energies.items():
                fn.write("{} {}".format(key, val))
                fn.write('\n')
    returned_ener = InnerCycle('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}'.format(i)).get_result()
    for key, val in returned_ener.items():
        new_energies[key] = (1 / (i + 1) * returned_ener[key]) + (new_energies[key] * (1 - (1 / (i + 1))))
    print('----------------')
    print('Cycle {} complete'.format(i))
    print('----------------')
    gc.collect()

我知道它很长,而且大多数人都没有时间来调试我的失败。我是一名生物学家,而且我还没有很好的编程技巧。

我不明白为什么我的内存不足。我该如何调试这样的错误?

简而言之,代码运行如下:

  1. 进行模拟(使用multiprocessing
  2. 加权结果 模拟(数据收集是并行完成的 与multiprocessing
  3. 使用scipy
  4. 最小化权重
  5. 使用给出最小化结果运行模拟,然后继续 2.步骤
  6. 谢谢!

0 个答案:

没有答案