我有一个更大的脚本,试图计算一些科学的东西。代码工作(在开始时),但经过一段时间后停止。错误不同。有时我得到Too many opened files
错误,这很奇怪,因为我几乎不打开7-8个文件,而且我正在使用with open
所以必须关闭它。有时在7-8-9迭代之后,脚本内存耗尽,这很奇怪,因为在之前的转弯中它几乎不使用5 GB,服务器有500 GB的内存。
代码是~500行单独广告,它看起来像这样:
import math
import multiprocessing
import os
import shutil
import subprocess
import sys
import time
import gc
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from biopy import energy
class InnerCycle:
def __init__(self, work_dir):
self.CPU = multiprocessing.cpu_count() - 1
self.ROSETTA_DIR = '/dlab/home/gerdos/bin/rosetta_src_2016.13.58602_bundle/main'
self.INPUT_UNBOUND = "/dlab/data/analysis/STRUCTURES/LC8/5e0m_no_ligand.pdb"
self.WORK_DIR = work_dir
self.ENERGY_ATTRIBUTES = ['fa_atr', 'fa_rep', 'fa_sol', 'fa_elec', 'hbond_sr_bb', 'hbond_lr_bb', 'hbond_bb_sc', 'hbond_sc', 'rama', 'omega', 'fa_dun', 'p_aa_pp']
self.PEPTIDE_KD = {
# SPR meresek
"RDGSTQTE": 1e-6,
"EDKATQTL": 2e-6,
"NTKMTQTP": 2e-6,
"YNKEVQTV": 3e-6,
"GDASTQTD": 7e-6,
"QNKECQTE": 1.35e-6,
# Chica
"IDAATQTE": 4e-7,
"SEVGTQTS": 6.3e-6,
"ACAGTQTA": 1e-2,
"RSTTTQTD": 5.2e-6,
# PAK1
"RDVATSPI": 4.2e-5,
# Myosin 5
# "DDKNTMTD": 8.8e-6
# Tamas talalt
"GDKSTQTD": 5.6e-7,
"VDKSTQTD": 7e-8,
"GSRGTQTE": 1.64e-6,
"VSRGTQTE": 8e-8,
"ICAGTQTD": 1.14e-6,
"STTGTQCD": 12.8e-6,
# Random peptides
"EQKSNDAS": 1e-2,
"GSAEIRAH": 1e-2,
"RRQVKDNR": 1e-2,
"IDTGITVS": 1e-2,
"MELIWNLC": 1e-2,
"CLDWMTMP": 1e-2,
"LRQGLLPV": 1e-2,
"NEAYPCPC": 1e-2,
"NADNGTTV": 1e-2,
"QALSQHVW": 1e-2,
"VHTGCKDF": 1e-2,
"PGVAPTLV": 1e-2,
"PSIPISRQ": 1e-2,
"PPGPEKVP": 1e-2,
"KDFINFIS": 1e-2,
"EILMQQFL": 1e-2,
"ERAELQAG": 1e-2,
"LGVASEDE": 1e-2,
"SSVSTSNA": 1e-2,
"GMDLDLAS": 1e-2,
"VSMLSAIR": 1e-2,
"GTYRCEAT": 1e-2,
"PQNGGSSD": 1e-2,
"RKAAVLIF": 1e-2,
"QSTIPEHI": 1e-2,
"LGLDKRQS": 1e-2,
"FWDVVKRQ": 1e-2,
"NHNQKCKE": 1e-2,
"RCKFFSLT": 1e-2,
"ASPRTVTI": 1e-2,
# SPR 2. round
"LDKQTQTP": 4e-6,
"VDKETNTE": 8e-6,
"VPKQTQTP": 5e-6,
"YSKETQTP": 8e-6,
"VDKAVQCS": 7e-6,
"KDTGVQTD": 1e-5,
"QHKECQTD": 9e-6,
"RDQAVQTG": 9e-6
}
print("Creating the structures ...")
# start = time.time()
for _ in p.imap_unordered(self.main_func, self.PEPTIDE_KD.keys()):
# print("Time elapsed on {} : {:.2f}min)".format(x, int(time.time() - start) / 60))
pass
print("Minimizing the domain...")
proc = subprocess.Popen(
'{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}/domain_minimized.silent -out:file:silent_struct_type binary -s {} -score:weights {}/weights.wts'.format(
self.ROSETTA_DIR, self.WORK_DIR, self.INPUT_UNBOUND, self.WORK_DIR), shell=True,
stdout=subprocess.PIPE)
out, err = proc.communicate()
if err:
sys.exit(err.decode())
print("Structures are ready!")
print('Reading energies to calculate the partition functions ...')
# Reading the energies parallel
self.COMPLEX_ENERGY = {}
self.LIGAND_ENERGY = {}
self.DOMAIN_ENERGY = energy('{}/domain_minimized.silent'.format(self.WORK_DIR))
for x in p.imap_unordered(self.read_energies, self.PEPTIDE_KD.keys()):
self.COMPLEX_ENERGY[x[0]] = x[1]
self.LIGAND_ENERGY[x[0]] = x[2]
#
print('Finished reading energies ...')
print('Minimizing the energy weights ...')
bound = [(0, None) for _ in self.ENERGY_ATTRIBUTES]
start_points = [1 for _ in self.ENERGY_ATTRIBUTES]
# start_points = [0.049454984778022501, 1.0811964401313245e-16, 0.019417178401552562, 0.74966259917739431, -9.8627227576030013e-18, 0.7190273821671308, 0.81862557829921956,
# 0.82527639772869144, 0.48343139163199061, 0.73645662699706216, 0.4707545503593048, 1.705196654171699]
res = minimize(self.minimizing_func, np.array(start_points), method='SLSQP', bounds=bound, options={'disp': True})
print()
print('Minimization done!')
print('Final weights:')
print("--------")
print(list(res.x))
print("--------")
print('PEPTIDE\tPredicted Kd\tExperimantal Kd\tLogDif')
self.minimizing_func([1 for _ in self.ENERGY_ATTRIBUTES], prt=True)
print()
print('Energy weights:')
self.ener_dict = {}
for num, j in enumerate(self.ENERGY_ATTRIBUTES):
print(j, '\t{:.4f}'.format(res.x[num]))
self.ener_dict[j] = res.x[num]
def get_result(self):
return self.ener_dict
def mutate_ligand(self, seq):
# tm = time.time()
if os.path.isfile("input.pdb"):
return
# print("Mutating the ligand to {} ...".format(seq))
with open("resfile", "w") as file_name:
file_name.write("NATRO # default command that applies to everything without a non- default setting; do not repack\nSTART\n")
for num, j in enumerate(seq):
file_name.write("{} L PIKAA {} # Change 468 pos in A chain to Alanine\n".format(num + 468, j))
proc = subprocess.Popen(
'{}/source/bin/fixbb.linuxgccrelease -s /dlab/data/analysis/STRUCTURES/LC8/5e0m_prep.pdb -nstruct 1 -ex1 -ex2 -resfile resfile {}.pdb'.format(self.ROSETTA_DIR, seq,
seq),
shell=True, stdout=subprocess.PIPE)
_, err = proc.communicate()
if err:
sys.exit(err)
os.rename("5e0m_prep_0001.pdb", "input.pdb")
# print("Mutation of {} done in {:.2f} minutes".format(seq, (time.time() - tm) / 60))
return
def prepack(self, seq, overwrite=False):
# tm = time.time()
if os.path.isfile("prepacked.pdb") and not overwrite:
# print("Preparing is done!\n")
return
# print("Prepacking the structure ...")
proc = subprocess.Popen(
'{1}/source/bin/FlexPepDocking.linuxgccrelease -database {1}/database -s {2} -flexpep_prepack -ex1 -ex2 -unboundrot {3} -score:weights weights.wts'.format(
self.CPU, self.ROSETTA_DIR, "input.pdb", self.INPUT_UNBOUND), shell=True, stdout=subprocess.PIPE)
_, err = proc.communicate()
if err:
sys.exit(err.decode())
os.rename("input_0001.pdb", "prepacked.pdb")
# print("Preparing of {} done in {:.2f} minutes!".format(seq, (time.time() - tm) / 60))
return
def dock(self, seq, overwrite=False):
# tm = time.time()
if os.path.isfile("docking_results.silent") and not overwrite:
# print("Docking is done!\n")
return
# print("Docking the prepared structure (prepacked.pdb) ...")
proc = subprocess.Popen(
'{1}/source/bin/FlexPepDocking.linuxgccrelease -database {1}/database -s {2} -ex1 -ex2 -out:file:silent {3}.silent -out:file:silent_struct_type binary -pep_refine -nstruct 100 -unboundrot {4} -score:weights weights.wts'.format(
self.CPU, self.ROSETTA_DIR, "prepacked.pdb", "docking_results", self.INPUT_UNBOUND),
shell=True, stdout=subprocess.PIPE)
_, err = proc.communicate()
if err:
sys.exit(err.decode())
# print("Docking of {} done in {:.2f} minutes!".format(seq, (time.time() - tm) / 60))
return
# def random_ligand_generation(self):
# tm = time.time()
# if not os.path.isfile("input_L.pdb"):
# proc = subprocess.Popen(
# 'python /dlab/home/gerdos/pywork/STRUCTURE/chain_splitter.py input.pdb L',
# shell=True, stdout=subprocess.PIPE)
# proc.communicate()
# if not os.path.isfile("ligand.silent"):
# proc = subprocess.Popen(
# '{}/source/bin/FlexPepDocking.linuxgccrelease -s input_L.pdb -pep_fold_only -torsionsMCM -nstruct 100 --out:file:silent ligand.silent'.format(self.ROSETTA_DIR),
# shell=True, stdout=subprocess.PIPE)
# proc.communicate()
# print("Random ligand generation with FlexPepDock is done in {:.2f} minutes!".format((time.time() - tm) / 60))
def ligand_prep_fm(self, seq):
tm = time.time()
# Check if every minimization is done
trg = False
for num in range(100):
if not os.path.isfile("{}/{}/RANDOM_LIGANDS/{}.silent".format(self.WORK_DIR, seq, str(num) + "a_8_minimized")):
trg = True
if not trg:
return
# Generating the random peptides
if not os.path.isdir("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}".format(seq)):
shutil.copytree("/dlab/home/gerdos/bin/flexiblemeccano/simulations/sample", "/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}".format(seq))
info_text = """Number_of_Amino_Acids 8
Sequence_File /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Input/sequence.txt
Contact_Restriction no
Contacts_File /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Input/contacts.txt
Number_of_Conformers 100
Output_Directory /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}
J_Calculation no
PRE_Calculation no
RDC_Calculation yes
Phi/Psi_Database /dlab/home/gerdos/bin/flexiblemeccano/database/database_options/phi_psi.txt
Print_PDB yes
J_Coupling - - -
Number_of_Cysteines -
Cysteine_Positions -
Dynamic -
Proton -
Proton_Frequency - -
Intensity -
Intrinsic_Linewidth_Proton -
Global_Tensor yes
N_HN yes
CA_HA yes
CA_CO yes
N_CO yes
HN_CO yes""".format(seq)
sequence_text = ""
for num, j in enumerate(list(seq)):
sequence_text += "{} {} 0 0.00 0.00 0.00 0.00\n".format(num + 1, j)
with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Input/info.txt".format(seq), "w") as file_name:
file_name.write(info_text)
with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Input/sequence.txt".format(seq), "w") as file_name:
file_name.write(sequence_text)
proc = subprocess.Popen(
'/dlab/home/gerdos/bin/flexiblemeccano/applications/flexible_meccano ~/bin/flexiblemeccano/simulations/{}/ /usr/bin/gnuplot'.format(seq), shell=True,
stdout=subprocess.PIPE)
res, err = proc.communicate()
if err:
sys.exit(err.decode())
# PDB files generated by FM have a mistake, correcting it
for num in range(100):
with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Output/PDB/{}a_8.pdb".format(seq, num), "r") as file_name:
pseu_str = file_name.readlines()
for j, line in enumerate(pseu_str):
if line.startswith("ATOM"):
if int(line.split()[1]) >= 54:
line = list(line)
line.insert(11, " ")
pseu_str[j] = "".join(line)
with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Output/PDB/{}a_8.pdb".format(seq, num), "w") as file_name:
file_name.write("".join(pseu_str))
if not os.path.isdir('{}/{}/RANDOM_LIGANDS'.format(self.WORK_DIR, seq)):
shutil.copytree("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Output/PDB".format(seq), "{}/{}/RANDOM_LIGANDS".format(self.WORK_DIR, seq))
# Simulation
for num in range(100):
if not os.path.isfile("{}/{}/RANDOM_LIGANDS/{}.silent".format(self.WORK_DIR, seq, str(num) + "a_8_minimized")):
proc = subprocess.Popen(
'{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}/{}/RANDOM_LIGANDS/{}.silent -out:file:silent_struct_type binary -s {}/{}/RANDOM_LIGANDS/{} -score:weights weights.wts'.format(
self.ROSETTA_DIR, self.WORK_DIR, seq, str(num) + "a_8_minimized", self.WORK_DIR, seq, str(num) + "a_8.pdb"), shell=True, stdout=subprocess.PIPE)
out, err = proc.communicate()
if err:
sys.exit(err.decode())
# print("Random ligand generation with FlexibleMeccano is done in {:.2f} minutes!".format((time.time() - tm) / 60))
return
def complex_minimization(self):
# extract the docked PDBs
if not os.path.isdir('COMPLEX_STRUCTURES'):
os.mkdir('COMPLEX_STRUCTURES')
os.chdir('COMPLEX_STRUCTURES')
shutil.copy('../weights.wts', '.')
# Check id PDBs have been extracted before
tgl = False
for num in range(100):
if not os.path.isfile('prepacked_{:04d}.pdb'.format(num + 1)):
tgl = True
if tgl:
proc = subprocess.Popen(
'{}/source/bin/extract_pdbs.linuxgccrelease -in:file:silent ../docking_results.silent'.format(
self.ROSETTA_DIR), shell=True, stdout=subprocess.PIPE)
proc.communicate()
# Minimizing the complexes
for num in range(100):
if not os.path.isfile("{}.silent".format('prepacked_{:04d}_minimized'.format(num + 1))):
proc = subprocess.Popen(
'{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}.silent -out:file:silent_struct_type binary -s {} -score:weights weights.wts'.format(
self.ROSETTA_DIR, 'prepacked_{:04d}_minimized'.format(num + 1), 'prepacked_{:04d}.pdb'.format(num + 1)), shell=True, stdout=subprocess.PIPE)
out, err = proc.communicate()
if err:
sys.exit(err.decode())
def read_energies(self, motif):
# tm = time.time()
os.chdir("{}/{}".format(self.WORK_DIR, motif))
complx_ener = pd.DataFrame()
for num in range(100):
complx_ener = complx_ener.append(energy('COMPLEX_STRUCTURES/prepacked_{:04d}_minimized.silent'.format(num + 1))[self.ENERGY_ATTRIBUTES])
lig_ener = pd.DataFrame()
for num in range(100):
lig_ener = lig_ener.append(energy('RANDOM_LIGANDS/{}.silent'.format(str(num) + "a_8_minimized"))[self.ENERGY_ATTRIBUTES])
# print("Read energies for {} in {:.2f} minutes!".format(motif, (time.time() - tm) / 60))
return motif, complx_ener, lig_ener
@staticmethod
def sigmoid(x):
# return x
# return math.log(x, math.e)
return -math.tanh(0.15 * (math.log(x, math.e) - math.log(1e-4, math.e)))
def minimizing_func(self, weights_lst, prt=False):
# if not prt:
# print(weights_lst)
res = 0
# Energy DataFarmes should not me modified globally
domain_ener = self.DOMAIN_ENERGY.copy()
# Domain partial function calculation, independent of peptide
domain_partial_function = np.float128()
for num, j in enumerate(self.ENERGY_ATTRIBUTES):
domain_ener.loc[:, j] *= weights_lst[num]
for num, j in domain_ener.iterrows():
domain_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
# Ligand and complex partial functions are depending on the peptide
for inner_key, kd_experimental in self.PEPTIDE_KD.items():
# Copy the pd object, as it is immutable
complex_ener = self.COMPLEX_ENERGY[inner_key].copy(deep=True)
ligand_ener = self.LIGAND_ENERGY[inner_key].copy(deep=True)
# Weighting the energies
for num, j in enumerate(self.ENERGY_ATTRIBUTES):
complex_ener.loc[:, j] *= weights_lst[num]
ligand_ener.loc[:, j] *= weights_lst[num]
# Complex partial function
complex_partial_function = np.float128()
for num, j in complex_ener.iterrows():
complex_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
# Ligand partial function
ligand_partial_function = np.float128()
for num, j in ligand_ener.iterrows():
ligand_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
try:
res += (self.sigmoid(ligand_partial_function * domain_partial_function / complex_partial_function) - self.sigmoid(kd_experimental)) ** 2
except ValueError:
return np.NaN
if prt:
print('{}\t{:.4e}\t{:.4e}\t{:.4e}'.format(inner_key, ligand_partial_function * domain_partial_function / complex_partial_function, kd_experimental,
abs(math.log(ligand_partial_function * domain_partial_function / complex_partial_function, 10) - math.log(kd_experimental,
10))))
print('\rDifference: {:.4f}'.format(res),end='')
return res
def main_func(self, seq):
# Creating the directory isf not exists, and go in there
if not os.path.isdir("{}/{}".format(self.WORK_DIR, seq)):
os.mkdir("{}/{}".format(self.WORK_DIR, seq))
os.chdir("{}/{}".format(self.WORK_DIR, seq))
shutil.copy('{}/weights.wts'.format(self.WORK_DIR), '.')
self.mutate_ligand(seq)
self.prepack(seq)
self.dock(seq)
# Ligand generation with FlexPepDock
# random_ligand_generation()
# Ligand generation with FlexibleMeccano
self.ligand_prep_fm(seq)
self.complex_minimization()
return seq
# def plotData(var, x, y, color1, u, v, color2):
# pearR = np.corrcoef(x, y)[1, 0]
# # least squares from:
# # http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html
# A = np.vstack([x, np.ones(len(x))]).T
# m, c = np.linalg.lstsq(A, np.array(y))[0]
# plt.figure(1)
# plt.subplot(211)
# # plt.xlim([min(x) * 1.1, max(x) * 0.9])
# plt.ylim([-8, -2])
# plt.scatter(x, y, label='Regi', color=color1)
# plt.legend(loc=2)
# plt.xlabel("log10(Kd)")
# plt.ylabel(var)
# plt.subplot(212)
# plt.scatter(u, v, label='Uj', color=color2)
# # plt.plot(x, x * m + c, color=color,
# # label="R = %6.2e" % (pearR))
# plt.xlabel("log10(Kd)")
# plt.ylabel(var)
# # plt.xlim([min(x) * 1.1, max(x) * 0.9])
# plt.ylim([-8, -2])
# plt.legend(loc=2)
# plt.show()
INITIAL_WEIGHTS = {
'fa_atr': 1,
'fa_rep': 0.55,
'fa_sol': 0.9375,
'fa_intra_rep': 0.005,
'fa_elec': 0.875,
'pro_close': 0,
'hbond_sr_bb': 1.17,
'hbond_lr_bb': 1.17,
'hbond_bb_sc': 1.17,
'hbond_sc': 1.1,
'dslf_fa13': 0,
'rama': 0.25,
'omega': 0.625,
'fa_dun': 0.7,
'p_aa_pp': 0.4,
'yhh_planarity': 0,
'ref': 0
}
new_energies = INITIAL_WEIGHTS
p = multiprocessing.Pool(processes=multiprocessing.cpu_count()-1)
print('Found {} cores to use'.format(multiprocessing.cpu_count() - 1))
START_POINT = 7
if os.path.isfile(('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(START_POINT - 1))):
with open('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(START_POINT - 1)) as fh:
for line in fh:
new_energies[line.split()[0]] = float(line.split()[1].strip())
for i in range(START_POINT, 50):
try:
os.mkdir('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}'.format(i))
except FileExistsError:
pass
print('Running the inner cycle with weights:')
for key, val in sorted(new_energies.items()):
print('{}\t{}'.format(key, val))
print('----------------')
if not os.path.isfile('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(i)):
with open('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(i), 'w') as fn:
for key, val in new_energies.items():
fn.write("{} {}".format(key, val))
fn.write('\n')
returned_ener = InnerCycle('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}'.format(i)).get_result()
for key, val in returned_ener.items():
new_energies[key] = (1 / (i + 1) * returned_ener[key]) + (new_energies[key] * (1 - (1 / (i + 1))))
print('----------------')
print('Cycle {} complete'.format(i))
print('----------------')
gc.collect()
我知道它很长,而且大多数人都没有时间来调试我的失败。我是一名生物学家,而且我还没有很好的编程技巧。
我不明白为什么我的内存不足。我该如何调试这样的错误?
简而言之,代码运行如下:
multiprocessing
)multiprocessing
)scipy
谢谢!