如何加速用.py脚本处理数百万个文件?

时间:2016-02-07 21:27:45

标签: python bash

我有大约2.5M文件要使用.py脚本处理。

我使用的是超级计算机,所以问题不在于它的性能,而在于python进程每处理一个文件都要启动和关闭一次,这会耗费大量时间。

我使用循环来处理文件夹中每个需要用脚本转换的文件。其中${line}取自一个列表文件,该文件的每一行对应文件夹中的一个文件。

有没有办法只启动一次.py脚本就处理所有文件,而不是循环调用python脚本?

这是我的循环代码:

### LOOP ###
# Invokes script.py once per ligand, so a fresh Python interpreter is started
# and torn down for every single file.
# NOTE(review): `while :` loops forever and ${line} is never assigned in this
# excerpt -- presumably the real script reads it from the file list (e.g.
# `while read line; do ... done < list`); confirm against the full script.
while :
do
# -l: input ligand file, -U '': cleanup type set to the empty string,
# -A hydrogens: type of repair to make.
pythonsh ${RAMDISK}/script.py -l ${RAMDISK}/${line}.pdb -U '' -A hydrogens

done

# Only reached if the loop above terminates.
exit

这个python脚本只是一个将.pdb文件转换为.pdbqt文件的工具,来自Autodock4附带的AutodockTools。

1 个答案:

答案 0 :(得分:1)

我修改了脚本,添加了-i命令行选项。

这将允许您指定包含配体文件名(每行一个)的文本文件,并在不重新启动Python的情况下处理它们。

您现在应该可以这样调用它:

pythonsh ./newscript.py -i ./list_of_files.txt -U '' -A hydrogens

注意:这是未经测试的!它应该按照给定的方式工作,但要非常小心!

#!/usr/bin/env python
#
# 
#
# $Header: /opt/cvs/python/packages/share1.5/AutoDockTools/Utilities24/prepare_ligand4.py,v 1.5.4.1 2009/04/15 17:41:57 rhuey Exp $
#
# Modified 2016/02/07
# Hugh Bothwell  http://stackoverflow.com/users/33258
# Added -i commandline option to process multiple files
#
import os 

from MolKit import Read
from AutoDockTools.MoleculePreparation import AD4LigandPreparation

# Default values for the command-line parameters.  These are module-level
# globals: the option loop in the __main__ section rebinds them and
# process_file() reads them, so the names must stay as they are.
#-l: a single ligand file to process
ligand_filename =  None
#-i: text file containing ligand filenames, one per line
ligand_listfile =  None
# optional parameters
#-v: verbose output
verbose = None
# NOTE(review): add_bonds is not referenced anywhere else in this script.
add_bonds = False
#-A: repairs to make: add bonds and/or hydrogens (default: no repairs)
repairs = ""
#-C: given on the command line sets this to None (do not add charges);
#    default: add gasteiger charges
charges_to_add = 'gasteiger'
#-p: preserve charges on specific atom types; each -p appends "<type>,"
preserve_charge_types=''
#-U: cleanup by merging nphs_lps, nphs, lps
cleanup  = "nphs_lps"
#-B: named rotatable bond type(s) to allow to rotate
#allowed_bonds = ""
allowed_bonds = "backbone"
#-R: root
root = 'auto'
#-o: output filename (None leaves the choice to AD4LigandPreparation)
outputfilename = None
#-F: check_for_fragments
check_for_fragments = False
#-I: bonds_to_inactivate
bonds_to_inactivate = ""
#-Z: inactivate_all_torsions
inactivate_all_torsions = False
#-g: attach_nonbonded_fragments
attach_nonbonded_fragments = False
#-M: mode
mode = 'automatic'
#-d: dictionary
# NOTE: this name shadows the builtin dict; it is kept because the rest of
# the script refers to it by this name.
dict = None

def process_file(fname):
    mols = Read(fname)
    if verbose: print 'read ', fname
    mol = mols[0]
    if len(mols)>1:
        if verbose: 
            print "more than one molecule in file"
        #use the one molecule with the most atoms
        ctr = 1
        for m in mols[1:]:
            ctr += 1
            if len(m.allAtoms)>len(mol.allAtoms):
                mol = m
                if verbose:
                    print "mol set to ", ctr, "th molecule with", len(mol.allAtoms), "atoms"
    coord_dict = {}
    for a in mol.allAtoms: coord_dict[a] = a.coords

    mol.buildBondsByDistance()
    if charges_to_add is not None:
        preserved = {}
        preserved_types = preserve_charge_types.split(',') 
        for t in preserved_types:
            if not len(t): continue
            ats = mol.allAtoms.get(lambda x: x.autodock_element==t)
            for a in ats:
                if a.chargeSet is not None:
                    preserved[a] = [a.chargeSet, a.charge]

    if verbose:
        print "setting up LPO with mode=", mode,
        print "and outputfilename= ", outputfilename
        print "and check_for_fragments=", check_for_fragments
        print "and bonds_to_inactivate=", bonds_to_inactivate
    LPO = AD4LigandPreparation(mol, mode, repairs, charges_to_add, 
                            cleanup, allowed_bonds, root, 
                            outputfilename=outputfilename,
                            dict=dict, check_for_fragments=check_for_fragments,
                            bonds_to_inactivate=bonds_to_inactivate, 
                            inactivate_all_torsions=inactivate_all_torsions,
                            attach_nonbonded_fragments=attach_nonbonded_fragments)
    #do something about atoms with too many bonds (?)
    #FIX THIS: could be peptide ligand (???)
    #          ??use isPeptide to decide chargeSet??
    if charges_to_add is not None:
        #restore any previous charges
        for atom, chargeList in preserved.items():
            atom._charges[chargeList[0]] = chargeList[1]
            atom.chargeSet = chargeList[0]
    if verbose: print "returning ", mol.returnCode 
    bad_list = []
    for a in mol.allAtoms:
        if a.coords!=coord_dict[a]: bad_list.append(a)
    if len(bad_list):
        print len(bad_list), ' atom coordinates changed!'    
        for a in bad_list:
            print a.name, ":", coord_dict[a], ' -> ', a.coords
    else:
        if verbose: print "No change in atomic coordinates"
    if mol.returnCode != 0: 
        sys.stderr.write(mol.returnMsg + "\n")
    # sys.exit(mol.returnCode)

if __name__ == '__main__':
    import sys
    import getopt

    def usage():
        "Print helpful, accurate usage statement to stdout."
        print "Usage: prepare_ligand4.py -l filename"
        print
        print "    Description of command..."
        print "         -l     ligand_filename       (.pdb or .mol2 or .pdbq format)"
        print "         -i     list_of_filenames.txt (.pdb or .mol2 or .pdbq format)"
        print "    Optional parameters:"
        print "        [-v]    verbose output"
        print "        [-o pdbqt_filename] (default output filename is ligand_filename_stem + .pdbqt)"
        print "        [-d]    dictionary to write types list and number of active torsions "

        print "        [-A]    type(s) of repairs to make:\n\t\t bonds_hydrogens, bonds, hydrogens (default is to do no repairs)"
        print "        [-C]    do not add charges (default is to add gasteiger charges)"
        print "        [-p]    preserve input charges on atom type, eg -p Zn"
        print "               (default is not to preserve charges on any specific atom type)"
        print "        [-U]    cleanup type:\n\t\t nphs_lps, nphs, lps, '' (default is 'nphs_lps') "
        print "        [-B]    type(s) of bonds to allow to rotate "
        print "               (default sets 'backbone' rotatable and 'amide' + 'guanidinium' non-rotatable)"
        print "        [-R]    index for root"
        print "        [-F]    check for and use largest non-bonded fragment (default is not to do this)"
        print "        [-M]    interactive (default is automatic output)"
        print "        [-I]    string of bonds to inactivate composed of "
        print "                   of zero-based atom indices eg 5_13_2_10  "
        print "                   will inactivate atoms[5]-atoms[13] bond "
        print "                               and atoms[2]-atoms[10] bond "
        print "                      (default is not to inactivate any specific bonds)"
        print "        [-Z]    inactivate all active torsions     "
        print "                      (default is leave all rotatable active except amide and guanidinium)"
        print "        [-g]    attach all nonbonded fragments "
        print "                      (default is not to do this)"

    # process command arguments
    try:
        opt_list, args = getopt.getopt(sys.argv[1:], 'l:i:vo:d:A:Cp:U:B:R:MFI:Zgh')
    except getopt.GetoptError, msg:
        print 'prepare_ligand4.py: %s' %msg
        usage()
        sys.exit(2)

    #'l:vo:d:A:CKU:B:R:MFI:Zg'
    for o, a in opt_list:
        #print "o=", o, " a=", a
        if o in ('-l', '--l'):
            ligand_filename = a
            if verbose: print 'set ligand_filename to ', a
        if o in ('-i', '--i'):
            ligand_listfile = a
            if verbose: print 'set ligand_listfile to ', a
        if o in ('-v', '--v'):
            verbose = True
            if verbose: print 'set verbose to ', True
        if o in ('-o', '--o'):
            outputfilename = a
            if verbose: print 'set outputfilename to ', a
        if o in ('-d', '--d'):
            dict = a
            if verbose: print 'set dict to ', a
        if o in ('-A', '--A'):
            repairs = a
            if verbose: print 'set repairs to ', a
        if o in ('-C', '--C'):
            charges_to_add = None
            if verbose: print 'do not add charges'
        if o in ('-p', '--p'):
            preserve_charge_types+=a
            preserve_charge_types+=','
            if verbose: print 'preserve initial charges on ', preserve_charge_types
        if o in ('-U', '--U'):
            cleanup  = a
            if verbose: print 'set cleanup to merge ', a
        if o in ('-B', '--B'):
            allowed_bonds = a
            if verbose: print 'allow ', a, 'bonds set to rotate'
        if o in ('-R', '--R'):
            root = a
            if verbose: print 'set root to ', root
        if o in ('-F', '--F'):
            check_for_fragments = True
            if verbose: print 'set check_for_fragments to True'
        if o in ('-M', '--M'):
            mode = a
            if verbose: print 'set mode to ', a
        if o in ('-I', '--I'):
            bonds_to_inactivate = a
            if verbose: print 'set bonds_to_inactivate to ', a
        if o in ('-Z', '--Z'):
            inactivate_all_torsions = True
            if verbose: print 'set inactivate_all_torsions to ', inactivate_all_torsions
        if o in ('-g', '--g'):
            attach_nonbonded_fragments = True
            if verbose: print 'set attach_nonbonded_fragments to ', attach_nonbonded_fragments
        if o in ('-h', '--'):
            usage()
            sys.exit()

    if ligand_filename:
        process_file(ligand_filename)
    elif ligand_listfile:
        # Python 2.5 does not support `with`
        # with open(ligand_listfile) as inf:
        #     for fname in inf:
        #         process_file(fname.rstrip())
        inf = open(ligand_listfile)
        for fname in inf:
            process_file(fname.rstrip())
        inf.close()
    else:
        print 'prepare_ligand4: either -l (ligand filename) or -i (ligand listfile) must be specified.'
        usage()
        sys.exit()

# To execute this command type:
# prepare_ligand4.py -l pdb_file -v