Need help trying to get a Python multiprocessing pool working

Asked: 2015-02-06 14:48:16

Tags: python pyodbc python-multiprocessing

I have a database table that I am reading rows from (~360k rows in this example), putting the pyodbc.Row objects into a list for later use, and then writing them out with this script:

from hashlib import sha1
import multiprocessing
import datetime, os
import pyodbc 
import math
import traceback, sys
source_rows = []
processors = 8

def split(a, n):
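    # Yield n contiguous slices of a whose lengths differ by at most one.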
    k, m = len(a) / n, len(a) % n
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n))

def sqn(*args):
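    # SHA-1 of the upper-cased, stripped arguments joined with a sentinel string.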
    return sha1('}#EUCLID#{'.join([str(arg.upper().strip()) for arg in args]).encode()).hexdigest().upper()

def sDB_read():
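    # Read every row of the source table into a list of {column: value} dicts.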
    t1 = datetime.datetime.now()
    #Initialize Source Database
    src_driver='{SQL Server Native Client 11.0}'
    src_server='SCCMSV2SS010'
    src_database='STAGE'
    src_trusted_conn='yes'
    src_uid = 'myUserID'
    src_pwd = 'myPwd'
    if src_trusted_conn == 'yes':
        src_conn_str = 'Driver={};Server={};Database={};Trusted_Connection={};PacketSize=32767;'.format(src_driver, src_server, src_database, src_trusted_conn)
    else:
        src_conn_str = 'Driver={};Server={};Database={};UID={};PWD={};PacketSize=32767;'.format(src_driver, src_server, src_database, src_uid, src_pwd)
    sql = 'SELECT [AgentID], [BootDevice00], [BuildNumber00], [BuildType00], [Caption00], [CodeSet00], [CountryCode00], [CSDVersion00], [CurrentTimeZone00], [Debug00], [Description00], [Distributed00], [ForegroundApplicationBoost00], [FreePhysicalMemory00], [FreeSpaceInPagingFiles00], [FreeVirtualMemory00], [InstallDate00], [InstanceKey], [LastBootUpTime00], [LocalDateTime00], [Locale00], [MachineID], [Manufacturer00], [MaxNumberOfProcesses00], [MaxProcessMemorySize00], [Name00], [NumberOfLicensedUsers00], [NumberOfProcesses00], [NumberOfUsers00], [OperatingSystemSKU00], [Organization00], [OSArchitecture00], [OSLanguage00], [OSProductSuite00], [OSType00], [OtherTypeDescription00], [PlusProductID00], [PlusVersionNumber00], [Primary00], [ProductType00], [RegisteredUser00], [RevisionID], [rowversion], [SerialNumber00], [ServicePackMajorVersion00], [ServicePackMinorVersion00], [SizeStoredInPagingFiles00], [Status00], [SystemDevice00], [SystemDirectory00], [TimeKey], [TotalSwapSpaceSize00], [TotalVirtualMemorySize00], [TotalVisibleMemorySize00], [Version00], [WindowsDirectory00] FROM [STAGE].[dbo].[Operating_System_DATA]'
    src_db_conn = pyodbc.connect(src_conn_str)
    src_db_conn.autocommit = False
    src_db_cursor = src_db_conn.cursor()    
    src_db_cursor.execute( sql )    
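    # Convert each pyodbc Row into a plain dict keyed by column name.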
    source_rows = [ {c[0]: v for (c, v) in zip(row.cursor_description, row)} for row in src_db_cursor.fetchall() ]
    t2 = datetime.datetime.now()
    print('\nWe Read ' + str(len(source_rows)) + ' rows in ' + str( t2 - t1 ))
    return source_rows

def tDB_write():    
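    # Insert the rows held in the global source_rows into the target table.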
    print('\nPOOL: Received ' + str(len( source_rows )) + ' rows to work on')
    t1 = datetime.datetime.now()
    #Initialize Target Database    
    targ_driver='{SQL Server Native Client 11.0}'
    targ_server='SCCMSV2SS010'
    targ_database='STAGE'
    targ_trusted_conn='yes'
    targ_uid = 'myUserID'
    targ_pwd = 'myPwd'
    if targ_trusted_conn == 'yes':
        targ_conn_str = 'Driver={};Server={};Database={};Trusted_Connection={};PacketSize=32767;'.format(targ_driver, targ_server, targ_database, targ_trusted_conn)
    else:
        targ_conn_str = 'Driver={};Server={};Database={};UID={};PWD={};PacketSize=32767;'.format(targ_driver, targ_server, targ_database, targ_uid, targ_pwd)
    targ_db_conn = pyodbc.connect(targ_conn_str)
    targ_db_conn.autocommit = False
    targ_db_cursor = targ_db_conn.cursor()           
    table = 'Operating_System_DATA_INSERT'    
    for sourceitems in source_rows:         
        for source_row in sourceitems:
            try:
                columns = ', '.join( source_row.keys() )
                placeholders = ', '.join(['?'] * len( source_row ))
                sql = 'INSERT INTO {} ( {} ) VALUES ( {} )'.format( table, columns, placeholders )
                # Bind the row values as query parameters instead of eval()-ing
                # a string of Python code, which breaks on quotes in the data.
                targ_db_cursor.execute( sql, source_row.values() )
            except Exception, e:
                traceback.print_exc(file=sys.stdout)
    targ_db_conn.commit()
    t2 = datetime.datetime.now()
    print('\nWe Wrote ' + str(len(source_rows)) + ' rows in ' + str( t2 - t1 ))
    return

if __name__ == '__main__':    
    print( '\nStarting multiprocessing pool..' )
    pool = multiprocessing.Pool( processes = processors )
    source_rows = sDB_read()
    print( type(source_rows) )
    targetsize = len(source_rows)
    print( '\nProcessing ' + str(targetsize) + ' rows')
    chunksize = math.ceil(len(source_rows) / processors )    
    print( 'Splitting into ' + str(processors) + " chunks with " + str(chunksize) + ' rows each')
    source_rows = list(split( source_rows , processors ))
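    # source_rows is now a list of 8 sub-lists, one per intended worker.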
    write_pool_outputs = pool.map( tDB_write() , source_rows )
    print( '\nClosing multiprocessing pool..' )
    pool.close()
    pool.join() 

which produces the following output:

c:\Python27>.\python27.exe .\multitest.py
Starting multiprocessing pool..
We Read 361554 rows in 0:00:14.064000
<type 'list'>
Processing 361554 rows
Splitting into 8 chunks with 45194.0 rows each
POOL: Received 8 rows to work on

I think I'm unclear on how to split/chunk the list so that each worker gets an equal share of the rows. Whether I try to split the list manually or just pass the unmodified list straight to pool.map, only a single worker ever starts up and gets all the rows. Can someone educate me on the correct way to do this?

1 Answer:

Answer 0 (score: 4)

I think you may be missing the chunksize in the pool.map call:

 map(func, iterable[, chunksize])
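
As a minimal sketch of what that means (Python 2.7 to match the question; all names here are hypothetical, not taken from your script): the worker function is passed to map uncalled, each call of it receives exactly one element of the iterable, and chunksize only controls how many of those elements get shipped to a worker per dispatch.

import multiprocessing

def work(chunk):
    # Each call receives one element of the iterable; if the iterable is
    # a list of sub-lists, that element is one whole sub-list of rows.
    return len(chunk)

if __name__ == '__main__':
    rows = range(100)
    chunks = [rows[i::8] for i in xrange(8)]  # eight near-equal slices
    pool = multiprocessing.Pool(processes=8)
    # Pass the function itself (work), never work(): calling it here runs
    # it once in the parent and hands its return value to map instead.
    sizes = pool.map(work, chunks, chunksize=1)  # one sub-list per task
    pool.close()
    pool.join()
    print(sizes)  # [13, 13, 13, 13, 12, 12, 12, 12]

With chunksize=1 every worker pulls one task at a time; a larger chunksize batches several elements per dispatch, which cuts inter-process overhead when the iterable holds many small items.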

I explain chunksize here: Python: multiprocess workers, tracking tasks completed (missing completions)

Hope this helps.