使用多进程(multiprocessing)并行读取多个文件

时间:2018-11-06 14:06:36

标签: python-3.x multiprocessing

我有一个简单的函数,用来扫描文件中是否包含特定的字符串。但是这些文件位于速度很慢的远程文件存储上,因此我需要并行扫描它们。

我想我需要使用多进程(multiprocessing),但是我不确定怎样做才是正确的。

这是我的函数:

from fnmatch import fnmatch
import os
from shutil import copy
from pprint import pprint

def getFailedFile(directory_name, folder_to_write):
    """Copy the request file of every failed response found in a directory.

    Scans ``directory_name`` (non-recursively) for files whose name matches
    ``*Response.txt``. When such a file contains the text ``Exception``, the
    sibling file whose name has ``Response`` replaced by ``Request`` is copied
    into ``folder_to_write`` below the current working directory.

    Problems with a single file (a response that is not valid UTF-8, or a
    missing request file) are printed and the scan continues with the next
    file.

    Args:
        directory_name: Directory to scan. A trailing path separator is
            optional.
        folder_to_write: Destination folder, relative to the current working
            directory. Leading path separators are ignored.
    """
    # Strip leading separators so that os.path.join cannot treat the folder as
    # an absolute path and drop the working directory.
    destination = os.path.join(os.getcwd(), folder_to_write.lstrip('/\\'))

    for file in os.listdir(directory_name):
        if not fnmatch(file, '*Response.txt'):
            continue

        filename = os.path.join(directory_name, file)
        try:
            with open(filename, 'r', encoding='utf-8') as myfile:
                data = myfile.read()
        except UnicodeDecodeError:
            print('error unicode decode -', filename)
            continue

        if 'Exception' not in data:
            continue

        requestFile = os.path.join(
            directory_name, file.replace('Response', 'Request'))
        try:
            copy(requestFile, destination)
        except FileNotFoundError:
            print('no such file - ', requestFile)

# Placeholder paths: the directory to scan and the destination folder
# (the latter is resolved relative to the current working directory).
directory_name = 'some folder'
folder_to_write = 'some folder_to_write'
# Both arguments are passed by keyword; a positional argument placed after a
# keyword argument is a SyntaxError.
getFailedFile(directory_name=directory_name, folder_to_write=folder_to_write)

请帮帮我。由于目标文件夹中的文件数量很多,目前扫描一遍大约需要4个小时。

1 个答案:

答案 0 :(得分:0)

最后弄清楚了该怎么做:

from fnmatch import fnmatch
import os
from shutil import copy
from multiprocessing import Pool
import time
import logging

def process_file(file, directory_name='directory with files',
                 target_directory='directory to write'):
    """Copy a failed response file and its request file to another directory.

    Designed as the worker function for ``Pool.map``: it receives one bare
    file name and handles only that file. Names that do not match
    ``*Response.txt`` are ignored. A matching file is read as UTF-8 and, if
    its content contains ``xception`` (which matches both ``Exception`` and
    ``exception``), the corresponding ``Request`` file and the response file
    itself are copied to ``target_directory``.

    Problems are printed and also logged at INFO level (visible only if
    logging is configured to emit INFO records); they never propagate, so one
    bad file does not abort the whole pool run.

    Args:
        file: Bare file name (no directory part) inside ``directory_name``.
        directory_name: Directory that contains ``file``. A trailing path
            separator is optional.
        target_directory: Directory the matching files are copied to.
    """
    if not fnmatch(file, '*Response.txt'):
        return

    filename = os.path.join(directory_name, file)
    try:
        with open(filename, 'r', encoding='utf-8') as myfile:
            data = myfile.read()
    except UnicodeDecodeError:
        print('error unicode decode -', filename)
        logging.info('error unicode decode -' + str(filename) + '\n')
        return

    if 'xception' not in data:
        return

    requestFile = os.path.join(
        directory_name, file.replace('Response', 'Request'))
    try:
        # The request is copied first; if that fails the response is skipped
        # too, so the target never receives a response without its request.
        copy(requestFile, target_directory)
        copy(filename, target_directory)
    except FileNotFoundError as e:
        # Report the path the OS actually complained about when available.
        missing = e.filename or requestFile
        print('no such file - ', missing)
        logging.info('no such file - ' + str(missing) + '\n')
    except OSError as e:
        logging.info(str(e) + '\n')
        print(str(e))

if __name__ == '__main__':
    # The default logging configuration drops INFO records, so without this
    # call none of the messages below would ever be shown.
    logging.basicConfig(level=logging.INFO)
    try:
        # NOTE: process_file carries its own copy of this directory name;
        # keep the two in sync.
        directory_name = 'directory with files'
        # Far more workers than CPU cores: per the description the job is
        # dominated by waiting on slow remote storage, not by computation.
        number_of_processes = 50

        logging.info('\n' + 'Number of processes - ' + str(number_of_processes))
        logging.info('Directory to scan ' + directory_name)

        # The with-block guarantees the worker processes are released even if
        # map() raises. map() itself blocks until every file name is handled.
        with Pool(number_of_processes) as pool:
            start_time = time.time()
            pool.map(process_file, os.listdir(directory_name))
            elapsed_time = time.time() - start_time

        # Elapsed time is reported in minutes.
        logging.info('Elapsed time - ' + str(elapsed_time / 60) + '\n')
    except Exception:
        # Top-level boundary: record the failure with its traceback instead of
        # letting it vanish in an INFO record.
        logging.exception('Scan failed')
我知道代码不是很漂亮,但它可以正常工作,而且只需要运行27分钟,而不是之前的大约4个小时。