我有一个简单的函数,用于扫描文件中的特殊字符串;但由于这些文件位于速度很慢的远程文件存储上,我需要并行扫描它们。
我想我需要使用多进程(multiprocessing),但不确定如何正确地实现。
这是我的函数:
from fnmatch import fnmatch
import os
from shutil import copy
from pprint import pprint
def getFailedFile(directory_name, folder_to_write):
    """Copy the request file of every failed response to a folder under cwd.

    Scans ``directory_name`` (non-recursively) for files matching
    ``*Response.txt``.  When a response file contains the text
    ``Exception``, the sibling file whose name has ``Response`` replaced by
    ``Request`` is copied into ``folder_to_write``.

    Args:
        directory_name: Directory to scan.  A trailing path separator is
            optional.
        folder_to_write: Destination folder, resolved relative to the
            current working directory.  A leading path separator is
            optional (``'out'`` and ``'/out'`` mean the same folder).

    Returns:
        None.  Response files that are not valid UTF-8 and missing request
        files are reported on stdout and skipped.
    """
    # Strip leading separators before joining: os.path.join would otherwise
    # treat '/out' as an absolute path and discard the cwd prefix.
    destination = os.path.join(os.getcwd(), folder_to_write.lstrip('/\\'))

    for file in os.listdir(directory_name):
        if not fnmatch(file, '*Response.txt'):
            continue

        # os.path.join inserts the separator that plain '+' silently omitted
        # whenever directory_name had no trailing slash.
        filename = os.path.join(directory_name, file)
        try:
            with open(filename, 'r', encoding='utf-8') as myfile:
                data = myfile.read()
        except UnicodeDecodeError:
            print('error unicode decode -', filename)
            continue

        if 'Exception' not in data:
            continue

        requestFile = os.path.join(
            directory_name, file.replace('Response', 'Request')
        )
        try:
            copy(requestFile, destination)
        except FileNotFoundError:
            print('no such file - ', requestFile)
# Placeholder locations; replace with the real source and destination folders.
directory_name = 'some folder'
folder_to_write = 'some folder_to_write'
# Both arguments are passed by keyword: a positional argument may not follow
# a keyword argument (the original call was a SyntaxError).
getFailedFile(directory_name=directory_name, folder_to_write=folder_to_write)
请帮帮我。由于目标文件夹中的文件数量很多,目前扫描一次大约需要 4 个小时。
答案 0 :(得分:0)
最后弄清楚了该怎么做:
from fnmatch import fnmatch
import os
from shutil import copy
from multiprocessing import Pool
import time
import logging
def process_file(file, directory_name='directory with files',
                 destination='directory to write'):
    """Copy a failed response file and its request file to ``destination``.

    Designed to be used as a ``Pool.map`` worker: it receives a single file
    name and does nothing unless the name matches ``*Response.txt`` and the
    file content contains ``xception`` (matching both ``Exception`` and
    ``exception``).

    Args:
        file: Bare file name (no directory part) inside ``directory_name``.
        directory_name: Directory that holds the response/request files.
        destination: Directory the matching request and response files are
            copied into.

    Returns:
        None.  Undecodable files and copy failures are printed and logged
        at INFO level; they never propagate to the caller.
    """
    if not fnmatch(file, '*Response.txt'):
        return

    # os.path.join supplies the separator that plain '+' omitted.
    filename = os.path.join(directory_name, file)
    try:
        with open(filename, 'r', encoding='utf-8') as myfile:
            data = myfile.read()
    except UnicodeDecodeError:
        print('error unicode decode -', filename)
        logging.info('error unicode decode -' + str(filename) + '\n')
        return

    if 'xception' not in data:
        return

    requestFile = os.path.join(
        directory_name, file.replace('Response', 'Request')
    )
    try:
        # The request is copied first; if it is missing the response is not
        # copied either, as before.
        copy(requestFile, destination)
        copy(filename, destination)
    except FileNotFoundError as e:
        # Handled before the generic clause: previously a broad inner
        # `except Exception` made this branch unreachable.
        missing = e.filename or requestFile
        print('no such file - ', missing)
        logging.info('no such file - ' + str(missing) + '\n')
    except Exception as e:
        # Best-effort worker: one bad file must not abort the whole scan.
        logging.info(str(e) + '\n')
        print(str(e))
if __name__ == '__main__':
    # Script entry point: scan every file of the directory in parallel.
    from functools import partial

    # The root logger defaults to WARNING, so without this every
    # logging.info() call below would be silently discarded.
    configure_logging = partial(logging.basicConfig, level=logging.INFO)
    configure_logging()
    try:
        directory_name = 'directory with files'
        # The work is dominated by waiting on slow remote storage, so far
        # more workers than CPU cores are used.
        number_of_processes = 50
        logging.info('\n' + 'Number of processes - ' + str(number_of_processes))
        logging.info('Directory to scan ' + directory_name)
        # The initializer configures logging inside each worker too; worker
        # processes started with the "spawn" method do not inherit the
        # parent's logging setup.  The context manager releases the worker
        # processes even when map() raises.
        with Pool(number_of_processes, initializer=configure_logging) as pool:
            start_time = time.time()
            # map() blocks until every file has been processed.
            pool.map(process_file, os.listdir(directory_name))
            elapsed_time = time.time() - start_time
        logging.info('Elapsed time - ' + str(elapsed_time / 60) + '\n')
    except Exception as e:
        # Top-level boundary: record the failure with its traceback rather
        # than hiding it at INFO level.
        logging.exception(str(e) + '\n')
我知道代码不是很漂亮,但它确实可用:现在只需要 27 分钟,而不是之前那么长的时间。