Python multiprocessing: processing a list of input files, with error logging

Asked: 2016-08-17 12:04:17

Tags: python logging python-multiprocessing error-logging

I want to use Python multiprocessing to do the following:

  • process a long list of input files
  • include error logging
  • set a limit on the number of concurrent CPU cores (processes) in use

The Python logging cookbook has two good multiprocessing examples. In the code below I have adapted the second approach, which uses a multiprocessing.Queue ("logging in the main process, in a separate thread"). For my own benefit and for new users, I have added detailed comments and created example input and output files.

The problem I have is that the code iterates over the number of CPU cores, rather than over the number of items in the input list.

How can I apply the function to all of my input files, without exceeding the limit on the number of concurrent processes?

import json
import logging
import logging.config
import logging.handlers
import multiprocessing
import numpy as np
import os
import pandas as pd
import threading
import time

def create_10_infiles():
    """Creates 10 csv files with 4x4 array of floats, + occasional strings"""
    list_csv_in = []
    for i in range(1,11):
        csv_in = "{:02d}_in.csv".format(i)
        # create a 4 row, 4 column dataframe with random values centered around i
        df = pd.DataFrame(np.random.rand(16).reshape(4,4) * i)
        # add a string to one of the arrays (as a reason to need error logging)
        if i == 2 or i == 8:
            df.loc[2,2] = "Oops, array contains a string. Welcome to data science."
        # save to csv, and append filename to list of inputfiles
        df.to_csv(csv_in)
        list_csv_in.append(csv_in)
    return list_csv_in

def logger_thread(queue):
    """Listener process that logs output received from other processes?"""
    while True:
        record = queue.get()
        if record is None:
            break
        logger = logging.getLogger(record.name)
        logger.handle(record)

def worker_process(queue, infile):
    """Worker process that used to run tasks.
    Each process is isolated, so it starts by setting up logging."""
    # set up a handle to hold the logger output?
    queue_handle = logging.handlers.QueueHandler(queue)
    # creates a new logger called "process logger" (printed in each line)
    logger = logging.getLogger("process logger")
    # sets the logging level to DEBUG, so logger.info messages are printed.
    logger.setLevel(logging.DEBUG)
    # connects logger to handle defined above?
    logger.addHandler(queue_handle)
    # here you can run your desired program, in the hope that the time saved from parallel
    # processing is greater than the overhead of setting up all those processes and loggers:)
    normalise_array_to_mean_and_save(infile, logger)

def normalise_array_to_mean_and_save(csv_in, logger):
    """Opens csv with array, checks dtypes, calculates mean, saves output csv."""
    # check if file exists
    if os.path.isfile(csv_in):
        # open as pandas dataframe
        df = pd.read_csv(csv_in)
        # if none of the columns contain mixed datatypes (i.e, a string)
        if np.dtype('object') not in df.dtypes.tolist():
            # calc mean over whole dataframe
            mean = df.stack().mean()
            logger.info("{}, Mean = {:0.2f}".format(csv_in, mean))
            # normalise all values to mean. Save as "01_out.csv", "02_out.csv" etc
            df = df / mean
            csv_out = csv_in[:-6] + "out.csv"
            df.to_csv(csv_out)
        else:
            logger.info("{}, Mean not calculated. Non-float values found.".format(csv_in))

if __name__ == '__main__':
    os.chdir(r"D:\data")
    # import your favourite json logging settings (collapsed for brevity)
    logsettings = json.dumps({"version": 1, "root": {"handlers": ["console", "file"], "level": "DEBUG"}, "formatters": {"detailed": {"class": "logging.Formatter", "format": "%(asctime)s %(name)-15s %(levelname)-8s %(processName)-10s %(message)s"}}, "handlers": {"console": {"class": "logging.StreamHandler", "level": "DEBUG"}, "file": {"mode": "w", "formatter": "detailed", "class": "logging.FileHandler", "filename": "my_multiprocessing_logfile.log"}}})
    config = json.loads(logsettings)
    # replace default logfile with a filename containing the exact time
    config['handlers']['file']['filename'] = time.strftime("%Y%m%d_%H_%M_%S") + "_mp_logfile.txt"
    # load the logging settings
    logging.config.dictConfig(config)

    queue = multiprocessing.Queue()
    workers = []
    # set the number of concurrent processes created (i.e. CPU cores used)
    num_processes = 4

    # create 10 csv files with data, and return the list of filepaths
    list_10_infiles = create_10_infiles()

    # set up a process for each CPU core (e.g. 4)
    for i in range(num_processes):
        wp = multiprocessing.Process(target=worker_process,
                                     name='worker_{}'.format(i+1),
                                     args=(queue, list_10_infiles[i]))
        workers.append(wp)
        wp.start()

    # set up a thread as the logger_process
    logger_process = threading.Thread(target=logger_thread, args=(queue,))
    logger_process.start()

    #At this point, the main process could do some useful work of its own
    #Once it's done that, it can wait for the workers to terminate...
    for wp in workers:
        wp.join()

    # set logger for main process if desired
    root = logging.getLogger("main")
    root.setLevel(logging.DEBUG)
    logger = logging.getLogger("main logger")
    logger.info("CPUs used = {}/{}".format(num_processes, multiprocessing.cpu_count()))
    logger.info('Program is finished. All files analysed.')

    # And now tell the logging thread to finish up, too
    queue.put(None)
    logger_process.join()

Note: I have already tried splitting the list of input files into chunks, based on the number of CPU cores. This processed the files, but it was slow.
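
For reference, here is a minimal sketch of the kind of batch-wise chunking I mean, reusing worker_process and queue from the script above (process_files_in_batches is just an illustrative name, not part of the script):

def process_files_in_batches(queue, infiles, num_processes):
    """Starts num_processes workers at a time and waits for each batch to finish."""
    for start in range(0, len(infiles), num_processes):
        batch = infiles[start:start + num_processes]
        procs = []
        for i, infile in enumerate(batch):
            p = multiprocessing.Process(target=worker_process,
                                        name='worker_{}'.format(start + i + 1),
                                        args=(queue, infile))
            procs.append(p)
            p.start()
        # the whole batch must finish before the next one starts,
        # so every batch is held up by its slowest file
        for p in procs:
            p.join()

The join at the end of each batch is what makes this approach slow: idle cores sit and wait for the slowest file in the batch.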

1 Answer:

Answer 0 (score: 0):

I found that using a Python multiprocessing Pool, rather than a Queue, allowed me to process a long list of files while limiting the number of concurrent cores.

Although I could not get the logging setup to work with Pool, I found that the return values can be collected. Provided the code does not raise an exception, the return values can be written to the log once all of the files have been processed.

Perhaps someone can give me a more elegant solution, but for now this solves the problem.

from multiprocessing import Pool
from time import strftime
import logging

def function_to_process_files(file):
    #..check file integrity, etc. (file_gives_an_error is a placeholder for your own check)..
    if file_gives_an_error:
        return "{} file {} gave an error".format(strftime("%Y%m%d_%H_%M_%S"), file)
    #..do stuff without using the logging module..
    #.. for slow, irregular processes, printing to console is possible..
    return "{} file {} processed correctly".format(strftime("%Y%m%d_%H_%M_%S"), file)

if __name__ == "__main__":

    list_of_files_to_process = define_your_file_list_somehow()

    # set up regular logging to a file as desired, e.g. with basicConfig
    logging.basicConfig(filename=strftime("%Y%m%d_%H_%M_%S") + "_pool_logfile.txt",
                        level=logging.INFO)

    # define the number of CPU cores to be used concurrently
    n_processes = 4

    with Pool(processes=n_processes) as pool:
        list_of_return_statements = pool.map(function_to_process_files, list_of_files_to_process)
    # now transfer the list of return statements to the logfile
    for return_statement in list_of_return_statements:
        logging.info(return_statement)
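
If the log entries should appear while the files are still being processed, rather than only at the end, one variation is to replace pool.map with pool.imap_unordered, which hands back each return value as soon as its worker finishes (the names below are the same as in the snippet above):

    with Pool(processes=n_processes) as pool:
        # results are yielded in completion order, not input order
        for return_statement in pool.imap_unordered(function_to_process_files,
                                                    list_of_files_to_process):
            # write each result to the logfile as it arrives
            logging.info(return_statement)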