Catching errors in a multiprocessing pool map

Date: 2016-02-15 22:47:35

Tags: python python-3.x pandas multiprocessing pool

I have some Python code that uses a multiprocessing pool map. The map spawns several children, each of which reads a separate file, and I collect the results at the end. My goal is to end up with a single pandas dataframe that is the concatenation of all the children's output, with duplicates dropped. I use this dataframe for further processing (the rest of the code seems irrelevant to this question, so I omit it for brevity). The code runs on a schedule at the end of each week, reading fresh input files every time. Sometimes the files the children read contain errors, such as null values in an integer column, or a file may be missing altogether. If any of these errors occurs, I want the main script to die as quickly as possible. I don't know the most efficient way to achieve that.

Here is what I have tried, in turn:

1. Making the child die by raising SystemExit(1) when it encounters an error. I could not get the parent to die.
2. Making the child return an empty value or an empty pandas dataframe on error, from a try/except block. I could not reliably detect that in the parent.
3. Using map_async with callback functions instead of map.

The last one seems to work. However, I am not sure whether it is the correct and most efficient approach, since I never use any output from the error callback function. Any comments and suggestions are appreciated.
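Worth noting: a plain pool.map already propagates a child's exception to the parent when the results are collected, so a try/except around the map call is one way to get the fast-death behaviour without callbacks. A minimal sketch, with a hypothetical read_one worker and "missing.txt" standing in for a bad input file:

```python
import multiprocessing

def read_one(path):
    # Hypothetical worker: "missing.txt" stands in for a bad input file
    if path == "missing.txt":
        raise FileNotFoundError(path)
    return path.upper()

def read_all(paths):
    # pool.map re-raises the first child exception here in the parent,
    # so the parent can die as soon as any child fails
    with multiprocessing.Pool(2) as pool:
        return pool.map(read_one, paths)

if __name__ == "__main__":
    try:
        read_all(["a.txt", "missing.txt"])
    except FileNotFoundError as exc:
        print("child failed:", exc)  # parent can now exit immediately
```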

Edit:

Sample input file a.txt:

shipmentId,processing_time_epoch
4001,1455408024132
4231,1455408024373

b.txt:

shipmentId,processing_time_epoch
5001,1455408024132
4231,1455408024373

Desired final processing_time pandas dataframe:

shipmentId,processing_time_epoch
4001,1455408024132
4231,1455408024373
5001,1455408024132

My code:

import sys
import multiprocessing
import pandas as pd

def myerrorcallback(x):
    print('There seems to be an error in the child. Parent: Please die.')
    return

def mycallback(x):
    print('Returned successfully.')
    return

def PrintException():
    exc_type, exc_obj, tb = sys.exc_info()
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    print('EXCEPTION IN ({}, LINE {} ): {} ({})'.format(filename, lineno, exc_obj, exc_type))
    return

# ===================================================================
def Read_Processing_Times_v1(full_path_name):
    try:
        df = pd.read_csv(full_path_name,
                         dtype={'shipmentId': 'int64', 'processing_time_epoch': 'int64'},
                         usecols=['shipmentId', 'processing_time_epoch'])
        return df.drop_duplicates()
    except Exception:
        print("exception in file " + full_path_name)
        PrintException()
        raise SystemExit(1)

# ===================================================================
def Read_Processing_Times_v2(full_path_name):
    try:
        df = pd.read_csv(full_path_name,
                         dtype={'shipmentId': 'int64', 'processing_time_epoch': 'int64'},
                         usecols=['shipmentId', 'processing_time_epoch'])
        return df.drop_duplicates()
    except Exception:
        print("exception in file " + full_path_name)
        PrintException()
    return pd.DataFrame()


# ===================================================================
def Read_Processing_Times_v3(full_path_name):
    df = pd.read_csv(full_path_name,
                     dtype={'shipmentId': 'int64', 'processing_time_epoch': 'int64'},
                     usecols=['shipmentId', 'processing_time_epoch'])
    return df.drop_duplicates()

# ===========================================================================================================================
# Top-level
if __name__ == '__main__':

    mycols = ['shipmentId', 'processing_time_epoch']
    mydtypes = {'shipmentId': 'int64', 'processing_time_epoch': 'int64'}

    # The following two files should not give an error:
    # files_to_read=["a.txt","b.txt"]

    # The following two files should give an error, as a2.txt does not exist:
    files_to_read = ["a2.txt", "b.txt"]

    # version 1: Works with the correct files. Does not work if one of the children has an error: the child dies, the parent does not and waits forever.
    # print("version 1")
    # pool = multiprocessing.Pool(15)
    # processing_times = pool.map(Read_Processing_Times_v1, files_to_read)
    # pool.close()
    # pool.join()
    # processing_times = pd.concat(processing_times,ignore_index=True).drop_duplicates()
    # print(processing_times)


    # version 2: Does not work. Don't know how to fix it. The idea is to make the child return something and catch the error in the parent.
    # print("version 2")
    # pool = multiprocessing.Pool(15)
    # processing_times = pool.map(Read_Processing_Times_v2, files_to_read)
    # pool.close()
    # pool.join()
    # if(processing_times.count(pd.DataFrame()) > 0):
    #     print("SLAM times are not read properly.")
    #     raise SystemExit(1)

    # version 3:
    print("version 3")
    pool = multiprocessing.Pool(15)
    processing_times = pool.map_async(Read_Processing_Times_v3, files_to_read,
                                      callback=mycallback, error_callback=myerrorcallback)
    pool.close()
    pool.join()
    processing_times = processing_times.get()
    processing_times = pd.concat(processing_times,ignore_index=True).drop_duplicates()


    print("success!")

    # Do more processing with processing_times after this line...

1 Answer:

Answer 0 (score: 0)

I think you can use the concurrent.futures module (https://docs.python.org/3/library/concurrent.futures.html) to accomplish what you want. Below is the example from the documentation page, modified to be closer to your problem. In the example, a False return value from work_func is treated as an error, and the program terminates.

import sys
import concurrent.futures
import random
import time


def work_func(input_val):
    """
    Do some work.  Here a False value would mean there is an error
    """
    time.sleep(0.5)
    return random.choice([True, True, True, True, False])


if __name__ == "__main__":
    # We can use a with statement to ensure processes are cleaned up promptly
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Start the load operations and mark each future with its input value
        future_to_result = {executor.submit(work_func, val): val for val in range(30)}

        # iterate over the futures as they become available
        for future in concurrent.futures.as_completed(future_to_result):

            #  get the input value from the dict
            input_val = future_to_result[future]

            # now retrieve the result from the future
            try:
                data = future.result()
            except Exception as exc:
                print(input_val, data)
                print('Something exceptional happend')
            else:
                print(input_val, data)
                if not data:
                    print('Error - exiting')
                    sys.exit(1)

Sample output:

0 True
1 True
2 True
3 False
Error - exiting
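Adapted to the CSV-reading problem from the question, the same pattern lets future.result() re-raise the child's exception directly in the parent, with no sentinel return value needed. A sketch, assuming a read_times worker that mirrors the question's reader (dtype checks omitted for brevity):

```python
import concurrent.futures
import pandas as pd

def read_times(path):
    # Same shape as the question's reader; any exception in the child
    # (missing file, bad values) is re-raised in the parent by future.result()
    df = pd.read_csv(path, usecols=['shipmentId', 'processing_time_epoch'])
    return df.drop_duplicates()

def collect(paths):
    frames = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(read_times, p): p for p in paths}
        for future in concurrent.futures.as_completed(futures):
            frames.append(future.result())  # re-raises child errors here
    return pd.concat(frames, ignore_index=True).drop_duplicates()
```

Calling collect inside a try/except in the main script then gives the desired fail-fast behaviour: the first child error propagates out of the loop and the parent can exit immediately.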