Question

我使用模块multiprocessing编写了一个程序，该模块全局执行如下：

启动simulation和ui进程。
simulation进程使用新的模拟状态提供队列。如果队列已满，则模拟循环不会被阻止，因此它可以处理可能的传入消息。
ui进程使用模拟队列。
执行时间大约1秒后，ui进程向主进程发送quit事件，然后退出循环。退出后，它会通过stopped的内部_create_process()功能向主进程发送wrapper()个事件。
主进程以任何顺序接收两个事件。 quit事件导致主进程向所有子进程发送stop信号，而stopped事件在主循环中递增计数器，这将导致它在收到之后退出许多stopped事件都有进程。
simulation进程收到stop事件并退出循环，然后将stopped事件发送到主进程。
主要流程现在总共收到了2个stopped个事件，并得出结论：所有子进程都在被停止的路上。结果，主循环退出
run()函数刷新子进程写入的队列。
正在加入子进程。

问题在于，根据下面的日志，尝试加入simulation进程时，程序会经常（但并非总是）挂起。

[...]
[INFO/ui] process exiting with exitcode 0
[DEBUG/MainProcess] starting thread to feed data to pipe
[DEBUG/MainProcess] ... done self._thread.start()
[DEBUG/simulation] Queue._start_thread()
[DEBUG/simulation] doing self._thread.start()
[DEBUG/simulation] starting thread to feed data to pipe
[DEBUG/simulation] ... done self._thread.start()
[DEBUG/simulation] telling queue thread to quit
[DEBUG/MainProcess] all child processes (2) should have been stopped!
[INFO/simulation] process shutting down
[DEBUG/simulation] running all "atexit" finalizers with priority >= 0
[DEBUG/simulation] telling queue thread to quit
[DEBUG/simulation] running the remaining "atexit" finalizers
[DEBUG/simulation] joining queue thread
[DEBUG/MainProcess] joining process <Process(simulation, started)>
[DEBUG/simulation] feeder thread got sentinel -- exiting
[DEBUG/simulation] ... queue thread joined
[DEBUG/simulation] joining queue thread

通过shell中的Ctrl + C停止执行会导致这些损坏的回溯：

Process simulation:
Traceback (most recent call last):
Traceback (most recent call last):
  File "./debug.py", line 224, in <module>
    run()
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/process.py", line 257, in _bootstrap
    util._exit_function()
  File "./debug.py", line 92, in run
    process.join()  #< This doesn't work.
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/util.py", line 312, in _exit_function
    _run_finalizers()
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/process.py", line 121, in join
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/util.py", line 252, in _run_finalizers
    finalizer()
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/util.py", line 185, in __call__
    res = self._callback(*self._args, **self._kwargs)
    res = self._popen.wait(timeout)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/popen_fork.py", line 54, in wait
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/queues.py", line 196, in _finalize_join
    thread.join()
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/popen_fork.py", line 30, in poll
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/threading.py", line 1060, in join
    self._wait_for_tstate_lock()
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/threading.py", line 1076, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
KeyboardInterrupt

至于代码，这里是它的简化版本（因此它常常看起来不完整）：

#!/usr/bin/env python3

import logging
import multiprocessing
import pickle
import queue
import time

from collections import namedtuple

_LOGGER = multiprocessing.log_to_stderr()
_LOGGER.setLevel(logging.DEBUG)

_BUFFER_SIZE = 4
_DATA_LENGTH = 2 ** 12

_STATUS_SUCCESS = 0
_STATUS_FAILURE = 1

_EVENT_ERROR = 0
_EVENT_QUIT = 1
_EVENT_STOPPED = 2

_MESSAGE_STOP = 0
_MESSAGE_EVENT = 1
_MESSAGE_SIMULATION_UPDATE = 2

_Message = namedtuple('_Message', ('type', 'value',))
_StopMessage = namedtuple('_StopMessage', ())
_EventMessage = namedtuple('_EventMessage', ('type', 'value',))
_SimulationUpdateMessage = namedtuple('_SimulationUpdateMessage', ('state',))

_MESSAGE_STRUCTS = {
    _MESSAGE_STOP: _StopMessage,
    _MESSAGE_EVENT: _EventMessage,
    _MESSAGE_SIMULATION_UPDATE: _SimulationUpdateMessage
}

def run():
    # Messages from the main process to the child ones.
    downward_queue = multiprocessing.Queue()
    # Messages from the child processes to the main one.
    upward_queue = multiprocessing.Queue()
    # Messages from the simulation process to the UI one.
    simulation_to_ui_queue = multiprocessing.Queue(maxsize=_BUFFER_SIZE)

    # Regroup all the queues that can be written by child processes.
    child_process_queues = (upward_queue, simulation_to_ui_queue,)

    processes = (
        _create_process(
            _simulation,
            upward_queue,
            name='simulation',
            args=(
                simulation_to_ui_queue,
                downward_queue
            )
        ),
        _create_process(
            _ui,
            upward_queue,
            name='ui',
            args=(
                upward_queue,
                simulation_to_ui_queue,
                downward_queue
            )
        )
    )

    try:
        for process in processes:
            process.start()

        _main(downward_queue, upward_queue, len(processes))
    finally:
        # while True:
        #     alive_processes = tuple(process for process in processes
        #                             if process.is_alive())
        #     if not alive_processes:
        #         break

        #     _LOGGER.debug("processes still alive: %s" % (alive_processes,))

        for q in child_process_queues:
            _flush_queue(q)

        for process in processes:
            _LOGGER.debug("joining process %s" % process)
            # process.terminate()  #< This works!
            process.join()  #< This doesn't work.

def _main(downward_queue, upward_queue, process_count):
    try:
        stopped_count = 0
        while True:
            message = _receive_message(upward_queue, False)
            if message is not None and message.type == _MESSAGE_EVENT:
                event_type = message.value.type
                if event_type in (_EVENT_QUIT, _EVENT_ERROR):
                    break
                elif event_type == _EVENT_STOPPED:
                    stopped_count += 1
                    if stopped_count >= process_count:
                        break
    finally:
        # Whatever happens, make sure that all child processes have stopped.
        if stopped_count >= process_count:
            return

        # Send a 'stop' signal to all the child processes.
        for _ in range(process_count):
            _send_message(downward_queue, True, _MESSAGE_STOP)

        while True:
            message = _receive_message(upward_queue, False)
            if (message is not None
                    and message.type == _MESSAGE_EVENT
                    and message.value.type == _EVENT_STOPPED):
                stopped_count += 1
                if stopped_count >= process_count:
                    _LOGGER.debug(
                        "all child processes (%d) should have been stopped!"
                        % stopped_count
                    )
                    break

def _simulation(simulation_to_ui_queue, downward_queue):
    simulation_state = [i * 0.123 for i in range(_DATA_LENGTH)]

    # When the queue is full (possibly form reaching _BUFFER_SIZE), the next
    # solve is computed and kept around until the queue is being consumed.
    next_solve_message = None
    while True:
        message = _receive_message(downward_queue, False)
        if message is not None and message.type == _MESSAGE_STOP:
            break

        if next_solve_message is None:
            # _step(simulation_state)

            # Somehow the copy (pickle) seems to increase the chances for
            # the issue to happen.
            next_solve_message = _SimulationUpdateMessage(
                state=pickle.dumps(simulation_state)
            )

        status = _send_message(simulation_to_ui_queue, False,
                               _MESSAGE_SIMULATION_UPDATE,
                               **next_solve_message._asdict())
        if status == _STATUS_SUCCESS:
            next_solve_message = None

def _ui(upward_queue, simulation_to_ui_queue, downward_queue):
    time_start = -1.0
    previous_time = 0.0
    while True:
        message = _receive_message(downward_queue, False)
        if message is not None and message.type == _MESSAGE_STOP:
            break

        if time_start < 0:
            current_time = 0.0
            time_start = time.perf_counter()
        else:
            current_time = time.perf_counter() - time_start

        message = _receive_message(simulation_to_ui_queue, False)

        if current_time > 1.0:
            _LOGGER.debug("asking to quit")
            _send_message(upward_queue, True, _MESSAGE_EVENT,
                          type=_EVENT_QUIT, value=None)
            break

        previous_time = current_time

def _create_process(target, upward_queue, name='', args=None):
    def wrapper(function, upward_queue, *args, **kwargs):
        try:
            function(*args, **kwargs)
        except Exception:
            _send_message(upward_queue, True, _MESSAGE_EVENT,
                          type=_EVENT_ERROR, value=None)
        finally:
            _send_message(upward_queue, True, _MESSAGE_EVENT,
                          type=_EVENT_STOPPED, value=None)
            upward_queue.close()

    process = multiprocessing.Process(
        target=wrapper,
        name=name,
        args=(target, upward_queue) + args,
        kwargs={}
    )
    return process

def _receive_message(q, block):
    try:
        message = q.get(block=block)
    except queue.Empty:
        return None

    return message

def _send_message(q, block, message_type, **kwargs):
    message_value = _MESSAGE_STRUCTS[message_type](**kwargs)
    try:
        q.put(_Message(type=message_type, value=message_value), block=block)
    except queue.Full:
        return _STATUS_FAILURE

    return _STATUS_SUCCESS

def _flush_queue(q):
    try:
        while True:
            q.get(block=False)
    except queue.Empty:
        pass

if __name__ == '__main__':
    run()

关于StackOverflow的相关问题和Python文档中的提示基本上归结为需要在加入进程之前刷新队列，我相信我一直在这里尝试。我意识到，当程序在退出时尝试刷新它们时，模拟队列仍然可能试图将（可能很大的）缓冲数据推送到管道上，从而最终仍然是非空队列。这就是为什么我试图确保在达到这一点之前所有子进程都已停止。现在，查看上面的日志以及取消注释while True循环检查活动进程后输出的其他日志，看起来simulation进程根本不希望完全关闭，即使它的目标功能肯定退出。这可能是我问题的原因吗？

如果是这样，我如何干净地处理它？否则，我在这里失踪了什么？

在Mac OS X 10.9.5上使用Python 3.4进行测试。

PS：我想知道这是否与this bug无关？

Answer 1

听起来这个问题的确是由于推迟数据通过队列的一些延迟，导致刷新失效，因为太早发生了。

一个简单的while process.is_alive(): flush_the_queues()似乎可以解决问题！

即使在刷新队列后也无法加入进程

1 个答案: