我使用模块multiprocessing
编写了一个程序,该模块全局执行如下:
simulation
和ui
进程。simulation
进程使用新的模拟状态提供队列。如果队列已满,则模拟循环不会被阻止,因此它可以处理可能的传入消息。ui
进程使用模拟队列。ui
进程向主进程发送quit
事件,然后退出循环。退出后,它会通过stopped
的内部_create_process()
功能向主进程发送wrapper()
个事件。quit
事件导致主进程向所有子进程发送stop
信号,而stopped
事件在主循环中递增计数器,这将导致它在收到之后退出许多stopped
事件都有进程。simulation
进程收到stop
事件并退出循环,然后将stopped
事件发送到主进程。stopped
个事件,并得出结论:所有子进程都在被停止的路上。结果,主循环退出run()
函数刷新子进程写入的队列。问题在于,根据下面的日志,尝试加入simulation
进程时,程序会经常(但并非总是)挂起。
[...]
[INFO/ui] process exiting with exitcode 0
[DEBUG/MainProcess] starting thread to feed data to pipe
[DEBUG/MainProcess] ... done self._thread.start()
[DEBUG/simulation] Queue._start_thread()
[DEBUG/simulation] doing self._thread.start()
[DEBUG/simulation] starting thread to feed data to pipe
[DEBUG/simulation] ... done self._thread.start()
[DEBUG/simulation] telling queue thread to quit
[DEBUG/MainProcess] all child processes (2) should have been stopped!
[INFO/simulation] process shutting down
[DEBUG/simulation] running all "atexit" finalizers with priority >= 0
[DEBUG/simulation] telling queue thread to quit
[DEBUG/simulation] running the remaining "atexit" finalizers
[DEBUG/simulation] joining queue thread
[DEBUG/MainProcess] joining process <Process(simulation, started)>
[DEBUG/simulation] feeder thread got sentinel -- exiting
[DEBUG/simulation] ... queue thread joined
[DEBUG/simulation] joining queue thread
通过shell中的Ctrl + C
停止执行会导致这些损坏的回溯:
Process simulation:
Traceback (most recent call last):
Traceback (most recent call last):
File "./debug.py", line 224, in <module>
run()
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/process.py", line 257, in _bootstrap
util._exit_function()
File "./debug.py", line 92, in run
process.join() #< This doesn't work.
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/util.py", line 312, in _exit_function
_run_finalizers()
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/process.py", line 121, in join
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/util.py", line 252, in _run_finalizers
finalizer()
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/util.py", line 185, in __call__
res = self._callback(*self._args, **self._kwargs)
res = self._popen.wait(timeout)
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/popen_fork.py", line 54, in wait
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/queues.py", line 196, in _finalize_join
thread.join()
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/popen_fork.py", line 30, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/threading.py", line 1060, in join
self._wait_for_tstate_lock()
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/threading.py", line 1076, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
KeyboardInterrupt
至于代码,这里是它的简化版本(因此它常常看起来不完整):
#!/usr/bin/env python3
import logging
import multiprocessing
import pickle
import queue
import time
from collections import namedtuple
_LOGGER = multiprocessing.log_to_stderr()
_LOGGER.setLevel(logging.DEBUG)
_BUFFER_SIZE = 4
_DATA_LENGTH = 2 ** 12
_STATUS_SUCCESS = 0
_STATUS_FAILURE = 1
_EVENT_ERROR = 0
_EVENT_QUIT = 1
_EVENT_STOPPED = 2
_MESSAGE_STOP = 0
_MESSAGE_EVENT = 1
_MESSAGE_SIMULATION_UPDATE = 2
_Message = namedtuple('_Message', ('type', 'value',))
_StopMessage = namedtuple('_StopMessage', ())
_EventMessage = namedtuple('_EventMessage', ('type', 'value',))
_SimulationUpdateMessage = namedtuple('_SimulationUpdateMessage', ('state',))
_MESSAGE_STRUCTS = {
_MESSAGE_STOP: _StopMessage,
_MESSAGE_EVENT: _EventMessage,
_MESSAGE_SIMULATION_UPDATE: _SimulationUpdateMessage
}
def run():
# Messages from the main process to the child ones.
downward_queue = multiprocessing.Queue()
# Messages from the child processes to the main one.
upward_queue = multiprocessing.Queue()
# Messages from the simulation process to the UI one.
simulation_to_ui_queue = multiprocessing.Queue(maxsize=_BUFFER_SIZE)
# Regroup all the queues that can be written by child processes.
child_process_queues = (upward_queue, simulation_to_ui_queue,)
processes = (
_create_process(
_simulation,
upward_queue,
name='simulation',
args=(
simulation_to_ui_queue,
downward_queue
)
),
_create_process(
_ui,
upward_queue,
name='ui',
args=(
upward_queue,
simulation_to_ui_queue,
downward_queue
)
)
)
try:
for process in processes:
process.start()
_main(downward_queue, upward_queue, len(processes))
finally:
# while True:
# alive_processes = tuple(process for process in processes
# if process.is_alive())
# if not alive_processes:
# break
# _LOGGER.debug("processes still alive: %s" % (alive_processes,))
for q in child_process_queues:
_flush_queue(q)
for process in processes:
_LOGGER.debug("joining process %s" % process)
# process.terminate() #< This works!
process.join() #< This doesn't work.
def _main(downward_queue, upward_queue, process_count):
try:
stopped_count = 0
while True:
message = _receive_message(upward_queue, False)
if message is not None and message.type == _MESSAGE_EVENT:
event_type = message.value.type
if event_type in (_EVENT_QUIT, _EVENT_ERROR):
break
elif event_type == _EVENT_STOPPED:
stopped_count += 1
if stopped_count >= process_count:
break
finally:
# Whatever happens, make sure that all child processes have stopped.
if stopped_count >= process_count:
return
# Send a 'stop' signal to all the child processes.
for _ in range(process_count):
_send_message(downward_queue, True, _MESSAGE_STOP)
while True:
message = _receive_message(upward_queue, False)
if (message is not None
and message.type == _MESSAGE_EVENT
and message.value.type == _EVENT_STOPPED):
stopped_count += 1
if stopped_count >= process_count:
_LOGGER.debug(
"all child processes (%d) should have been stopped!"
% stopped_count
)
break
def _simulation(simulation_to_ui_queue, downward_queue):
simulation_state = [i * 0.123 for i in range(_DATA_LENGTH)]
# When the queue is full (possibly form reaching _BUFFER_SIZE), the next
# solve is computed and kept around until the queue is being consumed.
next_solve_message = None
while True:
message = _receive_message(downward_queue, False)
if message is not None and message.type == _MESSAGE_STOP:
break
if next_solve_message is None:
# _step(simulation_state)
# Somehow the copy (pickle) seems to increase the chances for
# the issue to happen.
next_solve_message = _SimulationUpdateMessage(
state=pickle.dumps(simulation_state)
)
status = _send_message(simulation_to_ui_queue, False,
_MESSAGE_SIMULATION_UPDATE,
**next_solve_message._asdict())
if status == _STATUS_SUCCESS:
next_solve_message = None
def _ui(upward_queue, simulation_to_ui_queue, downward_queue):
time_start = -1.0
previous_time = 0.0
while True:
message = _receive_message(downward_queue, False)
if message is not None and message.type == _MESSAGE_STOP:
break
if time_start < 0:
current_time = 0.0
time_start = time.perf_counter()
else:
current_time = time.perf_counter() - time_start
message = _receive_message(simulation_to_ui_queue, False)
if current_time > 1.0:
_LOGGER.debug("asking to quit")
_send_message(upward_queue, True, _MESSAGE_EVENT,
type=_EVENT_QUIT, value=None)
break
previous_time = current_time
def _create_process(target, upward_queue, name='', args=None):
def wrapper(function, upward_queue, *args, **kwargs):
try:
function(*args, **kwargs)
except Exception:
_send_message(upward_queue, True, _MESSAGE_EVENT,
type=_EVENT_ERROR, value=None)
finally:
_send_message(upward_queue, True, _MESSAGE_EVENT,
type=_EVENT_STOPPED, value=None)
upward_queue.close()
process = multiprocessing.Process(
target=wrapper,
name=name,
args=(target, upward_queue) + args,
kwargs={}
)
return process
def _receive_message(q, block):
try:
message = q.get(block=block)
except queue.Empty:
return None
return message
def _send_message(q, block, message_type, **kwargs):
message_value = _MESSAGE_STRUCTS[message_type](**kwargs)
try:
q.put(_Message(type=message_type, value=message_value), block=block)
except queue.Full:
return _STATUS_FAILURE
return _STATUS_SUCCESS
def _flush_queue(q):
try:
while True:
q.get(block=False)
except queue.Empty:
pass
if __name__ == '__main__':
run()
关于StackOverflow的相关问题和Python文档中的提示基本上归结为需要在加入进程之前刷新队列,我相信我一直在这里尝试。我意识到,当程序在退出时尝试刷新它们时,模拟队列仍然可能试图将(可能很大的)缓冲数据推送到管道上,从而最终仍然是非空队列。这就是为什么我试图确保在达到这一点之前所有子进程都已停止。现在,查看上面的日志以及取消注释while True
循环检查活动进程后输出的其他日志,看起来simulation
进程根本不希望完全关闭,即使它的目标功能肯定退出。这可能是我问题的原因吗?
如果是这样,我如何干净地处理它?否则,我在这里失踪了什么?
在Mac OS X 10.9.5上使用Python 3.4进行测试。
PS:我想知道这是否与this bug无关?
答案 0 :(得分:0)
听起来这个问题的确是由于推迟数据通过队列的一些延迟,导致刷新失效,因为太早发生了。
一个简单的while process.is_alive(): flush_the_queues()
似乎可以解决问题!