I am running a set of pandas-style transformations on a dask DataFrame, using a distributed setup running on my own machine, i.e. 8 workers corresponding to my computer's 8 cores.
I am using the default settings for the distributed client:
from dask.distributed import Client
c = Client()
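For reference, here is a minimal sketch of what an explicit local setup might look like, assuming the LocalCluster class from dask.distributed; the n_workers and threads_per_worker values are illustrative rather than the exact defaults Client() picks:
from dask.distributed import Client, LocalCluster
# Hypothetical explicit equivalent of the default Client() on an 8-core machine;
# the worker/thread split shown here is an assumption, not my actual configuration.
cluster = LocalCluster(n_workers=8, threads_per_worker=1)
c = Client(cluster)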
The process runs successfully with a small amount of data (1,000 records), but when I scale it up slightly to 7,500 records I get the following warning:
tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:58103, threads: 1>>
Traceback (most recent call last):
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
yield
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
return self.callback()
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
memory = psutil.Process(self.process.pid).memory_info().rss
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_common.py", line 337, in wrapper
return fun(self)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/__init__.py", line 1049, in memory_info
return self._proc.memory_info()
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 330, in wrapper
return fun(self, *args, **kwargs)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 456, in memory_info
rawtuple = self._get_pidtaskinfo()
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_common.py", line 337, in wrapper
return fun(self)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
ret = cext.proc_pidtaskinfo_oneshot(self.pid)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/contextlib.py", line 99, in __exit__
self.gen.throw(type, value, traceback)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 361, in catch_zombie
raise AccessDenied(proc.pid, proc._name)
psutil._exceptions.AccessDenied: psutil.AccessDenied (pid=17998)
This repeats several times as dask retries the failing chunk of the computation. After it has failed the number of times specified in the config file (see the configuration sketch after the traceback), it eventually ends in a KilledWorker error such as the one below. I have tried different lengths of data, and the KilledWorker is sometimes raised on a melt task and sometimes on an apply task.
KilledWorker Traceback (most recent call last)
<ipython-input-28-7ba288919b51> in <module>()
1 #Optional checkpoint to view output
2 with ProgressBar():
----> 3 output = aggdf.compute()
4 output.head()
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135 (result,) = compute(self, traverse=False, **kwargs)
136 return result
137
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
1997 secede()
1998 try:
-> 1999 results = self.gather(packed, asynchronous=asynchronous)
2000 finally:
2001 for f in futures.values():
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1435 return self.sync(self._gather, futures, errors=errors,
1436 direct=direct, local_worker=local_worker,
-> 1437 asynchronous=asynchronous)
1438
1439 @gen.coroutine
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
590 return future
591 else:
--> 592 return sync(self.loop, func, *args, **kwargs)
593
594 def __repr__(self):
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1313 six.reraise(type(exception),
1314 exception,
-> 1315 traceback)
1316 if errors == 'skip':
1317 bad_keys.add(key)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
KilledWorker: ("('melt-b85b6b6b1aee5b5aaa8d24db1de65b8a', 0)", 'tcp://127.0.0.1:58108')
I am not very familiar with the distributed or tornado packages, or with the underlying architecture of which processes get created and killed. Can anyone point me in the right direction to debug or resolve this?
In the meantime I am switching back to the default multithreaded dask DataFrame behaviour, which handles the larger volumes of data successfully.
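For completeness, here is a minimal sketch of that workaround, assuming a dask version where compute() accepts a scheduler keyword (older releases pass get=dask.threaded.get instead); aggdf is the dask DataFrame built earlier in my pipeline:
# Bypass the distributed Client and run the graph on the local threaded
# scheduler instead; this is the default behaviour when no Client exists.
output = aggdf.compute(scheduler='threads')
output.head()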
Answer 0 (score: 3)
It looks like your workers are dying for some reason. Unfortunately it is not clear from the workers themselves why this is happening. You might consider setting up the cluster manually so that you have clearer access to the worker logs:
$ dask-scheduler # run this in one terminal
$ dask-worker tcp://localhost:8786 # run this in another
# worker logs will appear in this terminal
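Once the scheduler and worker are up, the client can be pointed at this manual cluster instead of spawning its own; a minimal sketch, assuming the default scheduler port 8786:
from dask.distributed import Client
# Connect to the manually started scheduler rather than letting Client()
# create a fresh local cluster; worker output then stays visible in the
# dask-worker terminal.
c = Client('tcp://localhost:8786')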