我正在尝试在我们的 HPC 集群上运行分布式超参数优化。这段代码用 sklearn 可以正常运行(只是很慢),但改用内置的 dask 功能后,worker 会立即因 KeyError 而失败。
client = Client('<ip>')
import numpy as np
import dask.array as da
X_train = np.load(<some numpy x binary>)
Y_train = np.load(<some numpy y binary>)
X_train = da.from_array(X_train, chunks=50000)
Y_train = da.from_array(Y_train, chunks=50000)
#from xgboost import XGBClassifier # These work
from dask_xgboost import XGBClassifier
#from sklearn.model_selection import GridSearchCV # These work
from dask_ml.model_selection import GridSearchCV
opt_clf = XGBClassifier()
hyparams={}
for par, val in param.items():
hyparams[par]=[val]
hyparams["eta"] = [0.2,0.5,0.9]
search = GridSearchCV(opt_clf, hyparams, scoring='f1_macro')
任务已经分配给了各个 worker,我也能看到它们占用的内存,但是所有训练任务都失败了:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-12-cfe3d66f9554> in <module>
1 get_ipython().run_line_magic('time', '')
----> 2 search.fit(X_train, Y_train)
3 search.cv_results_
~/.local/lib/python3.6/site-packages/dask_ml/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
1264 else:
1265 logger.warning("{} has failed... retrying".format(future.key))
-> 1266 future.retry()
1267 ac.add(future)
1268
~/.local/lib/python3.6/site-packages/distributed/client.py in retry(self, **kwargs)
310 Client.retry
311 """
--> 312 return self.client.retry([self], **kwargs)
313
314 def cancelled(self):
~/.local/lib/python3.6/site-packages/distributed/client.py in retry(self, futures, asynchronous)
2234 futures: list of Futures
2235 """
-> 2236 return self.sync(self._retry, futures, asynchronous=asynchronous)
2237
2238 async def _publish_dataset(self, *args, name=None, **kwargs):
~/.local/lib/python3.6/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
831 else:
832 return sync(
--> 833 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
834 )
835
~/.local/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
337 if error[0]:
338 typ, exc, tb = error[0]
--> 339 raise exc.with_traceback(tb)
340 else:
341 return result[0]
~/.local/lib/python3.6/site-packages/distributed/utils.py in f()
321 if callback_timeout is not None:
322 future = asyncio.wait_for(future, callback_timeout)
--> 323 result[0] = yield future
324 except Exception as exc:
325 error[0] = sys.exc_info()
~/.local/lib/python3.6/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
~/.local/lib/python3.6/site-packages/distributed/client.py in _retry(self, futures)
2223 response = await self.scheduler.retry(keys=keys, client=self.id)
2224 for key in response:
-> 2225 st = self.futures[key]
2226 st.retry()
2227
KeyError: 'xgbclassifier-a93a193c47acb611cde5ecdeb7347809'
有人对此问题有任何提示或调试方法吗?看起来 worker 根本没有返回训练好的模型。
谢谢