I'm trying to run sklearn on the dask distributed backend, but I get an error when using make_union in the following example:
from distributed.joblib import DistributedBackend
from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'sci.space'])
register_parallel_backend('distributed', DistributedBackend)
with parallel_backend('distributed', scheduler_host='scheduler:8786'):
    text_features = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), analyzer='word')
    tc = make_pipeline(text_features, LogisticRegression())
    tc.fit(newsgroups_train.data, newsgroups_train.target)

    newsgroups_test = fetch_20newsgroups(subset='test', categories=['alt.atheism', 'sci.space'])
    pred = tc.predict(newsgroups_test.data)
    print(metrics.f1_score(pred, newsgroups_test.target, average='macro'))
with parallel_backend('distributed', scheduler_host='scheduler:8786'):
    text_features = make_union(TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), analyzer='word'),
                               TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(3, 4), analyzer='char'))
    tc = make_pipeline(text_features, LogisticRegression())
    tc.fit(newsgroups_train.data, newsgroups_train.target)

    pred = tc.predict(newsgroups_test.data)
    print(metrics.f1_score(pred, newsgroups_test.target, average='macro'))
The script runs the first fit() inside the dask context successfully, and the print(metrics.f1_score(...)) call produces the expected output of 0.962817509347. In the second dask context, however, it raises the following error:
---------------------------------------------------------------------------
CancelledError Traceback (most recent call last)
<ipython-input-19-246fd63d77f6> in <module>()
28 TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(3, 4), analyzer='char'))
29 tc = make_pipeline(text_features2, LogisticRegression())
---> 30 tc.fit(newsgroups_train.data, newsgroups_train.target)
31
32 pred = tc.predict(newsgroups_test.data)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
255 This estimator
256 """
--> 257 Xt, fit_params = self._fit(X, y, **fit_params)
258 if self._final_estimator is not None:
259 self._final_estimator.fit(Xt, y, **fit_params)
...
~/anaconda3/envs/datascience/lib/python3.6/site-packages/distributed/client.py in result(self, timeout)
158 six.reraise(*result)
159 elif self.status == 'cancelled':
--> 160 raise result
161 else:
162 return result
CancelledError: _fit_transform_one-batch-7aa7d6e212804fd38aec8a7f037c8d25
It also produces the following error log:
distributed.client - WARNING - Client report stream closed to scheduler
tornado.application - ERROR - Exception in callback functools.partial(<function wrap.<locals>.null_wrapper at 0x7f5efee4a7b8>, <tornado.concurrent.Future object at 0x7f630cf95518>)
Traceback (most recent call last):
File "/root/anaconda3/envs/datascience/lib/python3.6/site-packages/tornado/ioloop.py", line 605, in _run_callback
ret = callback()
...
File "<string>", line 4, in raise_exc_info
File "/root/anaconda3/envs/datascience/lib/python3.6/site-packages/tornado/gen.py", line 1069, in run
yielded = self.gen.send(value)
File "/root/anaconda3/envs/datascience/lib/python3.6/site-packages/distributed/client.py", line 2825, in _wait
raise CancelledError(cancelled)
concurrent.futures._base.CancelledError: ['_fit_transform_one-batch-3b14ae7acf12489c8a5a7d8ef899eb1c']
The dask scheduler also produces the following error output:
scheduler_1 | KeyError: None
scheduler_1 | distributed.core - ERROR - None
scheduler_1 | Traceback (most recent call last):
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 386, in handle_stream
scheduler_1 | msgs = yield comm.read()
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 729, in run
scheduler_1 | value = future.result()
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 736, in run
scheduler_1 | yielded = self.gen.throw(*exc_info) # type: ignore
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/distributed/comm/tcp.py", line 206, in read
scheduler_1 | deserializers=deserializers)
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 729, in run
scheduler_1 | value = future.result()
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
scheduler_1 | yielded = next(result)
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/distributed/comm/utils.py", line 82, in from_frames
scheduler_1 | res = _from_frames()
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/distributed/comm/utils.py", line 68, in _from_frames
scheduler_1 | deserializers=deserializers)
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/distributed/protocol/core.py", line 132, in loads
scheduler_1 | value = _deserialize(head, fs, deserializers=deserializers)
scheduler_1 | File "/opt/conda/lib/python3.7/site-packages/distributed/protocol/serialize.py", line 186, in deserialize
scheduler_1 | dumps, loads, wants_context = families[name]
scheduler_1 | KeyError: None
scheduler_1 | distributed.scheduler - INFO - Remove client Client-b7ab33f8-419b-11e9-809a-0242ac130004
scheduler_1 | distributed.scheduler - INFO - Close client connection: Client-b7ab33f8-419b-11e9-809a-0242ac130004
Is there a known problem with using make_union() under dask? Any ideas? I'm running the code in a Jupyter notebook inside a docker-compose environment consisting of a worker, a scheduler, and a notebook container. The notebook container is set up with conda (in case that affects the result).
I can get the script to work by replacing make_union() with the following union_make class:
from scipy import sparse
import numpy as np


class union_make:
    """Minimal stand-in for sklearn's make_union / FeatureUnion."""

    def __init__(self, *args):
        self.parts = args

    def fit(self, X, y=None):
        for p in self.parts:
            p.fit(X, y)
        return self

    def transform(self, X, y=None):
        # Transform X with each part and stack the results column-wise.
        Xs = [p.transform(X) for p in self.parts]
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = np.hstack(Xs)
        return Xs

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
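For completeness, this is how I drop the workaround into the failing pipeline (a minimal sketch; it assumes the same scheduler:8786 address and the newsgroups_train / newsgroups_test variables from the snippet above):

with parallel_backend('distributed', scheduler_host='scheduler:8786'):
    # union_make replaces make_union(); everything else stays the same.
    text_features = union_make(TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), analyzer='word'),
                               TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(3, 4), analyzer='char'))
    tc = make_pipeline(text_features, LogisticRegression())
    tc.fit(newsgroups_train.data, newsgroups_train.target)

    pred = tc.predict(newsgroups_test.data)
    print(metrics.f1_score(pred, newsgroups_test.target, average='macro'))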