Sklearn make_union and dask

Date: 2019-03-08 12:35:56

Tags: scikit-learn dask

I am trying to run sklearn with the dask distributed backend, but I get an error when using make_union in the following example:

from distributed.joblib import DistributedBackend 
from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics


from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'sci.space'])

register_parallel_backend('distributed', DistributedBackend)

with parallel_backend('distributed', scheduler_host='scheduler:8786'):
    text_features = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), analyzer='word')
    tc = make_pipeline(text_features, LogisticRegression())
    tc.fit(newsgroups_train.data, newsgroups_train.target)

newsgroups_test = fetch_20newsgroups(subset='test', categories=['alt.atheism', 'sci.space']) 
pred = tc.predict(newsgroups_test.data)
print(metrics.f1_score(pred, newsgroups_test.target, average='macro'))

with parallel_backend('distributed', scheduler_host='scheduler:8786'):
    text_features = make_union(TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), analyzer='word'),
                                TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(3, 4), analyzer='char'))
    tc = make_pipeline(text_features, LogisticRegression())
    tc.fit(newsgroups_train.data, newsgroups_train.target)

pred = tc.predict(newsgroups_test.data)
print(metrics.f1_score(pred, newsgroups_test.target, average='macro'))

The script successfully runs the first fit() inside the dask context. It also successfully produces the output 0.962817509347 from print(metrics.f1_score(...)), but in the second dask context it raises the following error:

---------------------------------------------------------------------------
CancelledError                            Traceback (most recent call last)
<ipython-input-19-246fd63d77f6> in <module>()
     28                                 TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(3, 4), analyzer='char'))
     29     tc = make_pipeline(text_features2, LogisticRegression())
---> 30     tc.fit(newsgroups_train.data, newsgroups_train.target)
     31 
     32 pred = tc.predict(newsgroups_test.data)

~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    255             This estimator
    256         """
--> 257         Xt, fit_params = self._fit(X, y, **fit_params)
    258         if self._final_estimator is not None:
    259             self._final_estimator.fit(Xt, y, **fit_params)
...
~/anaconda3/envs/datascience/lib/python3.6/site-packages/distributed/client.py in result(self, timeout)
    158             six.reraise(*result)
    159         elif self.status == 'cancelled':
--> 160             raise result
    161         else:
    162             return result

CancelledError: _fit_transform_one-batch-7aa7d6e212804fd38aec8a7f037c8d25

It also produces the following error log:

distributed.client - WARNING - Client report stream closed to scheduler
tornado.application - ERROR - Exception in callback functools.partial(<function wrap.<locals>.null_wrapper at 0x7f5efee4a7b8>, <tornado.concurrent.Future object at 0x7f630cf95518>)
Traceback (most recent call last):
  File "/root/anaconda3/envs/datascience/lib/python3.6/site-packages/tornado/ioloop.py", line 605, in _run_callback
    ret = callback()
...
  File "<string>", line 4, in raise_exc_info
  File "/root/anaconda3/envs/datascience/lib/python3.6/site-packages/tornado/gen.py", line 1069, in run
    yielded = self.gen.send(value)
  File "/root/anaconda3/envs/datascience/lib/python3.6/site-packages/distributed/client.py", line 2825, in _wait
    raise CancelledError(cancelled)
concurrent.futures._base.CancelledError: ['_fit_transform_one-batch-3b14ae7acf12489c8a5a7d8ef899eb1c']

The dask scheduler also produces the following error output:

scheduler_1  | KeyError: None
scheduler_1  | distributed.core - ERROR - None
scheduler_1  | Traceback (most recent call last):
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/distributed/core.py", line 386, in handle_stream
scheduler_1  |     msgs = yield comm.read()
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 729, in run
scheduler_1  |     value = future.result()
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 736, in run
scheduler_1  |     yielded = self.gen.throw(*exc_info)  # type: ignore
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/distributed/comm/tcp.py", line 206, in read
scheduler_1  |     deserializers=deserializers)
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 729, in run
scheduler_1  |     value = future.result()
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
scheduler_1  |     yielded = next(result)
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/distributed/comm/utils.py", line 82, in from_frames
scheduler_1  |     res = _from_frames()
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/distributed/comm/utils.py", line 68, in _from_frames
scheduler_1  |     deserializers=deserializers)
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/distributed/protocol/core.py", line 132, in loads
scheduler_1  |     value = _deserialize(head, fs, deserializers=deserializers)
scheduler_1  |   File "/opt/conda/lib/python3.7/site-packages/distributed/protocol/serialize.py", line 186, in deserialize
scheduler_1  |     dumps, loads, wants_context = families[name]
scheduler_1  | KeyError: None
scheduler_1  | distributed.scheduler - INFO - Remove client Client-b7ab33f8-419b-11e9-809a-0242ac130004
scheduler_1  | distributed.scheduler - INFO - Close client connection: Client-b7ab33f8-419b-11e9-809a-0242ac130004

Is there a known problem with make_union() on dask? Any ideas? I am running the code in a Jupyter notebook inside a docker-compose environment made up of a worker, a scheduler, and a notebook container. The notebook container is set up with conda (in case that affects the result).
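If the culprit is the union's inner joblib dispatch (FeatureUnion fits its transformers through joblib's Parallel, so the registered distributed backend picks that step up as well), one untested sketch is to pin the union to sequential execution. This assumes make_union forwards n_jobs to FeatureUnion and that n_jobs=1 bypasses the registered backend:

with parallel_backend('distributed', scheduler_host='scheduler:8786'):
    # Untested idea: n_jobs=1 should make the FeatureUnion's internal
    # Parallel call run sequentially in-process, so the transformer fits
    # never get shipped to the dask workers.
    text_features = make_union(
        TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), analyzer='word'),
        TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(3, 4), analyzer='char'),
        n_jobs=1)
    tc = make_pipeline(text_features, LogisticRegression())
    tc.fit(newsgroups_train.data, newsgroups_train.target)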

I can get the script to work by replacing make_union() with the following hand-rolled union_make class:

import numpy as np
from scipy import sparse

class union_make:
    """Minimal stand-in for FeatureUnion: fit every part, then
    concatenate the transformed feature matrices horizontally."""
    def __init__(self, *args):
        self.parts = args

    def fit(self, X, y=None):
        for p in self.parts:
            p.fit(X, y)
        return self

    def transform(self, X, y=None):
        Xs = [p.transform(X) for p in self.parts]
        # Use a sparse hstack if any part returns a sparse matrix
        # (TfidfVectorizer does), otherwise a dense one.
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = np.hstack(Xs)
        return Xs

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
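With this drop-in replacement, the second block only changes at the union construction (same data and vectorizer parameters as above) and runs cleanly:

with parallel_backend('distributed', scheduler_host='scheduler:8786'):
    text_features = union_make(
        TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), analyzer='word'),
        TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(3, 4), analyzer='char'))
    tc = make_pipeline(text_features, LogisticRegression())
    tc.fit(newsgroups_train.data, newsgroups_train.target)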

0 Answers:

No answers yet