word2vec

Time: 2017-11-13 18:16:27

Tags: python scikit-learn nlp pipeline word2vec

I am trying to classify a set of text documents using multiple sets of features. I am using sklearn's FeatureUnion to combine the different features into a single model. One of the features is word embeddings produced with gensim's word2vec.

import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
data = fetch_20newsgroups(subset='train', categories=categories)#dummy dataset

w2v_model = Word2Vec(data.data, size=100, window=5, min_count=5, workers=2)
word2vec = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}  # dictionary of word embeddings
feat_select = SelectKBest(score_func=chi2, k=10)  # other features
TSVD = TruncatedSVD(n_components=50, algorithm="randomized", n_iter=5)
#other features

To include a transformer/estimator that is not yet available in sklearn, I am trying to wrap the word2vec results in a custom transformer class that returns the average of the word vectors.

class w2vTransformer(TransformerMixin):
    """
    Wrapper class for running word2vec into pipelines and FeatureUnions
    """
    def __init__(self, word2vec, **kwargs):
        self.word2vec = word2vec
        self.kwargs = kwargs
        self.dim = len(word2vec.values())

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

However, when it comes time to fit the model, I get an error.

combined_features = FeatureUnion([("w2v_class", w2vTransformer(word2vec)),
                                  ("feat", feat_select),
                                  ("TSVD", TSVD)])  # join features into combined_features
#combined_features = FeatureUnion([("feat",feat_select),("TSVD",TSVD)])  # runs when word embeddings are not included
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('feature_selection', combined_features),
                         ('clf-svm', SGDClassifier(loss="modified_huber")),
                         ])

text_clf_svm_1 = text_clf_svm.fit(data.data, data.target)  # fits data

text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
Traceback (most recent call last):

  File "<ipython-input-8-a085b7d40f8f>", line 1, in <module>
    text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
    Xt, fit_params = self._fit(X, y, **fit_params)

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit
    **fit_params_steps[name])

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__
    return self.func(*args, **kwargs)

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 739, in fit_transform
    for name, trans, weight in self._iter())

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
    result = ImmediateResult(func)

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
    self.results = batch()

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)

  File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py", line 520, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)

  File "<ipython-input-6-cbc52cd420cd>", line 16, in transform
    for words in X

  File "<ipython-input-6-cbc52cd420cd>", line 16, in <listcomp>
    for words in X

  File "<ipython-input-6-cbc52cd420cd>", line 14, in <listcomp>
    np.mean([self.word2vec[w] for w in words if w in self.word2vec]

TypeError: unhashable type: 'csr_matrix'

I understand that the error occurs because the variable "words" is a csr_matrix, whereas it needs to be an iterable such as a list. My question is: how do I modify the transformer class or the data so that I can feed the word embeddings as features to the FeatureUnion? This is my first SO post, please be gentle.
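To make the cause concrete, here is a small stand-alone illustration (the two documents are made up) of the kind of object the custom transformer receives at that point in the pipeline, since it sits after the CountVectorizer and TfidfTransformer steps:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["a tiny example document", "another short document"]  # made-up toy documents
counts = CountVectorizer().fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)

# Iterating over the tf-idf output yields sparse rows, not lists of tokens,
# which is exactly what the transform method above ends up looping over.
for row in tfidf:
    print(type(row))  # e.g. <class 'scipy.sparse.csr.csr_matrix'>; exact module path varies by scipy version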

1 Answer:

Answer 0 (score: 0)

Instead of writing a custom transformer, you can avoid it altogether by using the new scikit-learn API that Gensim provides directly: https://radimrehurek.com/gensim/sklearn_api/w2vmodel.html
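For illustration, a minimal sketch of that API (assuming gensim 3.x, where gensim.sklearn_api.W2VTransformer is available; the toy sentences below are made up):

from gensim.sklearn_api import W2VTransformer

sentences = [["graphics", "card", "driver"], ["space", "shuttle", "launch"]]  # toy tokenized corpus
w2v = W2VTransformer(size=10, min_count=1)
w2v.fit(sentences)                     # trains an internal Word2Vec model
vectors = w2v.transform(["graphics"])  # array of shape (1, 10) with that word's embedding

Note that W2VTransformer maps individual words to vectors, so to get one feature vector per document you would still average the word vectors yourself, e.g. with a small wrapper like the one in the question.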

Also, depending on your Gensim version, in my case I was able to fix the same error by indexing the word2vec object's wv attribute instead of indexing the object itself.

In the transform method of the w2vTransformer class, use:

self.word2vec.wv[w]

instead of

self.word2vec[w]
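
Put in context, the transform method would then look roughly like this (a sketch that assumes the transformer is constructed with the Word2Vec model itself rather than the dict of vectors built above, since only the model exposes a wv attribute):

    def transform(self, X):
        # Look tokens up on the model's wv attribute instead of indexing the model directly.
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])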

Hope it helps!