我正在尝试使用多组功能对一组文本文档进行分类。我正在使用sklearn's Feature Union将不同的功能组合到一个模型中。其中一项功能包括使用gensim's word2vec进行单词嵌入。
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
data = fetch_20newsgroups(subset='train', categories=categories)#dummy dataset
w2v_model= Word2Vec(data .data, size=100, window=5, min_count=5, workers=2)
word2vec={w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)} #dictionary of word embeddings
feat_select = SelectKBest(score_func=chi2, k=10) #other features
TSVD = TruncatedSVD(n_components=50, algorithm = "randomized", n_iter = 5)
#other features
为了包含sklearn中尚未提供的变换器/估计器,我试图将word2vec结果包装成一个返回向量平均值的自定义变换器类。
class w2vTransformer(TransformerMixin):
"""
Wrapper class for running word2vec into pipelines and FeatureUnions
"""
def __init__(self,word2vec,**kwargs):
self.word2vec=word2vec
self.kwargs=kwargs
self.dim = len(word2vec.values())
def fit(self,x, y=None):
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] for w in words if w in self.word2vec]
or [np.zeros(self.dim)], axis=0)
for words in X
])
然而,当需要适应模型时,我收到错误。
combined_features = FeatureUnion([("w2v_class",w2vTransformer(word2vec)),
("feat",feat_select),("TSVD",TSVD)])#join features into combined_features
#combined_features = FeatureUnion([("feat",feat_select),("TSVD",TSVD)])#runs when word embeddings are not included
text_clf_svm = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('feature_selection', combined_features),
('clf-svm', SGDClassifier( loss="modified_huber")),
])
text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
Traceback (most recent call last):
File "<ipython-input-8-a085b7d40f8f>", line 1, in <module>
text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 739, in fit_transform
for name, trans, weight in self._iter())
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py", line 520, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "<ipython-input-6-cbc52cd420cd>", line 16, in transform
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 16, in <listcomp>
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 14, in <listcomp>
np.mean([self.word2vec[w] for w in words if w in self.word2vec]
TypeError: unhashable type: 'csr_matrix'
Traceback (most recent call last):
File "<ipython-input-8-a085b7d40f8f>", line 1, in <module>
text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 739, in fit_transform
for name, trans, weight in self._iter())
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py", line 520, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "<ipython-input-6-cbc52cd420cd>", line 16, in transform
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 16, in <listcomp>
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 14, in <listcomp>
np.mean([self.word2vec[w] for w in words if w in self.word2vec]
TypeError: unhashable type: 'csr_matrix'
我理解错误是因为变量&#34;字&#34;是一个csr_matrix,但它需要是一个可迭代的,如列表。我的问题是如何修改变换器类或数据,以便我可以使用单词embeddings作为功能提供给FeatureUnion?这是我的第一篇SO帖子,请温柔。
答案 0 :(得分:0)
使用Gensim直接提供的新scikit-learn API,您可以避免使用自定义变换器代替自定义变换器! https://radimrehurek.com/gensim/sklearn_api/w2vmodel.html
此外,它取决于您的Gensim版本,但在我的情况下,我可以使用word2vec对象的wv属性解决相同的错误,而不是索引对象本身。
在w2vTransformer类的transform方法中:
self.word2vec.wv[w]
而不是
self.word2vec[w]
希望它有所帮助!