Combining SelectFromModel and word2vec in a pipeline

Asked: 2017-08-02 11:23:28

Tags: python-3.x gensim text-classification feature-selection

Based on this post: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ I am trying to use GloVe pre-trained vectors as a gensim word2vec-style model in a text classification task. However, I also want to run feature selection on my text data. I have tried several orderings in the pipeline, but I keep getting a memory error that points to the transform part of TfidfEmbeddingVectorizer:

    return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])
If I replace the TfidfEmbeddingVectorizer class with a regular TfidfVectorizer, everything works fine. Is there a way to combine SelectFromModel and word2vec in the pipeline?

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support as score, f1_score
import pickle
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
import gensim
import collections

class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return (X[self.column])




class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'REPORT_M': text}
                for text in posts]


class TfidfEmbeddingVectorizer(object):
  def __init__(self, word2vec):
    self.word2vec = word2vec
    self.word2weight = None
    self.dim = len(word2vec.values())

  def fit(self, X, y):
    tfidf = TfidfVectorizer(analyzer=lambda x: x)
    tfidf.fit(X)
    # if a word was never seen - it must be at least as infrequent
    # as any of the known words - so the default idf is the max of 
    # known idf's
    max_idf = max(tfidf.idf_)
    self.word2weight = collections.defaultdict(
        lambda: max_idf,
        [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

    return self

  def transform(self, X):
    return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])


# training model
def train(data_train, data_val):

    with open("glove.6B/glove.6B.50d.txt", "rb") as lines:
        w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
               for line in lines}
    classifier = Pipeline([
        ('union', FeatureUnion([

            ('text', Pipeline([
                ('selector', ItemSelector(column='TEXT')),
                ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
                ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False), threshold=0.01))
            ])),

            ('category', Pipeline([
                ('selector', ItemSelector(column='category')),
                ('stats', TextStats()),
                ('vect', DictVectorizer())
            ]))
        ])),
        ('clf', ExtraTreesClassifier(n_estimators=200, max_depth=500, min_samples_split=6, class_weight='balanced'))])

    classifier.fit(data_train,data_train.CLASSES)
    predicted = classifier.predict(data_val)

1 Answer:

Answer 0 (score: 0)

I think that here, in self.dim = len(word2vec.values()), you should be specifying the dimensionality of the model. If you are using glove.6B.50d.txt, the dimension should be 50.

len(word2vec.values()) is the total number of words in the vocabulary, so it produces a huge matrix, which is what causes the memory error.
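A minimal sketch of the corrected constructor (assuming w2v is a plain dict mapping each word to a fixed-length NumPy vector, as built from the GloVe file in the question):

    class TfidfEmbeddingVectorizer(object):
        def __init__(self, word2vec):
            self.word2vec = word2vec
            self.word2weight = None
            # dimension of a single embedding vector (50 for glove.6B.50d.txt),
            # not the vocabulary size that len(word2vec.values()) returns
            self.dim = len(next(iter(word2vec.values())))

With self.dim set this way, the np.zeros(self.dim) fallback in transform stays a 50-element vector rather than one entry per vocabulary word.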