Python Doc2Vec sklearn ValueError unknown

Date: 2018-04-15 14:58:57

Tags: python scikit-learn doc2vec

I am using doc2vec to train a model on tagged sentences, which can then be used for multiclass classification of other sentences.

I was able to vectorize the sentences, but I now get an error when I try to train the classification model.

  

ValueError: Unknown label type: 'unknown'

I am very new to this, but after searching other posts it looks like it is related to my y values not being an array. I don't know how to fix this, though. Can anyone suggest a solution?
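If the cause really is the type of my y values, I assume a quick check along these lines would confirm it (this is just a sketch of what I think I should verify, not part of my script):

# Hypothetical check, not in my script: inspect the dtype of the labels
# that eventually get passed to the classifier.
print(df["Score"].dtype)      # 'object' here would explain the error
print(df["Score"].head())

# If the dtype turns out to be object, I assume casting the labels to plain
# integers before the train/test split would give sklearn a label type it
# recognizes:
df["Score"] = df["Score"].astype(int)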

Here are the relevant parts of my code:



import pandas as pd
import numpy as np

np.random.seed(0)

def read_text_file(f):
    df_complete = pd.read_csv(f)
    df = df_complete.loc[: , ["Text", "Score"]]
    df.dropna(how = "any", inplace = True)
    return df

df = read_text_file("input/Reviews.csv")
print(df.head())

def sampling_dataset(df):
    count = 5000
    class_df_sampled = pd.DataFrame(columns = ["Score", "Text"])
    temp = []
    for c in df.Score.unique():
        class_indexes = df[df.Score == c].index
        random_indexes = np.random.choice(class_indexes, count, replace = False)
        temp.append(df.loc[random_indexes])

    for each_df in temp:
        class_df_sampled = pd.concat([class_df_sampled, each_df], axis = 0)

    return class_df_sampled

df = sampling_dataset(df)
df.reset_index(drop = True, inplace = True)
print(df.head())
print(df.shape)

from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re

lmtzr = WordNetLemmatizer()
w = re.compile(r"\w+", re.I)

def label_sentences(df):
    labeled_sentences = []
    for index, datapoint in df.iterrows():
        tokenized_words = re.findall(w, datapoint["Text"].lower())
        labeled_sentences.append(LabeledSentence(words = tokenized_words, tags = ['SENT_%s' % index]))
    return labeled_sentences

def train_doc2vec_model(labeled_sentences):
    model = Doc2Vec(alpha = 0.025, min_alpha = 0.025)
    model.build_vocab(labeled_sentences)
    for epoch in range(10):
        model.train(labeled_sentences, total_examples = 25000, epochs = 10)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

    return model

sen = label_sentences(df)
model = train_doc2vec_model(sen)

def vectorize_comments(df, d2v_model):
    y = []
    comments = []
    for i in range(0, df.shape[0]):
        label = 'SENT_%s' % i
        comments.append(d2v_model.docvecs[label])
    df['vectorized_comments'] = comments

    return df

df = vectorize_comments(df, model)
print(df.head(2))

from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings 

warnings.filterwarnings("ignore", category = DeprecationWarning)

import pickle

def train_classifier(X, y):
    n_estimators = [200, 400]
    min_samples_split = [2]
    min_samples_leaf = [1]
    bootstrap = [True]

    parameters = {
      'n_estimators': n_estimators,
      'min_samples_leaf': min_samples_leaf,
      'min_samples_split': min_samples_split
    }

    clf = GridSearchCV(RFC(verbose = 1, n_jobs = 4), cv = 4, param_grid = parameters)
    clf.fit(X, y)
    return clf

X_train, X_test, y_train, y_test = cross_validation.train_test_split(df['vectorized_comments'].T.tolist(), df['Score'], test_size = 0.02, random_state = 17)
classifier = train_classifier(X_train, y_train)
print(classifier.best_score_, "----------------Best Accuracy score on Cross Validation Sets")
print(classifier.score(X_test, y_test))


f = open("Output.txt", "w")
f.write("Best Accuracy score on Cross Validation Sets %f" % classifier.best_score_, )
f.write("Score on Test Set %f" % classifier.score(X_test, y_test))
f.close()




Here is the full stack trace:



Traceback (most recent call last):

  File "<ipython-input-4-a9ad2a977535>", line 1, in <module>
    runfile('C:/Users/user/.spyder-py3/multiclass doc2vec.py', wdir='C:/Users/user/.spyder-py3')

  File "C:\Users\user\Anaconda31\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
    execfile(filename, namespace)

  File "C:\Users\user\Anaconda31\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "C:/Users/user/.spyder-py3/multiclass doc2vec.py", line 105, in <module>
    classifier = train_classifier(X_train,y_train)

  File "C:/Users/user/.spyder-py3/multiclass doc2vec.py", line 101, in train_classifier
    clf.fit(X, y)

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\grid_search.py", line 838, in fit
    return self._fit(X, y, ParameterGrid(self.param_grid))

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\grid_search.py", line 574, in _fit
    for parameters in parameter_iterable

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
    result = ImmediateResult(func)

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
    self.results = batch()

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\cross_validation.py", line 1675, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\ensemble\forest.py", line 273, in fit
    y, expanded_class_weight = self._validate_y_class_weight(y)

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\ensemble\forest.py", line 471, in _validate_y_class_weight
    check_classification_targets(y)

  File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\utils\multiclass.py", line 172, in check_classification_targets
    raise ValueError("Unknown label type: %r" % y_type)

ValueError: Unknown label type: 'unknown'
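
For reference, here is a tiny standalone snippet that, I assume, reproduces the same error: an object-dtype label array (which I suspect is what my Score column becomes after the concat in sampling_dataset) makes sklearn report the label type as 'unknown', while casting it to int lets the fit go through:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(6, 4)
# Labels stored with object dtype, e.g. after concatenating into an empty DataFrame
y_object = np.array([1, 2, 3, 1, 2, 3], dtype=object)

clf = RandomForestClassifier(n_estimators=10)
try:
    clf.fit(X, y_object)              # ValueError: Unknown label type: 'unknown'
except ValueError as e:
    print(e)

clf.fit(X, y_object.astype(int))      # the same labels cast to int fit cleanly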

0 Answers:

There are no answers.