我使用doc2vec训练标记句子的模型,然后可以用于其他句子的多类分类。
我能够对句子进行矢量化,但是当我尝试训练模型时,我现在遇到错误。
ValueError:未知的标签类型:'未知'
我对此非常陌生,但在搜索其他帖子后,看起来它与我的y值不是一个数组有关。但我不知道如何解决这个问题。有人可以建议解决这个问题吗?
以下是我的代码的相关部分:
import pandas as pd
import numpy as np
np.random.seed(0)
def read_text_file(f):
df_complete = pd.read_csv(f)
df = df_complete.loc[: , ["Text", "Score"]]
df.dropna(how = "any", inplace = True)
return df
df = read_text_file("input/Reviews.csv")
print(df.head())
def sampling_dataset(df):
count = 5000
class_df_sampled = pd.DataFrame(columns = ["Score", "Text"])
temp = []
for c in df.Score.unique():
class_indexes = df[df.Score == c].index
random_indexes = np.random.choice(class_indexes, count, replace = False)
temp.append(df.loc[random_indexes])
for each_df in temp:
class_df_sampled = pd.concat([class_df_sampled, each_df], axis = 0)
return class_df_sampled
df = sampling_dataset(df)
df.reset_index(drop = True, inplace = True)
print(df.head())
print(df.shape)
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re
lmtzr = WordNetLemmatizer()
w = re.compile("\w+", re.I)
def label_sentences(df):
labeled_sentences = []
for index, datapoint in df.iterrows():
tokenized_words = re.findall(w, datapoint["Text"].lower())
labeled_sentences.append(LabeledSentence(words = tokenized_words, tags = ['SENT_%s' % index]))
return labeled_sentences
def train_doc2vec_model(labeled_sentences):
model = Doc2Vec(alpha = 0.025, min_alpha = 0.025)
model.build_vocab(labeled_sentences)
for epoch in range(10):
model.train(labeled_sentences, total_examples = 25000, epochs = 10)
model.alpha -= 0.002
model.min_alpha = model.alpha
return model
sen = label_sentences(df)
model = train_doc2vec_model(sen)
def vectorize_comments(df, d2v_model):
y = []
comments = []
for i in range(0, df.shape[0]):
label = 'SENT_%s' % i
comments.append(d2v_model.docvecs[label])
df['vectorized_comments'] = comments
return df
df = vectorize_comments(df, model)
print(df.head(2))
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
import pickle
def train_classifier(X, y):
n_estimators = [200, 400]
min_samples_split = [2]
min_samples_leaf = [1]
bootstrap = [True]
parameters = {
'n_estimators': n_estimators,
'min_samples_leaf': min_samples_leaf,
'min_samples_split': min_samples_split
}
clf = GridSearchCV(RFC(verbose = 1, n_jobs = 4), cv = 4, param_grid = parameters)
clf.fit(X, y)
return clf
X_train, X_test, y_train, y_test = cross_validation.train_test_split(df['vectorized_comments'].T.tolist(), df['Score'], test_size = 0.02, random_state = 17)
classifier = train_classifier(X_train, y_train)
print(classifier.best_score_, "----------------Best Accuracy score on Cross Validation Sets")
print(classifier.score(X_test, y_test))
f = open("Output.txt", "w")
f.write("Best Accuracy score on Cross Validation Sets %f" % classifier.best_score_, )
f.write("Score on Test Set %f" % classifier.score(X_test, y_test))
f.close()

这是完整的堆栈错误:
Traceback (most recent call last):
File "<ipython-input-4-a9ad2a977535>", line 1, in <module>
runfile('C:/Users/user/.spyder-py3/multiclass doc2vec.py', wdir='C:/Users/user/.spyder-py3')
File "C:\Users\user\Anaconda31\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\user\Anaconda31\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/user/.spyder-py3/multiclass doc2vec.py", line 105, in <module>
classifier = train_classifier(X_train,y_train)
File "C:/Users/user/.spyder-py3/multiclass doc2vec.py", line 101, in train_classifier
clf.fit(X, y)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\grid_search.py", line 838, in fit
return self._fit(X, y, ParameterGrid(self.param_grid))
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\grid_search.py", line 574, in _fit
for parameters in parameter_iterable
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\cross_validation.py", line 1675, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\ensemble\forest.py", line 273, in fit
y, expanded_class_weight = self._validate_y_class_weight(y)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\ensemble\forest.py", line 471, in _validate_y_class_weight
check_classification_targets(y)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\utils\multiclass.py", line 172, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'unknown'
&#13;