我想知道 scikit-learn 的 TfidfVectorizer() 及其 fit_transform / transform 方法是否已经内置了文本预处理,例如转换为小写、词形还原、删除标点符号。我正在使用 IMDb 评论数据集来预测评论是正面还是负面。最初我没有做任何预处理,最近才开始加上。使用 SGDClassifier 时,我的准确率是 84%,但无论是否对训练/测试数据集做预处理,结果似乎都一样,这是为什么?
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for *word*.

    The word is tagged on its own with ``nltk.pos_tag``; only the first
    letter of the resulting tag (upper-cased) is inspected:

    * ``J`` -> ``wordnet.ADJ``
    * ``N`` -> ``wordnet.NOUN``
    * ``V`` -> ``wordnet.VERB``
    * ``R`` -> ``wordnet.ADV``

    Any other leading letter falls back to ``wordnet.NOUN``.
    """
    # pos_tag expects a token list; a one-element list gives one (token, tag) pair.
    tagged = nltk.pos_tag([word])
    full_tag = tagged[0][1]
    initial = full_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to a noun.
    return wordnet.NOUN
# Module-level WordNetLemmatizer instance; prepText() uses it for every token.
lemmatizer = WordNetLemmatizer()
def prepText(reviewText):
    """Clean one raw review and return it as a lowercase, lemmatized string.

    Steps, in order: strip HTML markup, tokenize, lemmatize each token with
    its part of speech, join the lemmas with single spaces, lowercase, and
    delete every character listed in ``string.punctuation``.

    Args:
        reviewText: Raw review text, possibly containing HTML markup.

    Returns:
        The cleaned review as a single string.
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (results can differ between machines)
    # and emits a warning on every call.
    soup = BeautifulSoup(reviewText, "html.parser")
    plain_text = soup.get_text()

    # Tag each token with its original casing, but lemmatize the lowercased
    # form: the WordNet lookup is case-sensitive, so a capitalised token
    # would otherwise be returned unchanged. A single POS-aware pass is
    # enough; re-lemmatizing the output as nouns only risks mangling it.
    lemmas = [
        lemmatizer.lemmatize(token.lower(), get_wordnet_pos(token))
        for token in nltk.word_tokenize(plain_text)
    ]

    # Keep the final lower() so the result is lowercase regardless of what
    # the lemmatizer returns.
    cleaned = " ".join(lemmas).lower()
    return cleaned.translate(str.maketrans("", "", string.punctuation))
if __name__ == '__main__':
    # NOTE(review): X_train, X_test, y_train and y_test are not defined in
    # the visible part of the file; they must exist before this point.

    # One pool serves both splits, so the 12 workers are started only once.
    with Pool(12) as p:
        X_train = p.map(prepText, X_train)
        X_test = p.map(prepText, X_test)

    # Preview just the first ten cleaned reviews rather than dumping
    # everything after them.
    print(X_train[:10])

    print("Vectorizing")
    # With its default settings TfidfVectorizer already lowercases the text
    # and its token pattern drops punctuation, so of the steps in prepText
    # only HTML stripping and lemmatization change the resulting features.
    vectorizer = TfidfVectorizer()
    # Fit the vocabulary/IDF weights on the training split only, then reuse
    # them unchanged for the test split.
    train_vectors = vectorizer.fit_transform(X_train)
    test_vectors = vectorizer.transform(X_test)
    print(train_vectors.shape, test_vectors.shape)

    clf = SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=200,
        tol=None,
    ).fit(train_vectors, y_train)
    predicted = clf.predict(test_vectors)
    print(metrics.classification_report(y_test, predicted))