使用 CountVectorizer 时,如何把测试集的注释文本与预测标签一起打印出来

时间:2019-07-09 03:15:10

标签: python machine-learning nltk sentiment-analysis countvectorizer

我正在运行以下代码,基于两列数据 Notes(注释文本)和 Sentiment(情感分数,范围为 0-4;在代码中对应 NOTES 和 SCORE 列)对注释进行情感分析。我希望在得到预测结果的同时,也能得到对应的注释文本。有人可以帮忙吗?

我尝试过直接打印,但输出的只有数字。

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier




from subprocess import check_output

# Read the raw notes and keep just the text and score columns, both as strings.
data = pd.read_csv(r"notes.csv")
data = data[['NOTES', 'SCORE']]
data['NOTES'] = data['NOTES'].astype(str)
data['SCORE'] = data['SCORE'].astype(str)

# Show how many notes fall under each score value.
score_counts = data['SCORE'].groupby(data['SCORE']).count()
print(score_counts)

# --- Bag-of-words features ---
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of ASCII letters/digits, so punctuation and other symbols
# are dropped while words and numbers are kept.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    tokenizer=token.tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
)
text_counts = cv.fit_transform(data['NOTES'])

# Hold out 30% of the rows for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, data['SCORE'], random_state=1, test_size=0.3)

# --- Multinomial Naive Bayes on the raw term counts ---
from sklearn.naive_bayes import MultinomialNB
# metrics supplies accuracy_score for the evaluation below.
from sklearn import metrics

clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
nb_accuracy = metrics.accuracy_score(y_test, predicted)
print("MultinomialNB Accuracy:", nb_accuracy)



# --- TF-IDF features ---
# Same classifier as above, but trained on TF-IDF weights instead of raw counts,
# and finishing by printing each test note next to its predicted score.

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
# NOTE: the vectorizer is fitted on every row, including the rows that end up
# in the test split below.
text_tf = tf.fit_transform(data['NOTES'])


from sklearn.model_selection import train_test_split
# Split the raw note text together with the features and labels. All arrays
# are shuffled with the same permutation, so notes_test[i] is the text whose
# features are X_test[i]; the sparse matrix itself carries no text.
X_train, X_test, y_train, y_test, notes_train, notes_test = train_test_split(
    text_tf, data['SCORE'], data['NOTES'], test_size=0.3, random_state=123)

print(X_test)
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)

# `predicted` has one entry per *test* row, so it cannot be assigned to the
# full frame directly (length mismatch). Align it on the test rows' index
# labels; rows that were used for training get NaN.
data['Prediction'] = pd.Series(predicted, index=notes_test.index)
print("MultinomialNB Accuracy USING TF-IDF:",metrics.accuracy_score(y_test, predicted))

# Pair every test note with its true and predicted score. Indexing X_test with
# the predicted labels is invalid; the text comes from notes_test.
results = pd.DataFrame(
    {'NOTES': notes_test, 'SCORE': y_test, 'Prediction': predicted},
    index=notes_test.index,
)
print(results[['NOTES', 'Prediction']].to_string(index=False))

期望的输出如下:每条注释文本及其预测分数,例如注释为“您好,这很好”,预测分数为“4”。

0 个答案:

没有答案