我正在做情感分析,已经在数据集中把每条数据标注为正面或负面。现在需要先把文本转换为小写,然后把数据拆分为训练集和测试集。
# -*- coding: utf-8 -*-
"""
Created on Wed May 22 03:30:04 2019
@author: bilal
"""
import pandas as pd
#import numpy as np
"""read data"""
# Load the annotated dataset.
datatrain = pd.read_csv('Book2.csv', encoding='latin-1')
#datatest = pd.read_csv('testdata.manual.2009.06.14.csv', encoding='latin-1')
print(datatrain.head())

# Sentiment label -> integer code.
classes = {"neg": 1, "pos": 2}
C = classes.copy()
#classes={"Negative":0,"Neutral":2,"Positive":1}

# NOTE(review): columns are addressed by the labels 1 and 2.  With the default
# header handling of read_csv the column labels come from the first CSV row,
# so this only works if that row literally names the columns 1 and 2 --
# confirm, or load with header=None / rename the columns.

# Lower-case the text column in one vectorized step.  Assigning the whole
# column avoids the chained `datatrain[1][i] = ...` writes, which pandas does
# not guarantee to propagate back to the frame, and does not rely on the
# index being 0..n-1.
datatrain[1] = datatrain[1].str.lower()

# Fail loudly on labels missing from the mapping (a plain dict lookup would
# raise KeyError as well) instead of letting .map() turn them into NaN.
unknown_labels = set(datatrain[2].unique()) - set(C)
if unknown_labels:
    raise KeyError("unmapped sentiment labels: %r" % (sorted(unknown_labels, key=str),))
datatrain[2] = datatrain[2].map(C)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Features are the texts (column 1) and targets are the sentiment labels
# (column 2).  Passing the text column for both would make the "labels"
# identical to the inputs.
x_train, x_test, y_train, y_test = train_test_split(
    datatrain[1], datatrain[2], test_size=0.2, random_state=50)

# Learn the TF-IDF vocabulary from the training texts only, so that it matches
# the column actually being transformed and no test-set statistics leak into
# the features.
Tfidf_matrix = TfidfVectorizer(max_features=5000)
Tfidf_matrix.fit(x_train)
x_train_Tfidf = Tfidf_matrix.transform(x_train)
x_test_Tfidf = Tfidf_matrix.transform(x_test)

# Fit the encoder once on the full label column and reuse it for both splits;
# re-fitting on y_test could assign different integer codes to the same label.
Encoder = LabelEncoder()
Encoder.fit(datatrain[2])
y_train = Encoder.transform(y_train)
y_test = Encoder.transform(y_test)
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score