I have written the following Python code for sentiment analysis of movie reviews:
import re
import nltk
from multiprocessing import Pool
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
# input: a list l of strings
# output: the same list with every word lemmatized
def lemmatize(l):
    lmtzr = WordNetLemmatizer()
    for i in xrange(0, len(l)):
        words = l[i].split()
        # lemmatize each word first as a verb, then with the default noun POS
        words = [lmtzr.lemmatize(lmtzr.lemmatize(w, 'v')) for w in words]
        l[i] = " ".join(words)
    return l
# input: a list l of strings
# output: the same list with every word stemmed
def stem(l):
    stmr = PorterStemmer()
    for i in xrange(0, len(l)):
        words = l[i].split()
        stemmed = [stmr.stem(w) for w in words]
        l[i] = " ".join(stemmed)
    return l
# input: a list l of strings
# output: the same list, lowercased and with stopwords removed
def removeStopwords(l):
    stops = set(stopwords.words("english"))
    for i in xrange(0, len(l)):
        words = l[i].lower().split()
        meaningful = [w for w in words if w not in stops]
        l[i] = " ".join(meaningful)
    return l
# input: a list l of strings
# output: a matrix where the (i,j) component is how many times
# the j-th word appears in the i-th document
def tf(l):
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None)
    return vectorizer.fit_transform(l).toarray()
# input: a list l of strings
# output: a matrix where the (i,j) component is the tf-idf value
# of the j-th word in the i-th document
def tfidf(l):
    tf_ = tf(l)
    vectorizer = TfidfVectorizer(smooth_idf=False)
    vectorizer.fit_transform(l)
    # scikit-learn adds 1 to every idf value, so subtract it back out; see
    # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
    idf = vectorizer.idf_ - 1
    return tf_ * idf
# add any additional preprocessing you find helpful
def additional(l):
    result = []
    # your code goes here...
    return result
# input: a list l of strings
# output: a feature-matrix-like object ready for training
# (2-D list, numpy array, or sparse matrix)
# you may choose to use the subset of the previous functions that works best for you
def preprocess(l):
    removeStopwords(l)
    lemmatize(l)
    # stem(l)
    print('preprocess done')
    return l
# train_X: feature matrix for training
# train_t: list of labels for training
# val_X: feature matrix for validation
# val_t: list of labels for validation
# just print out your results, no need to return any value
def sec2c(train_X, train_t, val_X, val_t):
    cvalues = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('LOGREG result:')
    for c in cvalues:
        logreg = LogisticRegression(C=c)
        a = logreg.fit(train_X, train_t).score(val_X, val_t)
        print(a, c)
    cvalues = [0.001, 0.01, 0.1, 1, 10, 100]
    print('SVM result:')
    for c in cvalues:
        svm = LinearSVC(C=c)
        a = svm.fit(train_X, train_t).score(val_X, val_t)
        print(a, c)
    print('NB result:')
    # replace zero entries with a large constant before fitting GaussianNB
    array = np.asarray(train_X)
    array[array == 0] = 1e9
    train_X = array.tolist()
    array = np.asarray(val_X)
    array[array == 0] = 1e9
    val_X = array.tolist()
    nb = GaussianNB()
    a = nb.fit(train_X, train_t).score(val_X, val_t)
    print(a)
# train_X, val_X: feature matrices for training and validation
# train_t, val_t: lists of labels for training and validation
# tf: True when the features are raw term frequencies, False for tf-idf
def sec2di(train_X, train_t, val_X, val_t, tf=False):
    if tf:
        print('Using TF')
    else:
        print('Using TF-IDF')
    cvalues = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('LOGREG result:')
    for c in cvalues:
        logreg = LogisticRegression(C=c)
        a = logreg.fit(train_X, train_t).score(val_X, val_t)
        print(a, c)
# input train_text, vali_text, test_text: each a list of strings
# train_labels, vali_labels: each a list of labels
def useWord2vec(train_text, train_labels, vali_text, vali_labels, test_text):
    # from gensim.models import Word2Vec
    # merge your texts here
    # train your word2vec here
    # train your classifiers here
    return 0
# input: doc, a list of raw lines; text and label, output lists filled in place
# labeled lines carry a 3-character sentiment prefix ('+' means positive)
def parse(doc, text, label, test=False):
    if test:
        for sentence in doc:
            review = BeautifulSoup(sentence, "html.parser").get_text()
            if len(review) > 0:
                letters_only = re.sub("[^a-zA-Z]", " ", review)
                text.append(letters_only)
    else:
        for sentence in doc:
            review = BeautifulSoup(sentence, "html.parser").get_text()
            if len(review) > 0:
                if review[0:1] == '+':
                    label.append(1)
                else:
                    label.append(-1)
                review = review[3:]
                letters_only = re.sub("[^a-zA-Z]", " ", review)
                text.append(letters_only)
def main():
    # read data and extract texts and labels
    pool = Pool(processes=3)
    train = open('small_train.txt', 'r')
    trainSentences = re.split(r'\n', train.read())
    trainLabel = []
    trainText = []
    valid = open('small_valid.txt', 'r')
    validSentences = re.split(r'\n', valid.read())
    validLabel = []
    validText = []
    test = open('small_test.txt', 'r')
    testSentences = re.split(r'\n', test.read())
    testLabel = []
    testText = []
    parse(trainSentences, trainText, trainLabel)
    print('parsed train')
    parse(validSentences, validText, validLabel)
    print('parsed valid')
    parse(testSentences, testText, testLabel, test=True)
    print('parsed test')
    # do preprocessing; Pool.map runs preprocess in worker processes, so the
    # in-place edits are lost unless the returned lists are captured
    trainText, validText, testText = pool.map(preprocess, [trainText, validText, testText])
    # ts = tfidf(trainText)
    # train the model
    # make predictions and save them
    return 0
if __name__ == '__main__':
    main()
But I get the following error:
Traceback (most recent call last):
  File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py", line 261, in <module>
    main()
  File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py", line 222, in main
    valid = open('small_valid.txt', 'r')
IOError: [Errno 2] No such file or directory: 'small_valid.txt'
Can you help me fix this?
Answer 0 (score: 1)
The error is clear: No such file or directory: 'small_valid.txt'. Either move the data files into this directory:

C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master

or update the following lines to use an absolute path (a raw string keeps the backslashes from being read as escape sequences):

train = open(r'C:\..path_to_file..\small_train.txt', 'r')
valid = open(r'C:\..path_to_file..\small_valid.txt', 'r')
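
If you would rather not hard-code a path, here is a minimal sketch (assuming small_train.txt, small_valid.txt, and small_test.txt sit next to main.py) that resolves the files relative to the script itself, so it works no matter which working directory the script is launched from:

import os

# folder that contains main.py, independent of the current working directory
base_dir = os.path.dirname(os.path.abspath(__file__))

train = open(os.path.join(base_dir, 'small_train.txt'), 'r')
valid = open(os.path.join(base_dir, 'small_valid.txt'), 'r')
test = open(os.path.join(base_dir, 'small_test.txt'), 'r')

The IOError is raised because open() with a relative name looks in the process's current working directory, which is not necessarily the folder containing the script.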