I have written the following Python code for sentiment analysis of movie reviews:
import re
import nltk
from multiprocessing import Pool
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
# input: a list l of strings
# output: the same list with every word lemmatized
def lemmatize(l):
    lmtzr = WordNetLemmatizer()
    for i in xrange(0, len(l)):
        words = l[i].split()
        # lemmatize each word first as a verb, then with the default noun POS
        words = [lmtzr.lemmatize(lmtzr.lemmatize(w, 'v')) for w in words]
        l[i] = " ".join(words)
    return l
# input: a list l of strings
# output: the same list with every word stemmed
def stem(l):
    stmr = PorterStemmer()
    for i in xrange(0, len(l)):
        words = l[i].split()
        stemmed = [stmr.stem(w) for w in words]
        l[i] = " ".join(stemmed)
    return l
# input: a list l of strings
# output: the same list, lowercased and with stopwords removed
def removeStopwords(l):
    stops = set(stopwords.words("english"))
    for i in xrange(0, len(l)):
        words = l[i].lower().split()
        meaningful = [w for w in words if w not in stops]
        l[i] = " ".join(meaningful)
    return l
# input: a list l of strings
# output: a matrix where the (i,j) component is how many times
# the j-th word appears in the i-th document
def tf(l):
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None)
    return vectorizer.fit_transform(l).toarray()
# input: a list l of strings
# output: a matrix where the (i,j) component is the tf-idf value
# of the j-th word in the i-th document
def tfidf(l):
    tf_ = tf(l)
    vectorizer = TfidfVectorizer(smooth_idf=False)
    vectorizer.fit_transform(l)
    # scikit-learn adds 1 to every idf value, so subtract it back out; see
    # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
    idf = vectorizer.idf_ - 1
    return tf_ * idf
# add any additional preprocessing you find helpful
def additional(l):
    result = []
    # your code goes here...
    return result
# input: a list l of strings
# output: a feature-matrix-like object ready for training
# (2-D list, numpy array, or sparse matrix)
# you may choose to use the subset of the previous functions that works best for you
def preprocess(l):
    removeStopwords(l)
    lemmatize(l)
    # stem(l)
    print('preprocess done')
    return l
# train_X: feature matrix for training
# train_t: list of labels for training
# val_X: feature matrix for validation
# val_t: list of labels for validation
# just print out your results, no need to return any value
def sec2c(train_X, train_t, val_X, val_t):
    cvalues = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('LOGREG result:')
    for c in cvalues:
        logreg = LogisticRegression(C=c)
        a = logreg.fit(train_X, train_t).score(val_X, val_t)
        print(a, c)
    cvalues = [0.001, 0.01, 0.1, 1, 10, 100]
    print('SVM result:')
    for c in cvalues:
        svm = LinearSVC(C=c)
        a = svm.fit(train_X, train_t).score(val_X, val_t)
        print(a, c)
    print('NB result:')
    # replace zero entries with a large constant before fitting GaussianNB
    array = np.asarray(train_X)
    array[array == 0] = 1e9
    train_X = array.tolist()
    array = np.asarray(val_X)
    array[array == 0] = 1e9
    val_X = array.tolist()
    nb = GaussianNB()
    a = nb.fit(train_X, train_t).score(val_X, val_t)
    print(a)
# train_X, val_X: feature matrices for training and validation
# train_t, val_t: lists of labels for training and validation
# tf: True when the features are raw term frequencies, False for tf-idf
def sec2di(train_X, train_t, val_X, val_t, tf=False):
    if tf:
        print('Using TF')
    else:
        print('Using TF-IDF')
    cvalues = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('LOGREG result:')
    for c in cvalues:
        logreg = LogisticRegression(C=c)
        a = logreg.fit(train_X, train_t).score(val_X, val_t)
        print(a, c)
# input train_text, vali_text, test_text: each a list of strings
# train_labels, vali_labels: each a list of labels
def useWord2vec(train_text, train_labels, vali_text, vali_labels, test_text):
    # from gensim.models import Word2Vec
    # merge your texts here
    # train your word2vec here
    # train your classifiers here
    return 0
# input: doc, a list of raw lines; text and label, output lists filled in place
# labeled lines carry a 3-character sentiment prefix ('+' means positive)
def parse(doc, text, label, test=False):
    if test:
        for sentence in doc:
            review = BeautifulSoup(sentence, "html.parser").get_text()
            if len(review) > 0:
                letters_only = re.sub("[^a-zA-Z]", " ", review)
                text.append(letters_only)
    else:
        for sentence in doc:
            review = BeautifulSoup(sentence, "html.parser").get_text()
            if len(review) > 0:
                if review[0:1] == '+':
                    label.append(1)
                else:
                    label.append(-1)
                review = review[3:]
                letters_only = re.sub("[^a-zA-Z]", " ", review)
                text.append(letters_only)
def main():
    # read data and extract texts and labels
    pool = Pool(processes=3)
    train = open('small_train.txt', 'r')
    trainSentences = re.split(r'\n', train.read())
    trainLabel = []
    trainText = []
    valid = open('small_valid.txt', 'r')
    validSentences = re.split(r'\n', valid.read())
    validLabel = []
    validText = []
    test = open('small_test.txt', 'r')
    testSentences = re.split(r'\n', test.read())
    testLabel = []
    testText = []
    parse(trainSentences, trainText, trainLabel)
    print('parsed train')
    parse(validSentences, validText, validLabel)
    print('parsed valid')
    parse(testSentences, testText, testLabel, test=True)
    print('parsed test')
    # do preprocessing; Pool.map runs preprocess in worker processes, so the
    # in-place edits are lost unless the returned lists are captured
    trainText, validText, testText = pool.map(preprocess, [trainText, validText, testText])
    # ts = tfidf(trainText)
    # train the model
    # make predictions and save them
    return 0
if __name__ == '__main__':
    main()
But I get the following error:
Traceback (most recent call last):
  File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py", line 261, in <module>
    main()
  File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py", line 222, in main
    valid = open('small_valid.txt', 'r')
IOError: [Errno 2] No such file or directory: 'small_valid.txt'
Can you help me fix this?
Answer 0 (score: 1)
The error is clear: No such file or directory: 'small_valid.txt'. Either move the data files into this directory:

C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master

or update the following lines to use an absolute path (a raw string keeps the backslashes from being read as escape sequences):

train = open(r'C:\..path_to_file..\small_train.txt', 'r')
valid = open(r'C:\..path_to_file..\small_valid.txt', 'r')
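
If you would rather not hard-code a path, here is a minimal sketch (assuming small_train.txt, small_valid.txt, and small_test.txt sit next to main.py) that resolves the files relative to the script itself, so it works no matter which working directory the script is launched from:

import os

# folder that contains main.py, independent of the current working directory
base_dir = os.path.dirname(os.path.abspath(__file__))

train = open(os.path.join(base_dir, 'small_train.txt'), 'r')
valid = open(os.path.join(base_dir, 'small_valid.txt'), 'r')
test = open(os.path.join(base_dir, 'small_test.txt'), 'r')

The IOError is raised because open() with a relative name looks in the process's current working directory, which is not necessarily the folder containing the script.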