NLTK朴素贝叶斯分类器:情感分析中的特征选择不正确

时间:2014-09-29 10:29:59

标签: python nltk

我一直在重复使用以下代码,但在输出的“最具信息量的特征”(Most Informative Features)中,我得到的特征标记有误。您是否认为这是我(自制)语料库中的数据编码问题?

import csv
import nltk
from nltk.classify.util import apply_features
from nltk.corpus import stopwords
import math
import re
import sys
import os
import codecs
# Python 2-only workaround: makes UTF-8 the interpreter-wide default codec
# used for implicit str/unicode conversions. reload(sys) has to come first
# because Python 2 removes sys.setdefaultencoding during startup.
# NOTE(review): neither reload() as a builtin nor sys.setdefaultencoding
# exists on Python 3 -- these two lines must be dropped when porting.
reload(sys)
sys.setdefaultencoding('utf-8')

# Words that carry no sentiment and are dropped from every tweet. They are
# matched case-insensitively (see stopword_set below).
customstopwords = ['show', 'they', 'them','He','She','We','i','are','this','the','so','to','me','for','and','was','in','as','about']

# Loads the sentiment files; every line is treated as one tweet. The context
# managers close the file handles as soon as the lines have been read.
with open('Positivetweets50.txt', 'r') as p:
    postxt = p.readlines()

with open('Negativetweets50.txt', 'r') as n:
    negtxt = n.readlines()

# One label per tweet, so each label list is as long as its tweet list.
neglist = ['negative'] * len(negtxt)
poslist = ['positive'] * len(postxt)

# Pairs every tweet with its sentiment: (tweet_text, label).
# list() keeps the results concatenable even where zip() returns an iterator.
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))

# Appends all the tagged tweets to a common list.
taggedtweets = postagged + negtagged

print(taggedtweets)

# Tokens are lowercased below, so the stopwords are lowercased as well;
# otherwise capitalized entries such as 'He' could never match.
stopword_set = set(stopword.lower() for stopword in customstopwords)

# Creates a list of (tokens, sentiment) pairs: each tweet is split on
# whitespace, lowercased, and stripped of the custom stopwords.
tweets = []
for (text, sentiment) in taggedtweets:
    word_filter = [token.lower() for token in text.split()]
    word_filter = [token for token in word_filter if token not in stopword_set]
    tweets.append((word_filter, sentiment))

#Pulls out all the words in a list of tagged tweets.
def getwords(tweets):
    """Return every word of every tagged tweet as one flat list.

    tweets is an iterable of (words, sentiment) pairs. The sentiment is
    ignored; words keep their input order and duplicates are preserved.
    """
    return [token for (tokens, _label) in tweets for token in tokens]

#uses nltk library to order the list of tweets words pulled out by their frequency.
def getwordfeatures(listoftweets):
    """Return the distinct words of *listoftweets*, most frequent first.

    Args:
        listoftweets: flat iterable of words (duplicates expected).

    Returns:
        A list of the distinct words sorted by decreasing frequency.
    """
    wordfreq = nltk.FreqDist(listoftweets)
    # Sort explicitly instead of relying on keys(): only NLTK 2 returned the
    # keys in decreasing-frequency order. In NLTK 3 FreqDist is a
    # collections.Counter subclass, so keys() carries no frequency ordering
    # (and is a view, not a list, on Python 3).
    return sorted(wordfreq, key=wordfreq.get, reverse=True)

# Calls the above functions once to build the vocabulary (the distinct words
# found in the tweets), keeps it as the feature word list, and shows it.
wordlist = getwordfeatures(getwords(tweets))

print(wordlist)

def feature_extractor(doc, vocabulary=None):
    """Map a tokenized document to boolean 'contains(word)' features.

    Args:
        doc: iterable of the document's tokens.
        vocabulary: iterable of words to emit a feature for. When omitted,
            the module-level ``wordlist`` is used, which keeps the
            single-argument call form working unchanged.

    Returns:
        A dict with one 'contains(<word>)' key per vocabulary word, whose
        value is True when that word occurs in *doc* and False otherwise.
    """
    if vocabulary is None:
        vocabulary = wordlist
    # A set makes each membership test O(1) instead of scanning the document.
    docwords = set(doc)
    return dict(('contains(%s)' % word, word in docwords) for word in vocabulary)

# Creates the training set: apply_features maps feature_extractor over the
# tweets, yielding (feature_dict, sentiment) pairs for the classifier.
training_set = nltk.classify.util.apply_features(feature_extractor, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print only emitted a stray "None" line after the table.
classifier.show_most_informative_features(n=1000)
# NOTE: this is the accuracy on the very data the classifier was trained on,
# so it is an optimistic figure and says little about unseen tweets.
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, training_set))

输出:

Most Informative Features
           contains(tom) = True           negati : positi =      1.0 : 1.0
        contains(thrown) = True           negati : positi =      1.0 : 1.0
     contains("""joined) = True           negati : positi =      1.0 : 1.0
         contains(tokyo) = True           negati : positi =      1.0 : 1.0
 contains(@christophery) = True           negati : positi =      1.0 : 1.0
         contains(won't) = True           negati : positi =      1.0 : 1.0
contains("""@edisonneil) = True           negati : positi =      1.0 : 1.0
     contains(husband's) = True           negati : positi =      1.0 : 1.0
        contains(come!!) = True           negati : positi =      1.0 : 1.0
       contains(hair!!!) = True           negati : positi =      1.0 : 1.0
    contains(accountant) = True           negati : positi =      1.0 : 1.0
       contains(giggles) = True           negati : positi =      1.0 : 1.0
        contains(bigger) = True           negati : positi =      1.0 : 1.0
         contains(that?) = True           negati : positi =      1.0 : 1.0
        contains(they'd) = True           negati : positi =      1.0 : 1.0
 contains("""@jerinelay) = True           negati : positi =      1.0 : 1.0
      contains(launched) = True           negati : positi =      1.0 : 1.0
          contains(nina) = True           negati : positi =      1.0 : 1.0
           contains(htc) = True           negati : positi =      1.0 : 1.0
         contains(hmmmm) = True           negati : positi =      1.0 : 1.0
   contains("""@chele76) = True           negati : positi =      1.0 : 1.0
        contains(buying) = True           negati : positi =      1.0 : 1.0
       contains(teaches) = True           negati : positi =      1.0 : 1.0
        contains(heaven) = True           negati : positi =      1.0 : 1.0
          contains(old!) = True           negati : positi =      1.0 : 1.0
      contains(flipping) = True           negati : positi =      1.0 : 1.0
           contains(cal) = True           negati : positi =      1.0 : 1.0
     contains(roosevelt) = True           negati : positi =      1.0 : 1.0
           contains(wat) = True           negati : positi =      1.0 : 1.0
         contains(tribe) = True           negati : positi =      1.0 : 1.0
           contains(be!) = True           negati : positi =      1.0 : 1.0
    contains("""amazing) = True           negati : positi =      1.0 : 1.0
        contains(stairs) = True           negati : positi =      1.0 : 1.0
      contains(podcasts) = True           negati : positi =      1.0 : 1.0
         contains(pound) = True           negati : positi =      1.0 : 1.0
   contains(tomorrow...) = True           negati : positi =      1.0 : 1.0
       contains(months!) = True           negati : positi =      1.0 : 1.0
          contains(wana) = True           negati : positi =      1.0 : 1.0
        contains(impact) = True           negati : positi =      1.0 : 1.0
        contains(texted) = True           negati : positi =      1.0 : 1.0
       contains(vampire) = True           negati : positi =      1.0 : 1.0
contains("""@dionrodrigues) = True           negati : positi =      1.0 : 1.0
          contains(kind) = True           negati : positi =      1.0 : 1.0
       contains(sheesh.) = True           negati : positi =      1.0 : 1.0
     contains(pictures.) = True           negati : positi =      1.0 : 1.0
        contains(breeze) = True           negati : positi =      1.0 : 1.0
    contains(@amrosario) = True           negati : positi =      1.0 : 1.0
        contains(wells.) = True           negati : positi =      1.0 : 1.0
          contains(gave) = True           negati : positi =      1.0 : 1.0
         contains(soul.) = True           negati : positi =      1.0 : 1.0
          contains(addy) = True           negati : positi =      1.0 : 1.0
       contains(soooooo) = True           negati : positi =      1.0 : 1.0
        contains("""@j") = True           negati : positi =      1.0 : 1.0
           contains(coz) = True           negati : positi =      1.0 : 1.0
         contains(quick) = True           negati : positi =      1.0 : 1.0
          contains(did.) = True           negati : positi =      1.0 : 1.0
        contains(humor.) = True           negati : positi =      1.0 : 1.0
       contains(@b_club) = True           negati : positi =      1.0 : 1.0
contains("""@julieunplugged) = True           negati : positi =      1.0 : 1.0
          contains(fire) = True           negati : positi =      1.0 : 1.0
       contains(@angusi) = True           negati : positi =      1.0 : 1.0
          contains(bff.) = True           negati : positi =      1.0 : 1.0
         contains(page.) = True           negati : positi =      1.0 : 1.0
       contains(took""") = True           negati : positi =      1.0 : 1.0
      contains(returned) = True           negati : positi =      1.0 : 1.0
        contains(hello!) = True           negati : positi =      1.0 : 1.0
    contains(friday!!!!) = True           negati : positi =      1.0 : 1.0
     contains(creepy""") = True           negati : positi =      1.0 : 1.0
   contains(farewell""") = True           negati : positi =      1.0 : 1.0
     contains(awsome""") = True           negati : positi =      1.0 : 1.0
        contains(late..) = True           negati : positi =      1.0 : 1.0
   contains(@calmbanana) = True           negati : positi =      1.0 : 1.0
          contains(huge) = True           negati : positi =      1.0 : 1.0
        contains(window) = True           negati : positi =      1.0 : 1.0
      contains(complete) = True           negati : positi =      1.0 : 1.0
     contains(question?) = True           negati : positi =      1.0 : 1.0
       contains(from""") = True           negati : positi =      1.0 : 1.0
       contains("""baby) = True           negati : positi =      1.0 : 1.0
        contains(right.) = True           negati : positi =      1.0 : 1.0
     contains(delicious) = True           negati : positi =      1.0 : 1.0
     contains(unreal""") = True           negati : positi =      1.0 : 1.0
         contains(voted) = True           negati : positi =      1.0 : 1.0
        contains(@bk_ii) = True           negati : positi =      1.0 : 1.0
contains(@coolcatteacher) = True           negati : positi =      1.0 : 1.0
    contains(assessment) = True           negati : positi =      1.0 : 1.0
     contains(malaysian) = True           negati : positi =      1.0 : 1.0
     contains(french""") = True           negati : positi =      1.0 : 1.0
     contains(definitly) = True           negati : positi =      1.0 : 1.0
    contains("""@tvorse) = True           negati : positi =      1.0 : 1.0
  contains(m&amp""""""") = True           negati : positi =      1.0 : 1.0
contains("""@lewisstanson) = True           negati : positi =      1.0 : 1.0
       contains(warm""") = True           negati : positi =      1.0 : 1.0
   contains(@chrishealy) = True           negati : positi =      1.0 : 1.0
        contains(@_dznr) = True           negati : positi =      1.0 : 1.0
  contains(@awesomekong) = True           negati : positi =      1.0 : 1.0
        contains(broken) = True           negati : positi =      1.0 : 1.0
          contains(get!) = True           negati : positi =      1.0 : 1.0
          contains(some) = True           negati : positi =      1.0 : 1.0
       contains(friends) = True           negati : positi =      1.0 : 1.0
       contains(ipod""") = True           negati : positi =      1.0 : 1.0
contains("""@jlsofficial) = True           negati : positi =      1.0 : 1.0
       contains(@dayngr) = True           negati : positi =      1.0 : 1.0
     contains("""headed) = True           negati : positi =      1.0 : 1.0
           contains(:-p) = True           negati : positi =      1.0 : 1.0
None
accuracy: 1.0

语料库:https://www.dropbox.com/s/rh2bykig7eh1zq6/Positivetweets50.txt?dl=0 和 https://www.dropbox.com/s/bvy2libmen57n25/Negativetweets50.txt?dl=0

任何帮助将不胜感激。

1 个答案:

答案 0 :(得分:0)

尝试:

from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain

# Labeled example sentences, grouped by sentiment and then flattened into
# (sentence, tag) pairs: all 'pos' examples first, followed by the 'neg' ones.
_positive_sentences = [
    'I love this sandwich.',
    'This is an amazing place!',
    'I feel very good about these beers.',
    'This is my best work.',
    "What an awesome view",
]
_negative_sentences = [
    'I do not like this restaurant',
    'I am tired of this stuff.',
    "I can't deal with this",
    'He is my sworn enemy!',
    'My boss is horrible.',
]
training_data = (
    [(sentence, 'pos') for sentence in _positive_sentences]
    + [(sentence, 'neg') for sentence in _negative_sentences]
)

# Vocabulary: every distinct lowercased token of the training sentences.
vocabulary = set(chain.from_iterable(
    word_tokenize(sentence.lower()) for sentence, _tag in training_data))


def _featurize(sentence):
    """Return {word: bool} telling, per vocabulary word, if *sentence* has it.

    The sentence is lowercased and tokenized exactly once, and the tokens are
    kept in a set so each vocabulary lookup is a constant-time membership test.
    """
    tokens = set(word_tokenize(sentence.lower()))
    return dict((word, word in tokens) for word in vocabulary)


# One (features, tag) pair per training sentence.
feature_set = [(_featurize(sentence), tag) for sentence, tag in training_data]

classifier = nbc.train(feature_set)

# Words outside the vocabulary (unseen during training) yield no feature.
test_sentence = "This is the best band I've ever heard!"
featurized_test_sentence = _featurize(test_sentence)

print("test_sent: %s" % test_sentence)
print("tag: %s" % classifier.classify(featurized_test_sentence))