NLTK preprocessing of "stop words / URLs" not written correctly in the code

Time: 2016-03-28 00:59:41

Tags: python nltk preprocessor stop-words

I am doing a sentiment analysis of Twitter and the program is almost finished, but I ran into some problems with the preprocessing. I am trying to filter the given file by first removing the "stop words" and then removing the hashtags and URLs from the phrases. Could someone tell me whether the filtering of the stop words and of the hashtags/URLs is correct, and whether it really removes and filters them from any given phrase? This is my whole program:

import nltk
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

short_pos = open("short_reviews/positive.txt","r").read()
short_neg = open("short_reviews/negative.txt","r").read()

all_words = []
documents = []


allowed_word_types = ["J"]

for p in short_pos.split('\n'):
    documents.append( (p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())


for p in short_neg.split('\n'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

stop_words = set(stopwords.words("english"))
filtered_sentence = []
for w in all_words:
    if w not in stop_words:
        filtered_sentence.append(w)
print (filtered_sentence)

filtered_documents = open("pickled_algos/fil_documents.pickle","wb")
pickle.dump(filtered_sentence, filtered_documents)
filtered_documents.close()

#remove the words that start with #
filter(lambda x:x[0]!='#', documents.split())

#remove URL
filter(lambda x:x[0]!='https://www.', documents.split())

save_documents = open("pickled_algos/documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()

all_words = nltk.FreqDist(all_words)


word_features = list(all_words.keys())[:5000]


save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()


def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

save_featuresets = open("pickled_algos/featuresets.pickle", "wb")
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()

random.shuffle(featuresets)

testing_set = featuresets[10000:]
training_set = featuresets[:10000]


classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)


def realsenti(text):
    feats = find_features(text)

    return voted_classifier.classify(feats),text

This is the part that I would really like to see do its job:

stop_words = set(stopwords.words("english"))
filtered_sentence = []
for w in all_words:
    if w not in stop_words:
        filtered_sentence.append(w)
print (filtered_sentence)

filtered_documents = open("pickled_algos/fil_documents.pickle","wb")
pickle.dump(filtered_sentence, filtered_documents)
filtered_documents.close()

#remove the words that start with #
filter(lambda x:x[0]!='#', documents.split())

#remove URL
filter(lambda x:x[0]!='https://www.', documents.split())

because every time I get this error:

filter(lambda x:x[0]!='#', documents.split())
AttributeError: 'list' object has no attribute 'split'
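
I suspect this happens because documents is a list of (text, label) tuples, and a list has no split() method; on top of that, filter() in Python 3 returns a lazy iterator, and its result is never assigned to anything, so nothing would actually be removed. Below is a rough sketch of what I think the hashtag/URL step would have to look like instead, applied to the text of each tuple (the startswith('http') check is just my guess for spotting URLs), but I am not sure it is right:

# documents holds (text, label) tuples, so clean each text separately
cleaned_documents = []
for text, label in documents:
    kept = [tok for tok in text.split()
            if not tok.startswith('#')        # drop hashtags
            and not tok.startswith('http')]   # drop http/https URLs
    cleaned_documents.append((" ".join(kept), label))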

I hope everything is clear, and I hope someone can point out any further mistakes to me. Thanks in advance.
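
For the stop-word part specifically, a quick sanity check on a single made-up sentence is probably the easiest way to see whether the filter does anything, something like:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

sample = "this is just an example tweet about the movie"   # made-up sample sentence
stop_words = set(stopwords.words("english"))
filtered = [w for w in word_tokenize(sample) if w.lower() not in stop_words]
print(filtered)   # should keep only content words such as 'example', 'tweet', 'movie'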

0 Answers:

There are no answers yet.