CSV file error and stop-word removal with naive Bayes

Posted: 2016-03-03 21:32:34

Tags: python-2.7 csv nltk

Hello, can someone help me with this code? This is the terminal error:

    File "bayes1.py", line 74, in <module>
        print classifier.classify(extract_features(tweet.split()))
    AttributeError: 'list' object has no attribute 'split'
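For reference, the failure is easy to reproduce: tweet_dict() returns a plain Python list, and .split() exists only on individual strings, not on the list itself:

    >>> tweet = ['first tweet', 'second tweet']  # what tweet_dict() returns
    >>> tweet.split()
    AttributeError: 'list' object has no attribute 'split'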

Could you also create a function that removes stop words stored in a .txt file? Thank you very much.
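A minimal sketch of such a stop-word filter follows. It assumes the stop words are stored one per line in a plain-text file; the filename stopwords.txt is a placeholder, not something from the original post:

    def load_stopwords(fname):
        '''Read a stop-word list from a .txt file, one word per line.'''
        stopwords = set()
        f = open(fname)
        for line in f:
            stopwords.add(line.strip().lower())
        f.close()
        return stopwords

    def remove_stopwords(words, stopwords):
        '''Return only the words that are not in the stop-word set.'''
        return [w for w in words if w.lower() not in stopwords]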

Here is the full script:
    import nltk
    import csv
    from nltk.classify.naivebayes import NaiveBayesClassifier
    import sys


    twitterData = sys.argv[1]  # tweet input file (.csv)

    def tweet_dict(twitterData):
        ''' (file) -> list of str
        Read the .csv file and return a list holding the
        first column (the tweet text) of every row.
        '''
        twitter_list_dict = []
        twitterfile = open(twitterData)
        twitterreader = csv.reader(twitterfile)
        for line in twitterreader:
            twitter_list_dict.append(line[0])
        twitterfile.close()
        return twitter_list_dict


    def get_words_in_tweets(tweets):
        # Flatten the (words, sentiment) pairs into a single word list.
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        # The vocabulary of the training tweets becomes the feature set.
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    def read_tweets(fname, t_type):
        # Read one tweet per line and label each one with t_type.
        tweets = []
        f = open(fname, 'r')
        line = f.readline()
        while line != '':
            tweets.append([line, t_type])
            line = f.readline()
        f.close()
        return tweets

    def extract_features(document):
        # One boolean feature per vocabulary word: present or not.
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    # read in positive and negative training tweets
    pos_tweets = read_tweets('amazon_positive.txt', 'positive')
    neg_tweets = read_tweets('amazon_negative.txt', 'negative')

    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))


    # extract the word features out from the training data
    word_features = get_word_features(get_words_in_tweets(tweets))

    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    classifier = NaiveBayesClassifier.train(training_set)
    classifier.show_most_informative_features()

    # tweet_dict() returns a list of strings, so classify each tweet
    # individually; calling .split() on the list itself is what raised
    # the AttributeError in the question.
    for tweet in tweet_dict(twitterData):
        print classifier.classify(extract_features(tweet.split()))
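To answer the stop-word part of the question as well, the filter could slot into the training-data loop like this (again assuming the hypothetical stopwords.txt file and the load_stopwords() helper sketched earlier):

    stopwords = load_stopwords('stopwords.txt')  # hypothetical filename
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split()
                          if len(e) >= 3 and e.lower() not in stopwords]
        tweets.append((words_filtered, sentiment))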

0 Answers:

No answers