Ascii编解码器无法解码字节0xc2 python nltk

时间:2017-03-18 12:32:14

标签: python-2.7 ascii nltk stemming

我有一段用于垃圾邮件分类的代码,它运行良好,但每当我尝试对单词进行词干提取(stem)或词形还原(lemmatize)时,都会收到以下错误:

File "/Users/Ramit/Desktop/Bayes1/src/filter.py", line 16, in trim_word
    word = ps.stem(word)

File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 664, in stem
    stem = self._step1a(stem)

File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 289, in _step1a

if word.endswith('ies') and len(word) == 4:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)

这是我的代码:

    from word import Word
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    # Module-level stemmer instance; Filter.trim_word calls ps.stem() on each token.
    ps = PorterStemmer()
    class Filter(object):
        """SMS spam filter driven by per-word ham/spam statistics.

        ``self.words`` maps each normalised token (see ``trim_word``) to the
        ``Word`` object that accumulates its counts and probability.
        """

        def __init__(self):
            # token -> Word instance for every token seen during training
            self.words = dict()

        def trim_word(self, word):
            """Return ``word`` decoded, stripped of edge punctuation, lowercased
            and stemmed.

            Only leading/trailing characters from a fixed punctuation set are
            removed; non-alphabetic characters inside the token are kept on
            purpose.
            """
            # Lines read from a file in Python 2 are byte strings. The stemmer
            # compares the token against unicode literals, which forces an
            # implicit ASCII decode and raises UnicodeDecodeError on bytes such
            # as 0xc2. Decode explicitly up front instead.
            # NOTE(review): assumes the input files are UTF-8 -- confirm;
            # undecodable bytes are replaced rather than raising.
            if isinstance(word, bytes):
                word = word.decode('utf-8', 'replace')
            word = word.strip(' .:,-!()"?+<>*')
            word = word.lower()
            word = ps.stem(word)
            return word

        def train(self, train_file):
            """Build ``self.words`` from a tab-separated training file.

            Only odd-numbered lines (1-based) are read. Each is expected to be
            ``<category>\\t<text>`` with category ``ham`` or ``spam``. After all
            lines are consumed, ``compute_probability`` is called on every
            collected ``Word`` with the total ham and spam word counts.
            """
            ham_words = 0
            spam_words = 0
            stop = set(stopwords.words('english'))

            # Loop through all the lines
            for lineNumber, line in enumerate(train_file, 1):
                if lineNumber % 2 == 0:
                    continue

                fields = line.split('\t')
                if len(fields) < 2:
                    # No tab separator: report it instead of failing with an
                    # IndexError on fields[1].
                    print("Not valid training file format")
                    continue
                category = fields[0]
                input_words = fields[1].strip().split(' ')

                # Loop through all the words in the line, remove some characters
                for input_word in input_words:
                    input_word = self.trim_word(input_word)
                    if input_word == "" or input_word in stop:
                        continue

                    # Check if word is in dictionary, else add
                    if input_word in self.words:
                        word = self.words[input_word]
                    else:
                        word = Word(input_word)
                        self.words[input_word] = word

                    # Check whether the word is in a ham or spam sentence,
                    # increment counters
                    if category == "ham":
                        word.increment_ham()
                        ham_words += 1
                    elif category == "spam":
                        word.increment_spam()
                        spam_words += 1
                    else:
                        # Probably bad training file input...
                        print("Not valid training file format")

            # Compute the probability for each word in the training set
            for word in self.words:
                self.words[word].compute_probability(ham_words, spam_words)

        def get_interesting_words(self, sms):
            """Return up to 15 ``Word`` objects from ``sms``, most interesting first.

            Tokens are normalised with ``trim_word``; empty tokens and English
            stop words are dropped. A token not seen in training gets a fresh
            ``Word`` with probability 0.40. Ordering uses ``Word.interesting()``
            in descending order.
            """
            interesting_words = []
            stop = set(stopwords.words('english'))
            # Go through all words in the SMS and append to list.
            # If we have not seen the word in training, assign probability of 0.4
            for input_word in sms.split(' '):
                input_word = self.trim_word(input_word)
                if input_word == "" or input_word in stop:
                    continue
                if input_word in self.words:
                    word = self.words[input_word]
                else:
                    word = Word(input_word)
                    word.set_probability(0.40)
                interesting_words.append(word)

            # Sort the list of interesting words, return top 15 elements if
            # the list is longer than 15
            interesting_words.sort(key=lambda word: word.interesting(), reverse=True)
            return interesting_words[:15]

        def filter(self, input_file, result_file):
            """Classify every odd-numbered line of ``input_file`` as SPAM or HAM.

            For each such line the combined spam probability of its interesting
            words is computed; above 0.8 the line is written to ``result_file``
            prefixed with ``SPAM: ``, otherwise with ``HAM: ``. If scoring
            fails, ``error`` is written instead. A newline is written after
            every input line, odd or even.
            """
            # Loop through all SMSes and compute total spam probability of the
            # sms-message
            lineNumber = 0
            for sms in input_file:
                lineNumber += 1
                if lineNumber % 2 != 0:
                    spam_product = 1.0
                    ham_product = 1.0
                    try:
                        for word in self.get_interesting_words(sms):
                            spam_product *= word.get_probability()
                            ham_product *= (1.0 - word.get_probability())

                        # Both products can be 0.0, making this a division by zero.
                        sms_spam_probability = spam_product / (spam_product + ham_product)
                    except Exception:
                        # Best effort: record the failure and move on. The
                        # verdict is only written in the else-branch, so a
                        # failed message is never labelled using an undefined
                        # or stale probability.
                        result_file.write("error")
                    else:
                        if sms_spam_probability > 0.8:
                            result_file.write("SPAM: " + sms)
                        else:
                            result_file.write("HAM: " + sms)
                result_file.write("\n")

我只是想找到一种解决方案,让我能够对这些单词进行词干提取或词形还原。我在网上搜索过,也找到了类似的问题,但那些解决方法对我都不起作用。

1 个答案:

答案 0 :(得分:0)

使用sys

# Python 2 only. site.py removes sys.setdefaultencoding at startup, so
# reload(sys) has to run first to restore it; calling it before the reload
# raises AttributeError.
# NOTE: this changes the implicit str<->unicode codec for the whole process.
# Decoding the input explicitly (e.g. word.decode('utf-8')) is the safer fix.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')