我有一段用于垃圾邮件分类的代码,它运行良好,但每当我尝试对单词进行词干提取(stemming)或词形还原(lemmatization)时,都会收到此错误:
文件“/Users/Ramit/Desktop/Bayes1/src/filter.py”,第16行,在trim_word中 word = ps.stem(word)
文件“/Library/Python/2.7/site-packages/nltk/stem/porter.py”,第664行,在stem中 stem = self._step1a(stem)
文件“/Library/Python/2.7/site-packages/nltk/stem/porter.py”,第289行,在_step1a
if word.endswith('ies') and len(word) == 4:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)
这是我的代码:
from word import Word
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance; Filter.trim_word calls ps.stem() on each token.
ps = PorterStemmer()
class Filter(object):
    """SMS spam filter that scores messages from per-word spam probabilities.

    ``self.words`` maps each normalised token (trimmed, lower-cased, stemmed)
    to the ``Word`` object that accumulates its ham/spam counts during
    training and later supplies its probability when messages are scored.
    """

    def __init__(self):
        # Normalised token -> Word instance.
        self.words = dict()
        # English stop-word set; loaded on first use and reused afterwards so
        # the corpus is not re-read for every message.
        self._stop_words = None

    def _get_stop_words(self):
        """Return the English stop-word set, loading it on first use."""
        if self._stop_words is None:
            self._stop_words = set(stopwords.words('english'))
        return self._stop_words

    def trim_word(self, word):
        """Normalise a raw token: strip punctuation, lower-case and stem it.

        Only the listed leading/trailing characters are stripped; other
        non-alphabetic characters are deliberately kept.

        Byte strings are decoded as UTF-8 before stemming. Under Python 2 a
        token read from a file is a byte string, and handing non-ASCII bytes
        (e.g. 0xc2) to the stemmer triggers an implicit ASCII decode that
        raises UnicodeDecodeError. Undecodable bytes are replaced rather than
        raising.
        """
        if isinstance(word, bytes):
            word = word.decode('utf-8', 'replace')
        word = word.strip(' .:,-!()"?+<>*')
        word = word.lower()
        return ps.stem(word)

    def train(self, train_file):
        """Count ham/spam word occurrences from ``train_file``.

        Only odd-numbered lines (1-based) are read. Each is expected to look
        like ``<category>\\t<text>`` with category ``ham`` or ``spam``. A line
        without a tab is reported as invalid and skipped instead of raising
        IndexError. After all lines are read, every known word computes its
        probability from the total ham and spam word counts.
        """
        ham_words = 0
        spam_words = 0
        stop = self._get_stop_words()

        for line_number, line in enumerate(train_file, 1):
            if line_number % 2 == 0:
                continue

            fields = line.split('\t')
            if len(fields) < 2:
                print("Not valid training file format")
                continue

            category = fields[0]
            for input_word in fields[1].strip().split(' '):
                input_word = self.trim_word(input_word)
                if input_word == "" or input_word in stop:
                    continue

                # Reuse the existing Word for this token, or register a new one.
                word = self.words.get(input_word)
                if word is None:
                    word = Word(input_word)
                    self.words[input_word] = word

                # Increment the counter matching the line's category.
                if category == "ham":
                    word.increment_ham()
                    ham_words += 1
                elif category == "spam":
                    word.increment_spam()
                    spam_words += 1
                else:
                    # Probably bad training file input...
                    print("Not valid training file format")

        # Compute the probability for each word in the training set.
        for word in self.words.values():
            word.compute_probability(ham_words, spam_words)

    def get_interesting_words(self, sms):
        """Return at most 15 Word objects for ``sms``, most interesting first.

        Tokens that are empty after trimming or are stop words are ignored.
        Words not seen during training get a fresh ``Word`` with probability
        0.40. The list is ordered by ``Word.interesting()`` descending.
        """
        stop = self._get_stop_words()
        interesting_words = []

        for input_word in sms.split(' '):
            input_word = self.trim_word(input_word)
            if input_word == "" or input_word in stop:
                continue

            word = self.words.get(input_word)
            if word is None:
                word = Word(input_word)
                word.set_probability(0.40)
            interesting_words.append(word)

        interesting_words.sort(key=lambda word: word.interesting(), reverse=True)
        return interesting_words[0:15]

    def filter(self, input_file, result_file):
        """Classify each odd-numbered line of ``input_file`` as SPAM or HAM.

        For every processed message the combined spam probability is
        ``spam_product / (spam_product + ham_product)`` over its interesting
        words; above 0.8 the message is written as ``SPAM: ``, otherwise as
        ``HAM: ``, followed by a newline.

        If scoring a message fails, ``error`` and a newline are written and
        processing continues with the next message. The failed message is not
        classified, so no undefined or stale probability from a previous
        message is ever used.
        """
        for line_number, sms in enumerate(input_file, 1):
            if line_number % 2 == 0:
                continue

            try:
                spam_product = 1.0
                ham_product = 1.0
                for word in self.get_interesting_words(sms):
                    probability = word.get_probability()
                    spam_product *= probability
                    ham_product *= (1.0 - probability)
                sms_spam_probability = spam_product / (spam_product + ham_product)
            except Exception:
                # Best effort: report the failure for this message and move on.
                result_file.write("error")
                result_file.write("\n")
                continue

            if sms_spam_probability > 0.8:
                result_file.write("SPAM: " + sms)
            else:
                result_file.write("HAM: " + sms)
            result_file.write("\n")
我只是在寻找一种解决方案,让我能够对这些单词进行词干提取或词形还原。我在网上搜索过,也找到了类似的问题,但那些解决方法对我都不起作用。
答案 0 :(得分:0)
使用 sys 模块:
# Python 2 only. The site module removes sys.setdefaultencoding at startup,
# so sys has to be reloaded first to make the function available again.
# NOTE: this changes implicit str/unicode conversion for the whole process.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')