I want to tokenize my text and create a file containing the tokenized words, with stopwords removed, for sentiment analysis. I am trying the following code, but it raises an error. The code is:
import nltk
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
stopset = set(stopwords.words('english'))
with open('Grey.txt', 'r') as text_file, open('step3.txt', 'w') as outFile:
    text = text_file.read()
    tokens = word_tokenize(str(text))
    tokens = [w for w in tokens if not w in stopset]
    print(tokens)
    outFile.write(str(tokens))
    outFile.close()
and the error is:
(C:\Users\sama\Anaconda2) C:\Users\sama\Anaconda2\Amazon Project>python sw.py
Traceback (most recent call last):
File "sw.py", line 15, in <module>
tokens=word_tokenize(str(text))
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\__init__.py",
line 109, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\__init__.py",
line 94, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 311, in _pair_iter
for el in it:
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1291, in _slices_from_text
if self.text_contains_sentbreak(context):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1337, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 1472, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 310, in _pair_iter
prev = next(it)
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 577, in _annotate_first_pass
for aug_tok in tokens:
File "C:\Users\sama\Anaconda2\lib\site-packages\nltk\tokenize\punkt.py",
line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 12: ordinal not in range(128)
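The byte 0xe2 suggests the file contains non-ASCII characters (for example curly quotes), which Python 2 then tries to decode with the default ASCII codec inside the Punkt tokenizer. A minimal sketch of one possible fix, assuming Grey.txt is UTF-8 encoded, is to read and write the files with an explicit encoding so the tokenizer receives unicode text rather than raw bytes:

# -*- coding: utf-8 -*-
import io
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))

# Assumption: Grey.txt is UTF-8; io.open decodes it to unicode explicitly,
# so word_tokenize never falls back to the ASCII codec.
with io.open('Grey.txt', 'r', encoding='utf-8') as text_file, \
        io.open('step3.txt', 'w', encoding='utf-8') as out_file:
    text = text_file.read()
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stopset]
    out_file.write(u' '.join(tokens))

If the file is in some other encoding (e.g. latin-1 or cp1252), the encoding argument would need to match it.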