from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords
corpus_root = 'fradulentemail'
newcorpus = PlaintextCorpusReader(corpus_root,'.*')
sentence = newcorpus.sents()[:50]
all_words = [[word.lower() for word in text] for text in sentence]
for stopword in nltk.corpus.stopwords.words('english'):
if stopword in all_words:
del all_words[stopword]
此外,执行newcorpus.raw()给我带来错误也给我带来了错误,因为“'utf-8'编解码器无法解码位置118396处的字节0x80:无效的起始字节”。请帮助
答案 0 :(得分:1)
如果仔细查看all_words = [[word.lower() for word in text] for text in sentence]
,您会发现all_words
是一个列表列表。将代码更改为如下所示,它应该可以正常工作:
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
corpus_root = 'fradulentemail'
newcorpus = PlaintextCorpusReader(corpus_root,'.*')
sentences = newcorpus.sents()[:50]
all_non_stop_words = [[word.lower() for word in text if word.lower() not in stop_words] for text in sentences]
答案 1 :(得分:0)
在
del all_words[stopword]
您已经在all_word中使用了停用词的索引,而不是停用词本身 试试这个
stopwords=nltk.corpus.stopwords.words('english')
all_words=[ word for word in all_words if word not in stopwords]
print(all_words)