您好,我对 Python 和编程完全陌生。我正在按照上面链接中的教程,查找文本文件中特定单词的搭配词(collocates)。我想知道,在下面的脚本创建搭配词列表之前,是否有办法先从整个文本文件中删除停用词?
我尝试了以下代码,但它不起作用:
%ls *.txt
targetText = "Hume Treatise.txt"
with open(targetText, "r") as f:
theText = f.read()
print("This string has", "{:,}".format(len(theText)), "characters")
import re
theTokens = re.findall(r'\b\w[\w-]*\b', theText.lower())
print(theTokens[:10])
wrd2find = input("What word do you want collocates for?")
context = 5
end = len(theTokens)
counter = 0
theCollocates = []
for word in theTokens:
if word == wrd2find:
for i in range(context):
if (counter - (i + 1)) >= 0:
theCollocates.append(theTokens[(counter - (i + 1))])
if (counter + (i + 1)) < end:
theCollocates.append(theTokens[(counter + (i + 1))])
counter = counter + 1
print(theCollocates[:10])
import nltk
tokenDist = nltk.FreqDist(theCollocates)
tokenDist.tabulate(10)
from nltk.corpus import stopwords
filtered_words = [word for word in word_list if word not in
stopwords.words('english')]
import matplotlib
%matplotlib inline
tokenDist.plot(25, title="Top Frequency Collocates for " +
wrd2find.capitalize())