我的本地磁盘上有大量 txt 文件,我想对它们的内容进行分词(tokenize)和词干提取(stem)。下面的脚本在只处理 1 个文档时可以正常工作,但大多数在线 NLTK 教程都没有说明:如果需要遍历本地文件夹中的多个 txt 文件(即拥有一个文档语料库),应该如何处理。我确信这只是一个简单的循环问题,但我不知道如何对所有文件进行词干提取。
import nltk
import string
from collections import Counter
def get_tokens(path='/Users/myname/Desktop/text.txt'):
    """Read a text file and return its lower-cased, punctuation-free tokens.

    Args:
        path: Location of the text file to read. Defaults to the single
            file the script was originally written for, so existing
            zero-argument calls keep working; pass another path to process
            a different document (e.g. when looping over a folder).

    Returns:
        The result of ``nltk.word_tokenize`` applied to the file's text
        after lower-casing it and removing every character listed in
        ``string.punctuation``.
    """
    with open(path, 'r') as source:
        text = source.read()

    # Filter character by character instead of calling
    # ``translate(None, string.punctuation)``: that two-argument form only
    # exists for Python 2 byte strings and raises TypeError on text strings.
    punctuation = set(string.punctuation)
    no_punctuation = ''.join(
        ch for ch in text.lower() if ch not in punctuation
    )
    return nltk.word_tokenize(no_punctuation)
# Count every token of the document and show the 10 most frequent ones.
tokens = get_tokens()
count = Counter(tokens)
# Parenthesised form: valid on Python 3 and prints the same on Python 2.
print(count.most_common(10))
from nltk.corpus import stopwords
# Count tokens again after discarding English stop words.
tokens = get_tokens()
# Load the stop-word list once and keep it in a set: evaluating
# stopwords.words('english') inside the comprehension would re-read the
# list for every token and test membership by linear scan.
english_stopwords = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in english_stopwords]
count = Counter(filtered)
# Parenthesised form: valid on Python 3 and prints the same on Python 2.
print(count.most_common(100))
from nltk.stem.porter import *
def stem_tokens(tokens, stemmer):
    """Apply ``stemmer.stem`` to every token.

    Args:
        tokens: Iterable of tokens to stem.
        stemmer: Object exposing a ``stem(token)`` method.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in the
        same order as the input.
    """
    return [stemmer.stem(item) for item in tokens]
# Stem the stop-word-filtered tokens and show the 100 most frequent stems.
stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
count = Counter(stemmed)
# Parenthesised form: valid on Python 3 and prints the same on Python 2.
print(count.most_common(100))