以下是代码:
import nltk
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
path = '/opt/datacourse/data/parts'
token_dict = {}
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
stemmed = []
for item in tokens:
stemmed.append(stemmer.stem(item))
return stemmed
def tokenize(text):
tokens = nltk.word_tokenize(text)
stems = stem_tokens(tokens, stemmer)
return stems
for subdir, dirs, files in os.walk(path):
for file in files:
file_path = subdir + os.path.sep + file
shakes = open(file_path, 'r')
text = shakes.read()
lowers = text.lower()
no_punctuation = lowers.translate(None, string.punctuation)
token_dict[file] = no_punctuation
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())
跑完后,原来是:
File "D:\Python27\lib\site-packages\sklearn\feature_extraction\text.py", line 751, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
根据他人的说法'回复,我已经chencked text.py并确认min_def = 1
中的_init_
有谁能告诉我这是什么问题?非常感谢。