我正在尝试使用Python的Tfidf转换文本语料库。但是,当我尝试进行fit_transform转换时,出现以下错误ValueError: empty vocabulary; perhaps the documents only contain stop words.
DataFrame
,infoDF
看起来像这样:
Index Text
0 sob shortness of breath normal coronary artery admission for coronary angiogram
这是我要运行的代码:
class FeatureExtractor(TfidfVectorizer):
def __init__(self):
self.min_occur = 1
self.tfidf = TfidfVectorizer(ngram_range=(1, 3))
self.vocab = Counter()
#Learn a vocabulary dictionary of all tokens in the raw documents
def fit(self, dataFrame, y = None):
statements = pd.Series(dataFrame)
print(statements)
self.vocab = Counter()
for statement in statements:
self.vocab.update(statement)
tokens = [k for k,c in self.vocab.items() if c >= self.min_occur]
statements = statements.apply(lambda x: [w for w in x if w in tokens])
statements = statements.apply(lambda x: ' '.join(x))
statements = list(statements.values)
self.tfidf.fit(statements)
return self
def fit_transform(self, dataFrame, y = None):
self.fit(dataFrame)
return self.transform(self.dataFrame)
def transform(self, dataFrame):
statements = pd.Series(dataFrame)
tokens = [k for k,c in self.vocab.items() if c >= self.min_occur]
statements = statements.apply(lambda x: [w for w in x if w in tokens])
statements = list(statements.values)
features = self.tfidf.transform(statements)
return features
def main():
featuresExtractor = FeatureExtractor()
features= featuresExtractor.fit_transform(infoDF.loc[:,'Text'])
这是回溯:
Traceback (most recent call last):
File "<ipython-input-79-e4052f1aa786>", line 1, in <module>
features = featuresExtractor.fit_transform(infoDF.loc[:,'Text'])
File "<ipython-input-77-f968d76e1eb2>", line 26, in fit_transform
self.fit(dataFrame)
File "<ipython-input-77-f968d76e1eb2>", line 20, in fit
self.tfidf.fit(statements)
File "C:\Users\as\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1592, in fit
X = super(TfidfVectorizer, self).fit_transform(raw_documents)
File "C:\Users\as\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1031, in fit_transform
self.fixed_vocabulary_)
File "C:\Users\as\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 962, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words