Question

我正在尝试使用Python的Tfidf转换文本语料库。但是，当我尝试进行fit_transform转换时，出现以下错误ValueError: empty vocabulary; perhaps the documents only contain stop words. DataFrame，infoDF看起来像这样：

Index     Text
0         sob shortness of breath normal coronary artery admission for coronary angiogram

这是我要运行的代码：

class FeatureExtractor(TfidfVectorizer):
    def __init__(self):
        self.min_occur = 1
        self.tfidf = TfidfVectorizer(ngram_range=(1, 3))     
        self.vocab = Counter()

    #Learn a vocabulary dictionary of all tokens in the raw documents
    def fit(self, dataFrame, y = None):
        statements = pd.Series(dataFrame)
        print(statements)
        self.vocab = Counter()
        for statement in statements:
            self.vocab.update(statement)
        tokens = [k for k,c in self.vocab.items() if c >= self.min_occur]
        statements = statements.apply(lambda x: [w for w in x if w in tokens])
        statements = statements.apply(lambda x: ' '.join(x))
        statements = list(statements.values)
        self.tfidf.fit(statements)
        return self

    def fit_transform(self, dataFrame, y = None):
        self.fit(dataFrame)
        return self.transform(self.dataFrame)

    def transform(self, dataFrame):
        statements = pd.Series(dataFrame)
        tokens = [k for k,c in self.vocab.items() if c >= self.min_occur]

        statements = statements.apply(lambda x: [w for w in x if w in tokens])
        statements = list(statements.values)
        features = self.tfidf.transform(statements)
        return features
def main():
    featuresExtractor = FeatureExtractor()
    features= featuresExtractor.fit_transform(infoDF.loc[:,'Text'])

这是回溯：

Traceback (most recent call last):

  File "<ipython-input-79-e4052f1aa786>", line 1, in <module>
    features = featuresExtractor.fit_transform(infoDF.loc[:,'Text'])

  File "<ipython-input-77-f968d76e1eb2>", line 26, in fit_transform
    self.fit(dataFrame)

  File "<ipython-input-77-f968d76e1eb2>", line 20, in fit
    self.tfidf.fit(statements)

  File "C:\Users\as\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1592, in fit
    X = super(TfidfVectorizer, self).fit_transform(raw_documents)

  File "C:\Users\as\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1031, in fit_transform
    self.fixed_vocabulary_)

  File "C:\Users\as\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 962, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"

ValueError: empty vocabulary; perhaps the documents only contain stop words

TfidfVectorizer抛出ValueError：空词汇；也许文件只包含停用词

0 个答案: