python - 如何解决“ ValueError：空词汇〜”？ - Thinbug

如何解决“ ValueError：空词汇〜”？

时间：2019-08-16 09:01:47

标签： python tf-idf

在对实时获取的推文进行形态分析后，尝试对tf-idf得分为0.03或更高的名词进行排序时，发生此错误。另外，我无法删除收到的推文中的转发和表情。

您能告诉我代码中发生了什么以及如何解决吗？

错误

  File "final.py", line 97, in <module>
    stream.sample()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 449, in sample
    self._start(is_async)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 389, in _start
    self._run()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 320, in _run
    six.reraise(*exc_info)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/six.py", line 693, in reraise
    raise value
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 289, in _run
    self._read_loop(resp)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 351, in _read_loop
    self._data(next_status_obj)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 323, in _data
    if self.listener.on_data(data) is False:
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 54, in on_data
    if self.on_status(status) is False:
  File "final.py", line 78, in on_status
    tfidf = vectorizer.fit_transform(corpus)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1652, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1058, in fit_transform
    self.fixed_vocabulary_)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 989, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words

代码

import os
import tweepy
import redis
import math
from collections import Counter
import re
from natto import MeCab
import codecs
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import numpy as np

#r = redis.Redis(host='localhost', port=6379, db=0)

TWITTER_CLIENT_ID = os.environ['TWITTER_CLIENT_ID']
TWITTER_CLIENT_SECRET = os.environ['TWITTER_CLIENT_SECRET']

TWITTER_OAUTH_TOKEN = os.environ['TWITTER_OAUTH_TOKEN']
TWITTER_OAUTH_TOKEN_SECRET = os.environ['TWITTER_OAUTH_TOKEN_SECRET']

auth = tweepy.OAuthHandler(TWITTER_CLIENT_ID,TWITTER_CLIENT_SECRET)
auth.set_access_token(TWITTER_OAUTH_TOKEN,TWITTER_OAUTH_TOKEN_SECRET)

class StreamListener(tweepy.StreamListener):
    def __init__(self):
        super().__init__()
        self.count = 0 # Number of tweets acquired

    def on_status(self, status):
        text = str(status.text)
        text2 = re.sub(r"http\S+", "", text)
        text3 = re.sub(r"@(\w+) ", "", text2)
        text4 = re.sub(r"#(\w+)", "", text3)
        text5 = re.sub(r"RT(\w+)", "", text4) #Unable to erase retweet
        emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+", flags=re.UNICODE)
        text6 = emoji_pattern.sub("", text5) #Unable to erase Emoji

        #Writing Japanese tweets to a file + Displaying the number of tweets
        if status.lang == "ja":
            self.count += 1
            print(self.count, text6)
            with open("test37.txt", "a", encoding="utf-8") as f:
                f.write(text6)
            with codecs.open("test37.txt", "r", "utf-8") as f:
                corpus = f.read().split("\n")

            mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

            #if tagger.lang == 'ja':

            rm_list = ["RT","https","co","@","__"]

            docs = []
            for txt in corpus:
                words = mecab.parse(txt, as_nodes=True)
                doc = []
        #Morphological analysis using MeCab
                for w in words:
                    if w.feature.split(",")[0] == "名詞": #名詞 = noun
                        if len(w.surface) >= 3:
                            if not any(rm in w.surface for rm in rm_list):
                                doc.append(str(w.surface))

                doc = ' '.join(doc)
                docs.append(doc)
            corpus = docs

        #tf-idf calculation
            vectorizer = TfidfVectorizer(min_df=0.03)
            tfidf = vectorizer.fit_transform(corpus)

            #Sort words by score
            feature_names = np.array(vectorizer.get_feature_names())
            for vec in tfidf:
                index = np.argsort(vec.toarray(), axis=1)[:,::-1]
                feature_words = feature_names[index]
                #print(corpus)
                print(feature_words[:,:10])

    def on_error(self, status_code):
        return False

stream = tweepy.Stream(auth=auth, listener=StreamListener())
stream.sample()

其他信息

iOS 10.12.6，Python 3.7.3，Atom

0 个答案:

没有答案