This error occurs when I run morphological analysis on tweets fetched in real time and then try to rank the nouns whose tf-idf score is 0.03 or higher. In addition, I am unable to remove retweets and emoji from the received tweets.
Could you tell me what is going wrong in the code and how to fix it?
File "final.py", line 97, in <module>
stream.sample()
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 449, in sample
self._start(is_async)
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 389, in _start
self._run()
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 320, in _run
six.reraise(*exc_info)
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/six.py", line 693, in reraise
raise value
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 289, in _run
self._read_loop(resp)
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 351, in _read_loop
self._data(next_status_obj)
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 323, in _data
if self.listener.on_data(data) is False:
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 54, in on_data
if self.on_status(status) is False:
File "final.py", line 78, in on_status
tfidf = vectorizer.fit_transform(corpus)
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1652, in fit_transform
X = super().fit_transform(raw_documents)
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1058, in fit_transform
self.fixed_vocabulary_)
File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 989, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
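
For context: scikit-learn raises this ValueError from CountVectorizer (which TfidfVectorizer builds on) whenever fit_transform receives documents that yield no tokens at all. Since the code below keeps only nouns of three or more characters, the first few Japanese tweets can easily leave every document empty. A minimal sketch that reproduces the same error:

from sklearn.feature_extraction.text import TfidfVectorizer

# Every document is empty after preprocessing, so no vocabulary can be
# built and fit_transform raises "empty vocabulary; perhaps the documents
# only contain stop words".
corpus = ["", "", ""]
vectorizer = TfidfVectorizer(min_df=0.03)
vectorizer.fit_transform(corpus)
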
import os
import tweepy
import redis
import math
from collections import Counter
import re
from natto import MeCab
import codecs
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import numpy as np

#r = redis.Redis(host='localhost', port=6379, db=0)

TWITTER_CLIENT_ID = os.environ['TWITTER_CLIENT_ID']
TWITTER_CLIENT_SECRET = os.environ['TWITTER_CLIENT_SECRET']
TWITTER_OAUTH_TOKEN = os.environ['TWITTER_OAUTH_TOKEN']
TWITTER_OAUTH_TOKEN_SECRET = os.environ['TWITTER_OAUTH_TOKEN_SECRET']

auth = tweepy.OAuthHandler(TWITTER_CLIENT_ID, TWITTER_CLIENT_SECRET)
auth.set_access_token(TWITTER_OAUTH_TOKEN, TWITTER_OAUTH_TOKEN_SECRET)


class StreamListener(tweepy.StreamListener):
    def __init__(self):
        super().__init__()
        self.count = 0  # Number of tweets acquired

    def on_status(self, status):
        text = str(status.text)
        text2 = re.sub(r"http\S+", "", text)
        text3 = re.sub(r"@(\w+) ", "", text2)
        text4 = re.sub(r"#(\w+)", "", text3)
        text5 = re.sub(r"RT(\w+)", "", text4)  # Unable to erase retweets
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"
                                   u"\U0001F300-\U0001F5FF"
                                   u"\U0001F680-\U0001F6FF"
                                   u"\U0001F1E0-\U0001F1FF"
                                   "]+", flags=re.UNICODE)
        text6 = emoji_pattern.sub("", text5)  # Unable to erase emoji

        # Write Japanese tweets to a file and display the tweet count
        if status.lang == "ja":
            self.count += 1
            print(self.count, text6)
            with open("test37.txt", "a", encoding="utf-8") as f:
                f.write(text6)
            with codecs.open("test37.txt", "r", "utf-8") as f:
                corpus = f.read().split("\n")

            mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
            #if tagger.lang == 'ja':
            rm_list = ["RT", "https", "co", "@", "__"]
            docs = []
            for txt in corpus:
                words = mecab.parse(txt, as_nodes=True)
                doc = []
                # Morphological analysis using MeCab
                for w in words:
                    if w.feature.split(",")[0] == "名詞":  # 名詞 = noun
                        if len(w.surface) >= 3:
                            if not any(rm in w.surface for rm in rm_list):
                                doc.append(str(w.surface))
                doc = ' '.join(doc)
                docs.append(doc)
            corpus = docs

            # tf-idf calculation
            vectorizer = TfidfVectorizer(min_df=0.03)
            tfidf = vectorizer.fit_transform(corpus)

            # Sort words by score
            feature_names = np.array(vectorizer.get_feature_names())
            for vec in tfidf:
                index = np.argsort(vec.toarray(), axis=1)[:, ::-1]
                feature_words = feature_names[index]
                #print(corpus)
                print(feature_words[:, :10])

    def on_error(self, status_code):
        return False


stream = tweepy.Stream(auth=auth, listener=StreamListener())
stream.sample()
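
Two likely culprits, with a hedged sketch below (retweeted_status is the attribute tweepy sets on retweet Status objects; GuardedListener and the empty-corpus guard are illustrative additions, not the original code). First, r"RT(\w+)" never matches a real retweet because the text reads "RT @user: ...", with a space right after "RT". Second, fit_transform crashes whenever no noun survives the length and blacklist filters, leaving every document empty.

import tweepy
from sklearn.feature_extraction.text import TfidfVectorizer

class GuardedListener(tweepy.StreamListener):  # hypothetical name
    def on_status(self, status):
        # Skip retweets outright instead of stripping them with a regex:
        # tweepy sets retweeted_status only on retweets.
        if hasattr(status, "retweeted_status"):
            return
        corpus = [""]  # placeholder for the noun-only documents built above
        # Guard against the empty-vocabulary ValueError: bail out until at
        # least one document contains a token.
        if not any(doc.strip() for doc in corpus):
            return
        vectorizer = TfidfVectorizer(min_df=0.03)
        tfidf = vectorizer.fit_transform(corpus)

Two further notes: f.write(text6) appends tweets without a trailing newline, so split("\n") cannot separate them as intended; and min_df=0.03 filters by document frequency, not by tf-idf score, so a "score >= 0.03" cut-off would have to be applied to the resulting matrix instead.
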
macOS 10.12.6, Python 3.7.3, Atom
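
As for the emoji, the character class in the code covers only four blocks; many emoji (dingbats, supplemental symbols, variation selectors) fall outside it. A sketch of a wider pattern, assuming these extra blocks are sufficient for the tweets at hand:

import re

emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\u2600-\u27BF"          # miscellaneous symbols and dingbats
    u"\uFE0F"                 # variation selector-16
    "]+",
    flags=re.UNICODE,
)

print(emoji_pattern.sub("", "晴れ☀️です🤗"))  # -> 晴れです
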