我正在尝试使用 Tweepy 和 Python 获取 Twitter 数据。使用 Tweepy 的流式接口(streaming)时,接收推文的速度非常慢。有没有其他库或方法可以更快地获取推文?目前我在 5 分钟内只能读取大约 20 条推文。我的代码如下:
# --- Run configuration ---------------------------------------------------
# Fill in the variables:
topic = "GOOGL"
language = "en"

# --- Twitter authentication ----------------------------------------------
# Build the OAuth handler from the application credentials, then attach the
# user's access token to it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Authenticated API client; wait_on_rate_limit=True asks tweepy to wait
# rather than fail when a rate limit is reached.
api = tweepy.API(auth, wait_on_rate_limit=True)

# --- Result storage and helpers ------------------------------------------
# Empty frame whose columns are filled in by the stream listener.
df = pd.DataFrame(
    columns=[
        'Tweets',
        'Date',
        'Likes',
        'RTs',
        'Sentiment',
        'User',
        'Followers',
    ],
)
dfh = dataframeHandler(df)
twh = twitterHandler()
# English stop words, loaded on first use and then reused for every tweet.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop-word set, building it only on the first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess2(raw_text):
    """Normalize tweet text.

    URLs (``http...``) and ``@mentions`` are removed, the text is lowercased
    and split on whitespace, and English stop words are dropped.

    Args:
        raw_text: The original text as a string.

    Returns:
        The remaining words joined by single spaces.
    """
    # The stop-word list is fetched once and cached instead of being reloaded
    # and re-wrapped in a set for every incoming tweet.
    stopword_set = _english_stopwords()
    cleaned = re.sub(r"http\S+|@\S+", "", raw_text).lower()
    return " ".join(word for word in cleaned.split() if word not in stopword_set)
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that appends incoming original tweets to ``df``.

    Every delivered status is counted (retweets included). Once the counter
    reaches ``STATUS_LIMIT`` the listener returns ``False`` from
    ``on_status``, which tells tweepy to disconnect the stream.
    """

    # Number of delivered statuses at which the stream is stopped; the
    # status that hits the limit is itself not stored.
    STATUS_LIMIT = 25

    def __init__(self):
        super(CustomStreamListener, self).__init__()
        self.num_tweets = 0

    def on_status(self, status):
        """Store one status in ``df`` unless it is a retweet.

        Returns:
            ``True`` to keep the stream open, ``False`` to disconnect once
            ``STATUS_LIMIT`` statuses have been seen.
        """
        self.num_tweets += 1
        if self.num_tweets >= self.STATUS_LIMIT:
            return False

        print("in")
        # Skip retweets: either flagged as such or carrying the "RT @" marker.
        if status.retweeted or 'RT @' in status.text:
            return True

        print(status.text)
        status.text = preprocess2(status.text)
        # Add tweet to df. The 'Date' column uses the tweet's own timestamp
        # (status.created_at); status.author.created_at is the creation date
        # of the author's account, not of the tweet.
        df.loc[len(df)] = [
            status.text,
            status.created_at,
            status.favorite_count,
            status.retweet_count,
            twh.get_tweet_sentiment(status.text),
            twh.get_user(status.author),
            twh.get_user_followers(status.user),
        ]
        return True

    def on_error(self, status_code):
        """Report the HTTP error code and keep the stream alive."""
        # Surface the code instead of swallowing it silently, so problems
        # such as bad credentials or rate limiting are visible.
        print("stream error:", status_code)
        return True  # Don't kill the stream

    def on_timeout(self):
        """Report a timeout and keep the stream alive."""
        print("timeout")
        return True  # Don't kill the stream
# Create a single listener and hand that same instance to the stream, so
# myStreamListener (and its num_tweets counter) reflects what was received.
myStreamListener = CustomStreamListener()
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener)

# Track the configured topic so the collected tweets match the name of the
# CSV file written below (previously a hard-coded 'AAPL' was tracked while
# the output was saved under the topic name).
# NOTE(review): filter() is expected to block until the listener stops the
# stream (tweepy 3.x default, is_async=False) — confirm for the installed version.
myStream.filter(track=[topic])

# Persist the collected rows once the stream has ended.
dfh.df.to_csv(topic + ".csv")