为了完成硕士论文，我需要抓取推特（Twitter）上的推文并预测其情绪。对于某只特定的股票，我每天应该至少有 1 条相关推文。但是，用 Tweepy 读取推文时，我得到了很多空值。我怎样才能获取更多不重复的推文？
这是我抓取推文的代码:
def getTweet_topic(topic, amount):
    """Search English tweets matching ``topic`` and return them as a DataFrame.

    Pages backwards through ``api.search`` results, starting from the most
    recent matching tweet, until at least ``amount`` tweets have been fetched
    or the API returns no further results.

    Args:
        topic: Search query, passed to ``api.search`` as ``q``.
        amount: Stop once at least this many tweets have been fetched.
            Skipped retweets still count towards this total.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Tweets``, ``Date``,
        ``Likes``, ``RTs``, ``Sentiment``, ``User`` and ``Followers``; one row
        per fetched tweet whose text does not contain ``'RT @'``. If a
        ``tweepy.TweepError`` occurs, the error is printed and the rows
        collected so far are returned.
    """
    # NOTE(review): the standard search endpoint is documented to cover only
    # roughly the last week of tweets -- confirm against the Twitter API docs
    # if one tweet per day over a longer period is required.
    columns = ['Tweets', 'Date', 'Likes', 'RTs', 'Sentiment', 'User', 'Followers']
    tweets_per_query = 100  # this is the max the API permits
    language = "en"
    twh = twitterHandler()

    rows = []
    tweet_count = 0
    # Tweet ids are large 64-bit integers and are NOT consecutive counters:
    # keep them as ints (a float loses precision) and never derive a
    # ``since_id`` by subtracting a tweet count from an id. ``None`` means
    # "no upper limit": start from the most recent matching tweet.
    max_id = None

    while tweet_count < amount:
        search_kwargs = {"q": topic, "count": tweets_per_query, "lang": language}
        if max_id is not None:
            # Only ask for tweets strictly older than the oldest one seen.
            search_kwargs["max_id"] = max_id - 1
        try:
            new_tweets = api.search(**search_kwargs)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                # NOTE(review): the return value of tweet_cleaner is discarded,
                # as in the original code -- confirm whether the cleaned text
                # was meant to be stored instead of tweet.text.
                tweet_cleaner(tweet.text)
                if 'RT @' in tweet.text:
                    continue  # skip retweets so stored texts stay unique
                rows.append([
                    tweet.text,
                    # The tweet's own timestamp; tweet.author.created_at is
                    # the creation date of the author's account.
                    tweet.created_at,
                    tweet.favorite_count,
                    tweet.retweet_count,
                    twh.get_tweet_sentiment(tweet.text),
                    twh.get_user(tweet.author),
                    twh.get_user_followers(tweet.user),
                ])
            tweet_count += len(new_tweets)
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # Stop paging on any API error but keep what was collected so far.
            print("some error : " + str(e))
            break

    # Building the frame once is much cheaper than growing it row by row.
    return pd.DataFrame(rows, columns=columns)
# Read all tweets for the specific topic and add them to the collected frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([df, getTweet_topic(topic, 100000)], ignore_index=True)