使用Tweepy(python)收集更多推文

时间:2019-03-28 16:07:25

标签: python twitter tweepy

对于我的硕士论文,我必须刮擦推特消息并预测情绪。我应该每天有至少1条关于特定股票的推文。但是,当用Tweepy阅读推文时,我必须有许多空值。我如何获得更多独特的推文?

这是我抓取推文的代码:

def getTweet_topic(topic,amount):
        searchQuery = topic  # this is what we're searching for
        maxTweets = amount # Some arbitrary large number
        tweetsPerQry = 100  # this is the max the API permits


        # If results from a specific ID onwards are reqd, set since_id to that ID.
        # else default to no lower limit, go as far back as API allows


        # If results only below a specific ID are, set max_id to that ID.
        # else default to no upper limit, start from the most recent tweet matching the search query.
        max_id = float(api.user_timeline(id = api.me().id, count=1)[0].id_str)
        sinceId = max_id - maxTweets
        tweetCount = 0
        language = "en"
        df = pd.DataFrame(columns=['Tweets','Date','Likes','RTs','Sentiment','User','Followers'])
        dfh = dataframeHandler(df)
        twh = twitterHandler()

        while tweetCount < maxTweets:
            try:
                if (max_id <= 0):
                    if(not sinceId):
                        print("yes")
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,lang=language)
                    else:
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,since_id=sinceId,lang=language)
                else:
                    if(not sinceId):
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,max_id=str(max_id - 1),lang=language)
                    else:
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,max_id=str(max_id - 1),since_id=sinceId,lang=language)
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    #if tweet.text.len() > 15:
                    tweet_cleaner(tweet.text)
                    if 'RT @' not in tweet.text:
                        df.loc[len(df)] = [tweet.text, tweet.author.created_at, tweet.favorite_count, tweet.retweet_count,twh.get_tweet_sentiment(tweet.text), twh.get_user(tweet.author),twh.get_user_followers(tweet.user)]
                        tweetCount += len(new_tweets)
                        max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # Just exit if any error
                print("some error : " + str(e))
                break
        return df
    #read all tweets from the infl_users for the specific topic

df = df.append(getTweet_topic(topic, 100000),ignore_index=True)

0 个答案:

没有答案