我正在使用此处的代码来抓取一些用户的推文,并以.csv格式导出:https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25
理想情况下,我希望获得每个用户的所有推文,但结果似乎被限制在最新的3200条推文。以下是我使用的完整代码(以特朗普的账号为例):
# Twitter screen names whose timelines will be mined.
ids = [
    'realDonaldTrump',
]
def extract_hashtags(hashtag_list):
    """Return the hashtag texts joined by single spaces.

    Args:
        hashtag_list: Iterable of mappings, each providing the hashtag
            string under the ``'text'`` key.

    Returns:
        One space-separated string with surrounding whitespace removed;
        an empty string when ``hashtag_list`` is empty.
    """
    # str.join avoids rebuilding the string on every iteration.
    return ' '.join(hashtag['text'] for hashtag in hashtag_list).strip()
#from https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25
class TweetMiner(object):
    """Download a user's timeline through Tweepy as a list of flat dicts.

    An authenticated ``tw.API`` client is built in ``__init__`` from the
    supplied credentials; ``mine_user_tweets`` then pages backwards through
    a user's timeline using ``max_id``.
    """

    # Default number of tweets requested per page.
    result_limit = 20
    # Not used by any method of this class; kept for backward compatibility.
    data = []
    # Placeholder until __init__ stores the authenticated client.
    api = False
    # Credentials were redacted in the original source. __init__ reads the
    # keys 'consumer_key', 'consumer_secret', 'access_token_key' and
    # 'access_token_secret' from this mapping.
    twitter_keys = {}

    def __init__(self, keys_dict=twitter_keys, api=api, result_limit=20):
        """Authenticate against Twitter and remember the page size.

        Args:
            keys_dict: Mapping holding the four credential entries listed
                on the ``twitter_keys`` class attribute.
            api: Accepted for backward compatibility; it is not consulted,
                a fresh client is always created.
            result_limit: Number of tweets requested per page.
        """
        self.twitter_keys = keys_dict
        auth = tw.OAuthHandler(keys_dict['consumer_key'],
                               keys_dict['consumer_secret'])
        auth.set_access_token(keys_dict['access_token_key'],
                              keys_dict['access_token_secret'])
        self.api = tw.API(auth)
        self.result_limit = result_limit

    def mine_user_tweets(self, user, mine_rewteets=False, max_pages=5):
        """Collect up to ``result_limit * max_pages`` tweets of ``user``.

        Args:
            user: Screen name of the account to read.
            mine_rewteets: Accepted for backward compatibility; it is not
                consulted, retweets are always requested.
            max_pages: Upper bound on the number of timeline requests.

        Returns:
            A list with one dict per tweet, newest first as delivered by
            the API.
        """
        data = []
        last_tweet_id = None
        page = 1
        while page <= max_pages:
            request = {
                'screen_name': user,
                'count': self.result_limit,
                'tweet_mode': 'extended',
                'include_rts': True,
            }
            if last_tweet_id:
                # Continue strictly below the oldest tweet seen so far.
                request['max_id'] = last_tweet_id - 1
            statuses = self.api.user_timeline(**request)
            if not statuses:
                # An empty page means the timeline is exhausted; further
                # requests would only repeat the same empty query.
                break
            for item in statuses:
                data.append(self._flatten_status(item))
                last_tweet_id = item.id
            page += 1
        return data

    @staticmethod
    def _flatten_status(item):
        """Convert one status object into a flat dict of selected fields."""
        mined = {
            'tweet_id': item.id,
            'name': item.user.name,
            'screen_name': item.user.screen_name,
            'retweet_count': item.retweet_count,
            'text': item.full_text,
            'mined_at': datetime.datetime.now(),
            'created_at': item.created_at,
            'favourite_count': item.favorite_count,
            'hashtags': extract_hashtags(item.entities['hashtags']),
            'status_count': item.user.statuses_count,
            'location': item.place,
            'source_device': item.source,
        }
        # A status without the optional attribute raises AttributeError;
        # the string 'None' is stored in that case.
        try:
            mined['retweet_text'] = item.retweeted_status.full_text
        except AttributeError:
            mined['retweet_text'] = 'None'
        try:
            quoted = item.quoted_status
            quote_text = quoted.full_text
            quote_screen_name = quoted.user.screen_name
        except AttributeError:
            quote_text = quote_screen_name = 'None'
        mined['quote_text'] = quote_text
        mined['quote_screen_name'] = quote_screen_name
        return mined
# result_limit * max_pages is the requested number of tweets per user.
# NOTE: the user-timeline endpoint returns at most 200 tweets per page and
# 3200 tweets per user, so larger values do not yield more data.
miner = TweetMiner(result_limit=460)  # 200
frames = []
counter2 = 0
for user_id in ids:
    print("Fetching tweets of " + user_id + " now...")
    try:
        mined_tweets = miner.mine_user_tweets(user=user_id, max_pages=460)  # 100
    except Exception as exc:
        # Skip this user entirely so that no stale or undefined frame is
        # appended below.
        print(user_id, 'is invalid or locked')
        print("Reason:", repr(exc))
        continue
    mined_tweets_df = pd.DataFrame(mined_tweets)
    frames.append(mined_tweets_df)
    print("Fetched and added!")
    counter2 += 1
    # After every 40 fetched users, pause for 15 minutes.
    if counter2 % 40 == 0:
        print("Fetched 40 users, sleeping for 15 mins")
        time.sleep(900)

# Concatenate once at the end; pd.concat rejects an empty list, so fall
# back to an empty frame when nothing was fetched.
final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(final_df)
final_df.to_csv('tweets.csv', encoding='UTF-8')
按照上面的设置,每个用户应该返回 460 * 460 = 211,600 条推文,但实际上每个用户只返回了3200条。这是API内置的硬性限制吗?如果是,有没有办法获取每个用户3200条以上的推文?
答案 0 :(得分:0)
这是Twitter API内置的限制。user timeline 接口最多只能返回3200条推文(每“页”最多200条)。要检索更多推文,您需要使用 Premium(高级)或 Enterprise(企业)版的完整存档搜索(Full-Archive Search)API。