对于唐纳德·特朗普以外的任何人,我都能把推文拉取到 CSV 文件中;但当我把 screen_name 设为 realDonaldTrump 使用它时,它最多只返回 200 条推文。有什么明确的原因吗?
在这里您可以找到代码。 (例如,与JLo一起使用,效果很好...)
def get_all_tweets(screen_name):
    """Download up to ~3240 of a user's most recent tweets (the Twitter API
    limit for user_timeline) and write them to <screen_name>_tweets.csv.

    Relies on module-level credentials (consumer_key, consumer_secret,
    access_key, access_secret) being defined.
    """
    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # list that accumulates every fetched tweet
    alltweets = []

    # initial request for the most recent tweets (200 is the per-request maximum)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended')
    alltweets.extend(new_tweets)

    # keep paging with max_id until the API returns an empty page;
    # an account with zero tweets skips the loop instead of crashing on alltweets[-1]
    while new_tweets:
        # id of the oldest tweet seen so far, minus one, so max_id excludes duplicates
        oldest = alltweets[-1].id - 1
        print("getting tweets before %s" % (oldest))
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest, tweet_mode='extended')
        alltweets.extend(new_tweets)
        print("...%s tweets downloaded so far" % (len(alltweets)))

    # transform the tweepy tweets into a 2D list that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.full_text.replace("\n", "")]
                 for tweet in alltweets]

    # newline='' is required by the csv module to avoid blank lines on Windows
    with open('%s_tweets.csv' % screen_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "full_text"])
        writer.writerows(outtweets)
if __name__ == '__main__':
    # Account whose tweets should be downloaded.
    target_account = "realDonaldTrump"
    get_all_tweets(target_account)
答案 0(得分:0)
您的实施工作;但是,您的Twitter API应用程序可能受到限制。了解Twitter API中的rate limiting。
通常,当您从这样的第三方API抓取数据时,您希望将结果持久保存在可靠的位置(在您的情况下,保存在文件系统上的CSV文件中,这可能对您的问题范围有好处)您可以再次查询API,以检索您在以前的查询中可能未收到的任何数据。
我将在下面提供一个简单的示例,说明您可以重新设计应用程序以这种方式运行的一种方式。简而言之,下面的示例将在每次执行时修改任何先前存在的CSV,以使其结果比CSV中的最新已知记录新,并追加比CSV中的最旧已知记录早的结果。
下面的示例还演示了如何使用Tweepy的cursors遍历分页的tweet数据。
import csv
import logging
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import tweepy
@dataclass
class TweetIdRange:
    """Summary of the tweet IDs present in a CSV file.

    count is the number of data rows; newest_id/oldest_id are the largest
    and smallest tweet IDs seen, or None when the file has no data rows yet.
    """
    count: int
    newest_id: Optional[int]  # None until at least one row is present
    oldest_id: Optional[int]  # None until at least one row is present
def get_tweets_file_id_range(tweets_file_name: Path) -> TweetIdRange:
    """Scan an existing tweets CSV and report how many data rows it holds
    plus the newest/oldest tweet IDs found in its first column.

    Raises RuntimeError when the file is completely empty (a header row is
    always expected). Returns a TweetIdRange with None IDs when only the
    header row is present.
    """
    result = TweetIdRange(count=0, newest_id=None, oldest_id=None)
    with open(tweets_file_name) as csv_file:
        rows = csv.reader(csv_file)
        # Consume the mandatory header row before looking at data.
        try:
            next(rows)
        except StopIteration:
            raise RuntimeError(f'Tweets file ({tweets_file_name}) does not contain any rows; '
                               f'expected at least one header row')
        for row in rows:
            tweet_id = int(row[0])
            result.count += 1
            if result.oldest_id is None or tweet_id < result.oldest_id:
                result.oldest_id = tweet_id
            if result.newest_id is None or tweet_id > result.newest_id:
                result.newest_id = tweet_id
    return result
def write_tweets(tweets_file_name: Path, screen_name: str, since_id: int = None, max_id: int = None):
    """Fetch tweets for screen_name in the (since_id, max_id] ID window and
    write them as CSV rows (id, created_at, text) to tweets_file_name.

    Returns the number of tweets written. Relies on a module-level `api`
    (tweepy.API) being initialized before this is called.
    """
    user_timeline_options = {
        'count': 100,
        'tweet_mode': 'extended',
        'screen_name': screen_name,
    }
    # Only include the ID-window parameters when they are actually set;
    # avoids sending explicit None values to the API wrapper.
    if since_id is not None:
        user_timeline_options['since_id'] = since_id  # exclusive lower bound (>)
    if max_id is not None:
        user_timeline_options['max_id'] = max_id  # inclusive upper bound (<=)

    tweet_count = 0
    # newline='' is required by the csv module to avoid blank lines on Windows
    with open(tweets_file_name, 'w', newline='') as tweets_file:
        writer = csv.writer(tweets_file)
        # Cursor transparently follows user_timeline pagination page by page.
        for page in tweepy.Cursor(api.user_timeline, **user_timeline_options).pages():
            rows = [[tweet.id_str, tweet.created_at, tweet.full_text.replace('\n', '')]
                    for tweet in page]
            tweet_count += len(rows)
            writer.writerows(rows)
    return tweet_count
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Credentials (consumer_key, consumer_secret, access_key, access_secret)
    # must be defined at module level.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # The target account is a required positional argument.
    screen_name = sys.argv[1] if len(sys.argv) > 1 else None
    if not screen_name:
        print('error: missing required screen name positional argument', file=sys.stderr)
        sys.exit(1)

    tweets_file_name = Path(f'{screen_name}_tweets.csv')
    # First run: create the CSV with a header row only.
    if not tweets_file_name.exists():
        logging.info('%s: initializing new file', tweets_file_name)
        with open(tweets_file_name, 'w', newline='') as existing_tweets_file:
            writer = csv.writer(existing_tweets_file)
            writer.writerow(['id', 'created_at', 'full_text'])

    # Measure what is already on disk so we only ask the API for the gaps.
    tweets_id_range = get_tweets_file_id_range(tweets_file_name)
    logging.info('%s: tweet ID range (before): count: %s; newest: %s; oldest: %s',
                 tweets_file_name, tweets_id_range.count, tweets_id_range.newest_id, tweets_id_range.oldest_id)

    # Tweets newer than anything we have go into a temporary ".newest" file.
    newest_tweets_file_name = tweets_file_name.with_suffix('.newest' + ''.join(tweets_file_name.suffixes))
    new_tweet_count = write_tweets(newest_tweets_file_name, screen_name, since_id=tweets_id_range.newest_id)
    logging.info('%s: prepending %s new tweets', tweets_file_name, new_tweet_count)

    # Tweets older than anything we have go into a temporary ".oldest" file
    # (only possible once the CSV contains at least one data row).
    oldest_tweets_file_name = tweets_file_name.with_suffix('.oldest' + ''.join(tweets_file_name.suffixes))
    if tweets_id_range.oldest_id:
        old_tweet_count = write_tweets(oldest_tweets_file_name, screen_name, max_id=tweets_id_range.oldest_id - 1)
        logging.info('%s: appending %s old tweets', tweets_file_name, old_tweet_count)

    # Stitch header + newest + existing + oldest into a ".swap" file, then
    # rename it over the original CSV so the update appears atomic.
    swap_tweets_file_name = tweets_file_name.with_suffix('.swap' + ''.join(tweets_file_name.suffixes))
    with open(swap_tweets_file_name, 'w') as swap_file, \
            open(tweets_file_name, 'r') as existing_file:
        swap_file.write(existing_file.readline())  # copy the header row first
        with open(newest_tweets_file_name, 'r') as newest_tweets_file:
            for line in newest_tweets_file:
                swap_file.write(line)
        for line in existing_file:  # remaining pre-existing rows
            swap_file.write(line)
        if oldest_tweets_file_name.exists():
            with open(oldest_tweets_file_name, 'r') as oldest_tweets_file:
                for line in oldest_tweets_file:
                    swap_file.write(line)
    swap_tweets_file_name.rename(tweets_file_name)

    # Clean up the temporary fetch files.
    os.remove(newest_tweets_file_name)
    if oldest_tweets_file_name.exists():
        os.remove(oldest_tweets_file_name)

    # Re-scan to report what the file now contains.
    tweets_id_range = get_tweets_file_id_range(tweets_file_name)
    logging.info('%s: tweet ID range (after): count: %s; newest: %s; oldest: %s',
                 tweets_file_name, tweets_id_range.count, tweets_id_range.newest_id, tweets_id_range.oldest_id)
您会注意到,连续运行此操作将返回更多数据,直到您到达Twitter API所能提供的末尾为止,例如
▶ python test.py realDonaldTrump
INFO:root:realDonaldTrump_tweets.csv: tweet ID range (before): count: 350; newest: 1239685852093169664; oldest: 1235005879226961924
INFO:root:realDonaldTrump_tweets.csv: prepending 0 new tweets
INFO:root:realDonaldTrump_tweets.csv: appending 1799 old tweets
INFO:root:realDonaldTrump_tweets.csv: tweet ID range (after): count: 2149; newest: 1239685852093169664; oldest: 1214517113437720576
▶ python test.py realDonaldTrump
INFO:root:realDonaldTrump_tweets.csv: tweet ID range (before): count: 2149; newest: 1239685852093169664; oldest: 1214517113437720576
INFO:root:realDonaldTrump_tweets.csv: prepending 0 new tweets
INFO:root:realDonaldTrump_tweets.csv: appending 1045 old tweets
INFO:root:realDonaldTrump_tweets.csv: tweet ID range (after): count: 3194; newest: 1239685852093169664; oldest: 1203103574781317121
▶ python test.py realDonaldTrump
INFO:root:realDonaldTrump_tweets.csv: tweet ID range (before): count: 3194; newest: 1239685852093169664; oldest: 1203103574781317121
INFO:root:realDonaldTrump_tweets.csv: prepending 0 new tweets
INFO:root:realDonaldTrump_tweets.csv: appending 0 old tweets
INFO:root:realDonaldTrump_tweets.csv: tweet ID range (after): count: 3194; newest: 1239685852093169664; oldest: 1203103574781317121
此示例并非旨在提供可用于生产的实施;但是,它应该为您提供一些思考的新点,并且应该使您超越问题中概述的问题。