不久前,我开始学习 Python,是因为想自己写一个爬虫机器人:它能从某个 subreddit 上抓取帖子并发布到 Twitter。但不幸的是,我遇到了一些问题。
该机器人最多只能下载X个帖子
for submission in subreddit_info.hot(limit=X):
在下载并发推这些帖子之后,脚本就停止了,我必须重新运行它。但重新运行 bot 并没有什么用:它只会按我编写的逻辑打印提示 “[bot] 已经发过推文:(帖子)”,而不会去搜索其他新的帖子。
把代码放进 while True 循环里也没有任何效果:无论让它运行多久,机器人都只是反复打印同样的提示。
import praw
import json
import requests
import tweepy
import time
import os
import urllib.parse
from glob import glob
# Twitter API credentials (placeholders — fill in real values before running).
ACCESS_TOKEN = 'X'
ACCESS_TOKEN_SECRET = 'X'
CONSUMER_KEY = 'X'
CONSUMER_SECRET = 'X'
# Name of the subreddit whose hot posts are mirrored to Twitter.
SUBREDDIT_TO_MONITOR = ''
# Directory where post images are downloaded before tweeting.
IMAGE_DIR = 'img'
# File holding one already-tweeted submission id per line (see log_tweet).
POSTED_CACHE = 'posted_posts.txt'
# Text appended to every tweet after the post link.
TWEET_SUFFIX = ''
# Twitter's maximum tweet length, in characters.
TWEET_MAX_LEN = 280
# Seconds to wait between consecutive tweets.
DELAY_BETWEEN_TWEETS = 1
# Characters budgeted per link for Twitter's t.co URL wrapper.
T_CO_LINKS_LEN = 12
def setup_connection_reddit(subreddit):
    """Create a read-only Reddit client and return the requested subreddit.

    Args:
        subreddit: Name of the subreddit to monitor.

    Returns:
        A praw ``Subreddit`` instance for ``subreddit``.
    """
    print('[bot] Setting up connection with reddit')
    # Bug fix: the original ignored the ``subreddit`` argument and always
    # used the global SUBREDDIT_TO_MONITOR, making the parameter dead code.
    reddit_api = praw.Reddit(
        user_agent='Reddit Twitter Tool Monitoring {}'.format(subreddit),
        client_id='X',
        client_secret='X')
    return reddit_api.subreddit(subreddit)
def tweet_creator(subreddit_info, limit=10):
    """Collect hot posts from a subreddit that have not been tweeted yet.

    Args:
        subreddit_info: praw ``Subreddit`` object to read posts from.
        limit: Maximum number of hot posts to examine per run (previously
            hard-coded to 10 — raise it to let the bot find more posts).

    Returns:
        Tuple ``(post_dict, post_ids)`` where ``post_dict`` maps post title
        to ``{'link': url, 'img_path': local path or ''}`` and ``post_ids``
        is the parallel list of reddit submission ids.
    """
    post_dict = {}
    post_ids = []
    print('[bot] Getting posts from reddit')
    for submission in subreddit_info.hot(limit=limit):
        if already_tweeted(submission.id):
            print('[bot] Already tweeted: {}'.format(str(submission)))
            continue
        # Bug fix: post_dict is keyed by title, so a second post with the
        # same title would overwrite the first and desync post_dict from
        # post_ids (they are zipped together in tweeter()). Skip duplicates.
        if submission.title in post_dict:
            continue
        post_dict[submission.title] = {
            'link': submission.url,
            'img_path': get_image(submission.url),
        }
        post_ids.append(submission.id)
    return post_dict, post_ids
def already_tweeted(post_id):
    """Return True if ``post_id`` is recorded in the POSTED_CACHE file.

    The cache file holds one reddit submission id per line (see log_tweet).
    """
    with open(POSTED_CACHE, 'r') as in_file:
        for line in in_file:
            # Bug fix: the original used ``post_id in line``, a substring
            # test that false-positives whenever one id is a substring of
            # another. Compare whole ids instead.
            if line.strip() == post_id:
                return True
    return False
def strip_title(title, num_characters):
    """Return ``title`` shortened to at most ``num_characters`` characters.

    Titles that already fit are returned unchanged; longer ones are cut to
    ``num_characters - 1`` characters and suffixed with an ellipsis.
    """
    if len(title) > num_characters:
        title = title[:num_characters - 1] + '…'
    return title
def get_image(img_url):
    """Download an i.redd.it image into IMAGE_DIR.

    Args:
        img_url: URL attached to the reddit submission.

    Returns:
        Local path of the downloaded file, or '' when the URL is not an
        i.redd.it image or the download fails.
    """
    if 'i.redd.it' not in img_url:
        print('[bot] Post doesn\'t point to an i.redd.it link')
        return ''
    file_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
    img_path = IMAGE_DIR + '/' + file_name
    print('[bot] Downloading image at URL ' + img_url + ' to ' + img_path)
    resp = requests.get(img_url, stream=True)
    if resp.status_code == 200:
        with open(img_path, 'wb') as image_file:
            for chunk in resp:
                image_file.write(chunk)
        return img_path
    # Bug fix: the original concatenated the int status code onto a str
    # (TypeError on every failed download) and then fell off the end of the
    # function, returning None instead of the documented ''.
    print('[bot] Image failed to download. Status code: '
          + str(resp.status_code))
    return ''
def tweeter(post_dict, post_ids):
    """Tweet every collected post, attaching its image when one exists.

    ``post_dict`` and ``post_ids`` are the parallel structures produced by
    tweet_creator(); each successfully tweeted id is appended to the cache.
    """
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    for title, submission_id in zip(post_dict, post_ids):
        entry = post_dict[title]
        image = entry['img_path']
        suffix = ' ' + entry['link'] + TWEET_SUFFIX
        # Twitter wraps every link in t.co, so budget its fixed length
        # (plus one more link's worth when an image is attached).
        reserved = 1 + T_CO_LINKS_LEN + len(TWEET_SUFFIX)
        if image:
            reserved += T_CO_LINKS_LEN
        status = strip_title(title, TWEET_MAX_LEN - reserved) + suffix
        print('[bot] Posting this link on Twitter')
        print(status)
        if image:
            print('[bot] With image ' + image)
            api.update_with_media(filename=image, status=status)
        else:
            api.update_status(status=status)
        log_tweet(submission_id)
        time.sleep(DELAY_BETWEEN_TWEETS)
def log_tweet(post_id):
    """Append ``post_id`` on its own line to the posted-ids cache file."""
    with open(POSTED_CACHE, 'a') as out_file:
        out_file.write('{}\n'.format(post_id))
def main():
    """Run one bot cycle: fetch new posts, tweet them, then clean up."""
    # Make sure the cache file and the image directory exist up front.
    if not os.path.exists(POSTED_CACHE):
        open(POSTED_CACHE, 'w').close()
    if not os.path.exists(IMAGE_DIR):
        os.makedirs(IMAGE_DIR)
    subreddit = setup_connection_reddit(SUBREDDIT_TO_MONITOR)
    post_dict, post_ids = tweet_creator(subreddit)
    tweeter(post_dict, post_ids)
    # Downloaded images are only needed while tweeting; delete them after.
    for leftover in glob(IMAGE_DIR + '/*'):
        os.remove(leftover)
# Run a single fetch-and-tweet cycle when executed as a script.
if __name__ == '__main__':
    main()
以上就是代码。我猜问题出在 “tweet_creator” 或 “already_tweeted” 上。如果有人能帮忙,我将不胜感激。
此外,如果您知道与此问题相关的任何优秀教程,都可以将其链接起来,我很想自己学习。
谢谢!