我正在制作蜘蛛以抓取reddit.com(https://www.reddit.com/)。鉴于站点的结构,您必须向“ https://www.reddit.com/login?dest=https%3A%2F%2Fwww.reddit.com%2F”发送请求。一切正常,但是当我在此页面上向上述URL提交表单请求时,我失败了,出现500个内部服务器错误。
我的代码如下:
class reddit_spider(scrapy.Spider):
name = "reddit"
main_url = 'https://www.reddit.com/'
def __init__(self, category = None, username = None, password = None):
self.category = category
self.username = username
self.password = password
def start_requests(self):
yield scrapy.Request(self.main_url, callback = self.login_parse)
def login_parse(self,response):
#gets the login page from reddit
login_url = response.selector.xpath('.//a[text() = "log in"]/@href').extract_first()
def login(response):
csrf_token = response.selector.xpath('.//form/input[@name = "csrf_token"]/@value').extract_first()
print("csrf_token : " + csrf_token)
print(response.url)
#500 Internal server error here!
formdata = {'csrf_token' : csrf_token, 'username' : self.username, 'password' : self.password}
yield scrapy.FormRequest(url = response.url, formdata = formdata, callback = self.mainpage_parse) #callback will be mainpge parse
yield response.follow(login_url, callback = login)
def mainpage_parse(self,response):
if (response.selector.xpath('.//div[text() = "{username}"'.format(username = self.username))):
print(response.selector.xpath('.//div[text() = "{username}"'.format(username = self.username)))
else:
print("didnt find username")
该如何解决?