我在使用 Scrapy 模拟用户登录豆瓣(douban)网站时遇到了问题。爬虫的初始配置如下:
# Spider identity and the domains it is allowed to crawl.
name = "douban_movie"
allowed_domains = ["douban.com"]

# Movie list page; requested again by request_douban after the login step.
douban_url = "https://movie.douban.com/top250"
start_urls = (douban_url,)

# Login page: fetched first, and also the URL the login form is posted to.
login_url = "https://accounts.douban.com/login?source=movie"
# Registration page; not referenced by the code shown here.
login_url2 = "https://accounts.douban.com/register"
下载验证码(CAPTCHA)图片后,我手动输入验证码,代码如下:
def start_requests(self):
    """Start the crawl by requesting the login page.

    Yields one request for ``self.login_url`` that carries the spider's
    headers, uses the ``proxy`` value defined outside this method, is
    bound to cookie jar ``1``, and is handled by ``request_captcha``.
    """
    login_page_meta = {"proxy": proxy, "cookiejar": 1}
    login_page_request = scrapy.Request(
        self.login_url,
        callback=self.request_captcha,
        headers=self.headers_dict,
        meta=login_page_meta,
    )
    yield login_page_request
def request_captcha(self, response):
    """Inspect the login page and continue down the matching login path.

    When the page contains a captcha image, only that image is requested
    here; ``download_captcha`` later submits the login form together with
    the captcha answer.  When there is no captcha, the login form is
    submitted straight away.  Exactly one request is yielded per call, so
    a captcha-less login attempt can never race the captcha flow on the
    same cookie jar.

    Args:
        response: The response for ``self.login_url``; its
            ``meta["cookiejar"]`` is propagated to the next request.

    Yields:
        Either the captcha image request (callback ``download_captcha``)
        or the login ``FormRequest`` (callback ``request_douban``).
    """
    print("request_captcha")
    sel = Selector(response)
    # Kept on the spider because download_captcha reads it back.
    self.captcha_url = sel.xpath('//img[@id="captcha_image"]/@src').extract()

    # Both outgoing requests stay in the same session and hand a 302 to
    # the callback instead of letting Scrapy follow it.
    session_meta = {
        "proxy": proxy,
        "cookiejar": response.meta["cookiejar"],
        "dont_redirect": True,
        "handle_httpstatus_list": [302],
    }

    if self.captcha_url:
        print(self.captcha_url)
        yield scrapy.Request(
            url=self.captcha_url[0],
            headers=self.headers_dict,
            meta=session_meta,
            callback=self.download_captcha,
        )
    else:
        # No captcha on the page: e-mail and password are sufficient.
        yield scrapy.FormRequest(
            url=self.login_url,
            headers=self.headers_dict,
            formdata={
                "form_email": email,
                "form_password": password,
            },
            meta=session_meta,
            callback=self.request_douban,
        )
def download_captcha(self, response):
    """Save the captcha image, ask the user to solve it, and log in.

    The image is written to ``captcha.jpeg``, the answer is read from
    standard input, and the login form is then posted with the answer
    and the captcha id taken from the image URL's query string.

    Args:
        response: The response for the captcha image request; its
            ``meta["cookiejar"]`` is propagated to the login request.

    Yields:
        The login ``FormRequest`` (callback ``request_douban``).

    Raises:
        ValueError: If the captcha URL carries no query parameters, so
            no captcha id can be extracted.
    """
    # Local import: the query-string helpers live in different modules
    # on Python 2 and Python 3.
    try:
        from urllib.parse import parse_qsl, urlparse
    except ImportError:
        from urlparse import parse_qsl, urlparse

    print("download_captcha")
    url = self.captcha_url[0]

    # Prefer the bytes Scrapy already fetched (same proxy, cookies and
    # headers as the rest of the session).  The captcha request passes
    # 302 responses through unfollowed, so fall back to a direct
    # download when the response is not a plain 200.
    if response.status == 200:
        data = response.body
    else:
        data = urllib.urlopen(url).read()
    with open("captcha.jpeg", "wb") as image_file:
        image_file.write(data)

    # Runs the file name as a shell command; presumably this relies on a
    # Windows file association to open an image viewer - confirm on
    # other platforms.
    os.popen('captcha.jpeg')
    print("please input:\n")
    captcha = raw_input()

    query = urlparse(url).query
    print(query)
    params = parse_qsl(query)
    if not params:
        raise ValueError("captcha URL has no query parameters: %s" % url)
    # Take the "id" parameter on its own so trailing parameters such as
    # "&size=s" are not sent as part of the id; fall back to the first
    # parameter's value when none is named "id".
    captcha_id = dict(params).get("id", params[0][1])

    yield scrapy.FormRequest(
        url=self.login_url,
        headers=self.headers_dict,
        formdata={
            "form_email": email,
            "form_password": password,
            "captcha-solution": captcha,
            "captcha-id": captcha_id,
        },
        meta={
            "proxy": proxy,
            "cookiejar": response.meta["cookiejar"],
        },
        callback=self.request_douban,
    )
def request_douban(self, response):
    """Follow a login response with a request for the movie list page.

    Yields one request for ``self.douban_url`` in the same cookie jar as
    ``response``, handled by ``end_login``.  ``dont_filter=True`` is
    passed so the request is not dropped by Scrapy's duplicate filter.
    """
    print("request_douban")
    follow_up_meta = {
        "proxy": proxy,
        "cookiejar": response.meta["cookiejar"],
        "from": {"sign": "else", "data": {}},
    }
    movie_page_request = scrapy.Request(
        self.douban_url,
        callback=self.end_login,
        headers=self.headers_dict,
        meta=follow_up_meta,
        dont_filter=True,
    )
    yield movie_page_request
def end_login(self, response):
    """Dump the received page to stdout and to ``base.html``.

    The raw response body is printed and then written unchanged, in
    binary mode, to ``base.html`` in the current working directory.
    """
    print("end_login")
    page_bytes = response.body
    print(page_bytes)
    with open("base.html", "wb") as html_file:
        html_file.write(page_bytes)
验证码图片可以正常下载,id1 也可以正常打印;但是在向 url = self.login_url 发送登录请求时,即使我的电子邮件和密码都是正确的,仍然返回 404。我的代码是否有问题?另外,表单究竟应该提交到 https://accounts.douban.com/login?source=movie,还是提交到 https://accounts.douban.com/login?