scrapy traceback 302和404

时间:2016-10-13 16:47:14

标签: python request scrapy response

当我使用Scrapy模拟用户登录豆瓣(douban)网站时遇到了问题。爬虫的初始设置如下:

# Spider identity and the domain the crawl is restricted to.
name = "douban_movie"
allowed_domains = ["douban.com"]

# Page fetched once the login has gone through; start_urls holds the same
# single URL.
douban_url = "https://movie.douban.com/top250"
start_urls = (douban_url,)

# Account endpoints: the login form and the registration page.
login_url = "https://accounts.douban.com/login?source=movie"
login_url2 = "https://accounts.douban.com/register"

下面的代码会先下载验证码(CAPTCHA)图片,然后由我手动输入验证码:

def start_requests(self):
    """Start the crawl by requesting the login page.

    Yields a single request for ``self.login_url`` whose response is handled
    by ``request_captcha``. The request goes through ``proxy`` and uses
    cookie jar ``1``.
    """
    request_meta = {"proxy": proxy, "cookiejar": 1}
    login_page_request = scrapy.Request(
        url=self.login_url,
        headers=self.headers_dict,
        meta=request_meta,
        callback=self.request_captcha,
    )
    yield login_page_request

def request_captcha(self, response):
    """Continue the login flow from the login page response.

    Looks for the CAPTCHA image on the login page and stores the matching
    ``src`` values in ``self.captcha_url``. When an image is present, only
    the image is requested; ``download_captcha`` then submits the login form
    together with the CAPTCHA answer. When no image is present, the login
    form is submitted right away.

    Args:
        response: response for the login page; its ``meta["cookiejar"]`` is
            forwarded to the follow-up request.

    Yields:
        Exactly one request: the CAPTCHA image request or the login form.
    """
    print("request_captcha")
    sel = Selector(response)
    self.captcha_url = sel.xpath('//img[@id="captcha_image"]/@src').extract()

    if self.captcha_url:
        print(self.captcha_url)

        yield scrapy.Request(
            url=self.captcha_url[0],
            headers=self.headers_dict,
            meta={
                "proxy": proxy,
                "cookiejar": response.meta["cookiejar"],
                # Deliver a 302 to the callback instead of following it.
                "dont_redirect": True,
                "handle_httpstatus_list": [302],
            },
            callback=self.download_captcha,
        )
    else:
        # Only log in without a CAPTCHA when the page did not ask for one.
        # Sending this form unconditionally would fire a second login
        # attempt without the CAPTCHA fields on the same cookie jar while
        # the CAPTCHA flow is still in progress.
        yield scrapy.FormRequest(
            url=self.login_url,
            headers=self.headers_dict,
            formdata={
                "form_email": email,
                "form_password": password,
            },
            meta={
                "proxy": proxy,
                "cookiejar": response.meta["cookiejar"],
                # Deliver a 302 to the callback instead of following it.
                "dont_redirect": True,
                "handle_httpstatus_list": [302],
            },
            callback=self.request_douban,
        )

def download_captcha(self, response):
    """Save the CAPTCHA image, ask the user for the answer, and log in.

    The image at ``self.captcha_url[0]`` is written to ``captcha.jpeg``,
    the user types the answer on stdin, and the login form is submitted with
    the answer and the CAPTCHA id taken from the image URL's ``id`` query
    parameter.

    Args:
        response: response for the CAPTCHA image request; its
            ``meta["cookiejar"]`` is forwarded to the login request.

    Yields:
        The login ``FormRequest``, handled by ``request_douban``.

    Raises:
        ValueError: if the CAPTCHA URL has no ``id`` query parameter.
    """
    print("download_captcha")
    url = self.captcha_url[0]

    # The image is fetched again with urllib rather than read from
    # ``response``: the image request disables redirects, so ``response``
    # may be a 302 without the image data.
    remote = urllib.urlopen(url)
    try:
        data = remote.read()
    finally:
        remote.close()

    with open("captcha.jpeg", "wb") as image_file:
        image_file.write(data)

    # NOTE(review): presumably relies on the shell opening the image in its
    # associated viewer (Windows behaviour) -- confirm on other platforms.
    os.popen('captcha.jpeg')
    print("please input:\n")
    captcha = raw_input()

    # Look the id up by parameter name. Splitting the whole query on '='
    # breaks when more parameters follow: "id=TOKEN:en&size=s" would give
    # "TOKEN:en&size" instead of "TOKEN:en".
    captcha_id = None
    for pair in url.partition('?')[2].split('&'):
        key, _, value = pair.partition('=')
        if key == 'id':
            captcha_id = value
            break
    if captcha_id is None:
        raise ValueError("no 'id' parameter in captcha url: %r" % (url,))
    print(captcha_id)

    yield scrapy.FormRequest(
        url=self.login_url,
        headers=self.headers_dict,
        formdata={
            "form_email": email,
            "form_password": password,
            "captcha-solution": captcha,
            "captcha-id": captcha_id,
        },
        meta={
            "proxy": proxy,
            "cookiejar": response.meta["cookiejar"],
        },
        callback=self.request_douban,
    )
def request_douban(self, response):
    """Request the target page once the login form has been answered.

    Yields one request for ``self.douban_url``, handled by ``end_login``.
    The cookie jar of ``response`` is carried over, and ``dont_filter`` is
    set on the request.
    """
    print("request_douban")
    request_meta = {
        "proxy": proxy,
        "cookiejar": response.meta["cookiejar"],
        "from": {"sign": "else", "data": {}},
    }
    target_request = scrapy.Request(
        url=self.douban_url,
        headers=self.headers_dict,
        meta=request_meta,
        callback=self.end_login,
        dont_filter=True,
    )
    yield target_request




def end_login(self, response):
    """Print the body of the final response and save it to ``base.html``.

    Args:
        response: object whose ``body`` attribute holds the raw page bytes.
    """
    print("end_login")
    page = response.body
    print(page)
    # Binary mode: the body is written out exactly as received.
    with open("base.html", "wb") as out_file:
        out_file.write(page)

验证码图片可以正常下载,id1也能正常打印;但是当请求 url = self.login_url 时,即使我的电子邮件和密码都是正确的,仍然返回404。代码是否有问题?或者,表单是否应该提交到另一个网址(https://accounts.douban.com/login),而不是 https://accounts.douban.com/login?source=movie ?

0 个答案:

没有答案