获得以下代码:
from scrapy.http import Request, FormRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
import subprocess
class LoginSpider(CrawlSpider):
name = 'loginspider'
login_page = 'http://145.100.108.148/login5/login.php'
start_urls = ['http://145.100.108.148/login5/index.php']
url = 'http://145.100.108.148'
username = 'test@hotmail.com'
password = 'test'
def init_request(self):
return Request(url=self.login_page, callback=self.start_requests)
def start_requests(self):
yield Request(
url = self.login_page,
callback = self.login,
dont_filter = True
)
def login(self, response):
print('\n Response object here')
print(response)
if response.css("#captcha").extract_first() is not None:
captchaImageLink = self.url + response.css("#captcha::attr(src)").extract_first()
yield Request(captchaImageLink, callback = self.saveCaptchaImage)
def saveCaptchaImage(self, response):
output = open("image.png","wb")
output.write(response.body)
output.close()
print('\n Response object here, it')
print(response)
captcha = 'abcdef'
print ("\n Login is here! \n")
return FormRequest.from_response(response,
formdata={ 'email': self.username,
'pass': self.password,
'CaptchaCode': captcha},
callback=self.check_login_response)
def check_login_response(self, response):
print ("\n Check_login_response \n")
if b"Learn" in response.body:
print("Worked, logged in")
#return self.parse_item
else:
print("Not logged in")
return
执行此代码后,python将发出错误; 'Response' object has no attribute 'encoding'
。
原因是,formdata
包含尝试填充对象的属性,例如encoding
。
但是,自从我尝试下载图片后,该对象已从<200 http://145.100.108.148/login5/login.php>
更改为<200 http://145.100.108.148/login5/simple-php-captcha.php?_CAPTCHA&t=0.65214000+1517496701>
如何在不执行其他请求的情况下将对象更改回原始状态?因为我想留在同一个会话中解决验证码。
答案 0 :(得分:0)
必须按以下方式保存:
self.loginpage = response
return FormRequest.from_response(self.loginpage,
formdata={ 'email': self.username,
'pass': self.password,
'CaptchaCode': captcha},
callback=self.check_login_response)
那是