我想用Captcha形式抓取一个网站,似乎Captcha在页面加载时附加了会话cookie,意味着cookie从头开始创建而不是在提交表单之后,在第一个表单之后应该加载主要登录表单也有验证码,我以同样的方式处理它。 为了处理Captcha我使用PIL和pytesseract,它不能读取所有Captchas但它没关系因为我也下载图像并在pytesseract失败的情况下手动输入。 日志上收到和发送的cookie看起来是一样的,但我仍然得到错误的Captcha。 可能有什么不对? 这是我的代码:
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import io
import urllib.request
from PIL import Image
import pytesseract
from scrapy.http.cookies import CookieJar
class CaptchaSpider(scrapy.Spider):
name = "captcha"
start_urls = ['http://example.com/']
def parse(self, response):
cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
cookieJar.extract_cookies(response, response.request)
request = Request('http://example.com/', callback= self.parse_captcha,
meta = {'dont_merge_cookies': True, 'cookie_jar': cookieJar})
cookieJar.add_cookie_header(request)
yield request
def parse_captcha(self, response):
captcha_url = response.urljoin(response.css('img').xpath('@src').extract_first())
url_opener = urllib.request.build_opener()
img_bytes = url_opener.open(captcha_url).read()
img = Image.open(io.BytesIO(img_bytes))
img.save("captcha", "JPEG")
captcha = pytesseract.image_to_string(img)
if not captcha:
captcha = input('Could not detect Captcha, enter it manually: ')
return scrapy.FormRequest.from_response(
response, formdata={'captcha': captcha},
meta={'cookie_jar': response.meta['cookie_jar']},
callback=self.after_first_captcha)
def after_first_captcha(self, response):
if b"errors" in response.body:
print("Captcha failed --->>> ", response.xpath('//li').extract())
return
else:
print("Captcha succeeded")
return Request(url="http://example.com/",
meta={'cookie_jar': response.meta['cookie_jar']},
callback=self.parse_login_captcha)