I made a spider with Scrapy that, before visiting the main site I want to scrape, first solves a CAPTCHA at the address it gets redirected to. It says I have an HTTP error that causes an infinite loop, but I can't find which part of the script is causing it.
In the middleware:
import logging
import time
import urllib.request

from bs4 import BeautifulSoup
from PIL import Image
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware

logger = logging.getLogger(__name__)


class ProtectRedirectMiddleware(RedirectMiddleware):
    def __init__(self, settings):
        super().__init__(settings)
        self.source = urllib.request.urlopen('http://sampleurlname.com/')
        self.soup = BeautifulSoup(self.source, 'lxml')

    def _redirect(self, redirected, request, spider, reason):
        # act normally if this isn't a CAPTCHA redirect
        if not self.is_protected(redirected.url):
            return super()._redirect(redirected, request, spider, reason)
        # if this is a CAPTCHA redirect
        logger.debug(f'The protect URL is triggered for {request.url}')
        request.cookies = self.bypass_protection(redirected.url)
        request.dont_filter = True
        return request

    def is_protected(self, url):
        return 'sampleurlname.com/protect' in url

    def bypass_protection(self, url=None):
        # only navigate if an explicit url is provided
        if url:
            url = url or self.source.geturl()
            img = self.soup.find_all('img')[0]
            imgurl = img['src']
            urllib.request.urlretrieve(imgurl, "captcha.png")
            return self.solve_captcha(imgurl)
        # wait for the redirect and try again
        self.wait_for_redirect()
        return self.bypass_protection()

    def wait_for_redirect(self, url=None, wait=0.1, timeout=10):
        url = self.url
        for i in range(int(timeout // wait)):
            time.sleep(wait)
            if self.response.url() != url:
                return self.response.url()
        logger.error(f"Maybe {self.response.url()} isn't a redirect URL")
        raise Exception('Timed out')

    def solve_captcha(self, img, width=150, height=50):
        # open image
        self.img = 'captcha.png'
        img = Image.open("captcha.png")
        # image manipulation - simplified
        # input the captcha text - simplified
        # click the submit button - simplified
        # save the URL
        url = self.response.url()
        # try again if wrong
        if self.is_protected(self.wait_for_redirect(url)):
            return self.bypass_protection()
        # return the cookies as a dict
        cookies = {}
        for cookie_string in self.response.css.cookies():
            if 'domain=sampleurlname.com' in cookie_string:
                key, value = cookie_string.split(';')[0].split('=')
                cookies[key] = value
        return cookies
Then, this is the error I get when I run scrapy crawl on my spider:
Unhandled error in Deferred:
2018-08-06 16:34:33 [twisted] CRITICAL: Unhandled error in Deferred:
2018-08-06 16:34:33 [twisted] CRITICAL:
Traceback (most recent call last):
File "/username/anaconda/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "/username/anaconda/lib/python3.6/site-packages/scrapy/crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/username/anaconda/lib/python3.6/site-packages/scrapy/core/engine.py", line 69, in __init__
self.downloader = downloader_cls(crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/core/downloader/__init__.py", line 88, in __init__
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/middleware.py", line 36, in from_settings
mw = mwcls.from_crawler(crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/downloadermiddlewares/redirect.py", line 26, in from_crawler
return cls(crawler.settings)
File "/username/...../scraper/myscraper/myscraper/middlewares.py", line 27, in __init__
self.source = urllib.request.urlopen('http://sampleurlname.com/')
File "/username/anaconda/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 564, in error
result = self._call_chain(*args)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 756, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 532, in open
It basically repeats the bottom part of this over and over: open, http_response, error, _call_chain, and http_error_302, until this finally appears at the end:
File "/username/anaconda/lib/python3.6/urllib/request.py", line 746, in http_error_302
self.inf_msg + msg, headers, fp)
urllib.error.HTTPError: HTTP Error 307: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Temporary Redirect
And in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'myscrape.middlewares.ProtectRedirectMiddleware': 600,
}
Answer (score: 1):
Your issue has nothing to do with Scrapy itself. You are making a blocking request in the middleware's initialization, and that request is getting stuck in a redirect loop. This usually happens when a website misbehaves and requires cookies to let you through: the server sets cookies in the redirect response's headers, and only a request that sends those cookies back gets through. Python's urllib doesn't handle cookies by default, so try this:
import logging
import urllib.error
import urllib.request
from http.cookiejar import CookieJar

from scrapy.selector import Selector

def __init__(self, settings):
    super().__init__(settings)
    url = 'http://sampleurlname.com/'  # the URL you were opening in __init__
    try:
        req = urllib.request.Request(url)
        cj = CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        response = opener.open(req)
        source = response.read().decode('utf8', errors='ignore')
        response.close()
    except urllib.error.HTTPError as e:
        logging.error(f"couldn't initiate middleware: {e}")
        return
    # you should use scrapy selectors instead of beautiful soup here
    # soup = BeautifulSoup(source, 'lxml')
    selector = Selector(text=source)
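If you still want to hand cookies back to Scrapy the way bypass_protection does (a plain dict assigned to request.cookies), the CookieJar above already holds them. A minimal sketch, assuming you keep the jar on the instance (self.cookiejar = cj is not in the original code, and cookies_for_domain is a hypothetical helper name):

def cookies_for_domain(self, domain='sampleurlname.com'):
    # hypothetical helper: turn the CookieJar kept in __init__
    # (self.cookiejar = cj, an assumed attribute) into the dict
    # that request.cookies expects
    cookies = {}
    for cookie in self.cookiejar:  # a CookieJar is iterable over Cookie objects
        if domain in cookie.domain:
            cookies[cookie.name] = cookie.value
    return cookies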
Alternatively, you could use the requests package, which handles cookies on its own.
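For example, a requests.Session keeps any cookies the server sets during the redirect chain and sends them back on later requests. A rough sketch of the same initial fetch (using the placeholder URL from the question, not the answer's exact code):

import requests

# a Session stores cookies set anywhere in the redirect chain
session = requests.Session()
response = session.get('http://sampleurlname.com/', timeout=10)
source = response.text
# the collected cookies as a plain dict, e.g. to assign to request.cookies
cookies = session.cookies.get_dict()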