我正在尝试抓取一个以 BASE_URL/id 格式提供文件的站点,该站点会返回 302 重定向,指向实际文件或登录页面(后者我现在暂时跳过)。我的基本爬虫可以正常工作,但速度相当慢(约 20 页/分钟)。按理说 AutoThrottle 扩展应该可以解决这个问题,但我没有观察到任何效果。我是不是哪里做错了?
我正在将Scrapy 2.1.0与Python 3.8.2结合使用。
这是我当前的代码:
class DocsSpider(scrapy.Spider):
    """Probe ``BASE_URL/<id>`` for a range of numeric ids, follow acceptable
    302 redirects to the real file, and save each downloaded file to disk.

    Redirects whose target URL contains any entry of ``SKIP_WORDS`` (e.g. a
    login page) are dropped. Responses whose path ends in ``.html`` are
    ignored.

    Spider arguments (passed as strings on the command line):
        start: first id to probe (default ``"0"``).
        count: number of consecutive ids to probe (default ``"1000"``).
    """

    name = "docs"
    # Handle 302 ourselves instead of letting RedirectMiddleware follow it,
    # so login-page redirects can be filtered out before a second request.
    handle_httpstatus_list = [302]
    custom_settings = {
        # NOTE(review): AutoThrottle only slows requests *down* toward the
        # target concurrency; it never raises the hard limits. If the crawl
        # is too slow, raise CONCURRENT_REQUESTS / CONCURRENT_REQUESTS_PER_DOMAIN
        # and lower AUTOTHROTTLE_START_DELAY (default 5s), or disable
        # AutoThrottle entirely.
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 10,
    }

    def __init__(self, start="0", count="1000", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start = int(start)
        self.count = int(count)

    def start_requests(self):
        """Yield a cheap HEAD probe for every id in [start, start + count)."""
        for i in range(self.start, self.start + self.count):
            yield scrapy.Request(url=BASE_URL + str(i), method='HEAD',
                                 callback=self.parse)

    def parse(self, response):
        """Follow permitted 302 redirects; otherwise save the response body.

        Yields either a follow-up ``scrapy.Request`` (for a redirect) or an
        item dict describing the saved file.
        """
        if response.status == 302:
            location_header = response.headers.get('Location')
            if location_header is None:
                return
            location = safe_url_string(location_header)
            # Drop redirects that lead to login or other unwanted pages.
            if any(w in location for w in SKIP_WORDS):
                return
            # Lazy %-args: formatting only happens if DEBUG is enabled.
            self.logger.debug("redirect %r -> %r", response.url, location)
            yield scrapy.Request(response.urljoin(location),
                                 callback=self.parse)
            return

        path = urlparse(response.url).path
        if path.endswith(".html"):
            return
        filename = os.path.basename(path)
        if not filename:
            # Path ends in '/': open('') would raise, so skip explicitly.
            self.logger.warning("no filename in %r; skipping", response.url)
            return
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.logger.debug('Saved file %s', filename)
        content_type = response.headers.get("Content-Type")
        ct = content_type.decode('utf-8') if content_type is not None else "<unknown>"
        yield {
            'filename': filename,
            'path': path,
            'url': response.url,
            'content-type': ct,
        }