有经验的爬虫开发者能否推荐一下如何解决 403 错误、让爬虫正常完成抓取?
我尝试使用 Selenium,但无济于事。我还尝试在发出的每个请求中传递以下标头:
headers= {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
# -*- coding: utf-8 -*-
import os
import re

import scrapy
import wget
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline
class LivelinguaCrawlerSpider(scrapy.Spider):
    """Crawl livelingua.com and download each course's e-books.

    Flow: project index page -> one request per language course listing ->
    one request per course -> e-book links downloaded with ``wget`` into a
    per-language directory. The target directory travels through
    ``request.meta['dir']`` from callback to callback.
    """

    name = 'livelingua_crawler'
    allowed_domains = ['www.livelingua.com']
    start_urls = ['https://www.livelingua.com/project/']

    def parse(self, response):
        """Extract language links and schedule one request per language."""
        language_links = response.css("div.col-md-4 a::attr(href)").getall()
        # NOTE(review): the [2:4] slice restricts the crawl to two languages,
        # presumably for testing — widen or drop it for a full crawl.
        for link in language_links[2:4]:
            # Links look like ".../courses/<language>/"; capture everything
            # after "courses" and strip the surrounding slashes.
            match = re.match(r'(.*)(?<=courses)(.*)', link)
            if match is None:
                # Not a course-listing URL; skip instead of raising
                # AttributeError on .group() as the original would.
                continue
            language = match.group(2)[1:-1]
            dir_path = "redacted/" + language
            # exist_ok=True replaces the try/except FileExistsError dance.
            os.makedirs(dir_path, exist_ok=True)
            # Carry the directory in request.meta only. The original also set
            # self.dir_path here, but that instance attribute is clobbered by
            # later loop iterations before any callback runs, so it was at
            # best dead and at worst a bug waiting to be read.
            request = response.follow(link, self.parseCourses)
            request.meta['dir'] = dir_path
            yield request

    def parseCourses(self, response):
        """Follow every course link on a language page, forwarding the dir."""
        courses = response.css("span.thumb-info-caption h6 a::attr(href)").getall()
        dir_path = response.meta['dir']
        for course in courses:
            request = response.follow(course, self.parseEBooks)
            request.meta['dir'] = dir_path
            yield request

    def parseEBooks(self, response):
        """Download each e-book link into the language directory.

        NOTE(review): ``wget`` sends no browser-like User-Agent, which is the
        likely cause of the HTTP 403 seen below — Scrapy's FilesPipeline with
        custom headers (see the accepted answer) is the robust alternative.
        """
        eBooks = response.css("div.row:first-child ul li a::attr(href)").getall()
        for eBook in eBooks:
            wget.download(eBook, response.meta['dir'])
            yield {"ebook": eBook}
尝试wget时出现以下错误:
wget.download(eBook, response.meta['dir'])
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/wget.py", line 526, in download
(tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 247, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 531, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 569, in error
return self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
答案 0(得分:0)
正如 gangabass 所建议的,这里应当使用 Scrapy 的文件管道(Files Pipeline),做法很简单:
添加文件管道并在 settings.py 中设置包含要下载的网址的字段:
# Enable Scrapy's built-in files pipeline at the default priority.
ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}

# Item field that holds the URL(s) of the files to download.
FILES_URLS_FIELD = "ebook"
正如Roland Weber所说,当前尝试下载时出现的错误代码可能与标头有关。为了解决这个问题,我建议创建一个从FilesPipeline继承并覆盖get_media_requests方法的类:
class DownloadEbooksPipeline(FilesPipeline):
    """FilesPipeline variant that sends browser-like headers on downloads.

    The plain FilesPipeline issues requests without a browser User-Agent,
    which the e-book host rejects with HTTP 403 Forbidden; overriding
    ``get_media_requests`` lets us attach the headers below.
    """

    # Headers copied from a real browser session; User-Agent is the one
    # that matters for getting past the 403.
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-GB,en;q=0.9,nl-BE;q=0.8,nl;q=0.7,ro-RO;q=0.6,ro;q=0.5,en-US;q=0.4',
    }

    def get_media_requests(self, item, info):
        """Yield one download Request (with headers) per URL in the item.

        The spider above yields ``{"ebook": "<url>"}`` — a bare string.
        Iterating that directly (as the original did) would issue one
        request per *character*, so a lone string is wrapped in a list.
        """
        urls = item.get(self.files_urls_field, [])
        if isinstance(urls, str):
            urls = [urls]
        for ebook_url in urls:
            yield Request(url=ebook_url, headers=self.headers)
如果采用这种方式,则必须在settings.py中更改ITEM_PIPELINES才能引用包含此类的文件。