我正在尝试抓取一个网站的所有内部链接,但遇到了一些问题:这些链接似乎是通过 JavaScript 生成的,因此无法用 scrapy 抓取到。 https://www.vecteezy.com/vector-art/274468-wavy-lines-pastel-background
是该站点的一个示例页面。我尝试运行以下代码,但未能获取到页面上“此图片出现在搜索中”部分的链接/标签:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem
class DatabloggerSpider(CrawlSpider):
    """Crawl www.vecteezy.com and record the on-site links found on each page.

    Every page reached through ``rules`` is handed to ``parse_items``, which
    emits one ``DatabloggerScraperItem`` per extracted link whose URL
    mentions one of ``allowed_domains``.

    NOTE(review): ``LinkExtractor`` works on the downloaded response body, so
    links that the site injects with JavaScript after page load are
    presumably not visible to this spider -- confirm against the target
    pages and consider a rendering step if those links are required.
    """

    # Spider name used on the command line: ``scrapy crawl vecteezycrawl``.
    name = "vecteezycrawl"
    # Requests to any other domain are dropped by the offsite filter.
    allowed_domains = ["www.vecteezy.com"]
    # Entry point(s) of the crawl.
    start_urls = ["https://www.vecteezy.com"]

    # Follow every link found on every page and pass each fetched page to
    # ``parse_items``.
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True,
            ),
            follow=True,
            callback="parse_items",
        )
    ]

    def start_requests(self):
        """Yield the initial request for every URL in ``start_urls``.

        ``dont_filter=True`` lets the start URLs bypass the duplicate filter.
        """
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_items(self, response):
        """Build one item per on-site link found in ``response``.

        Args:
            response: The downloaded page to extract links from.

        Returns:
            A list of ``DatabloggerScraperItem`` objects, each holding the
            page URL (``url_from``) and a linked URL (``url_to``).
        """
        items = []
        links = LinkExtractor(
            canonicalize=True,
            unique=True,
        ).extract_links(response)

        # Iterate over every extracted link; without this loop ``link`` is
        # never bound and the callback fails with NameError.
        for link in links:
            # Evaluated per link so one allowed link cannot whitelist the
            # ones that follow it. This is a substring match on the URL.
            is_allowed = any(
                allowed_domain in link.url
                for allowed_domain in self.allowed_domains
            )
            if not is_allowed:
                continue

            item = DatabloggerScraperItem()
            item["url_from"] = response.url
            item["url_to"] = link.url
            items.append(item)

        return items
我想要获取的链接,就是您在该图片下方“此图片出现在搜索中”部分看到的标签:柔和的背景、柔和的背景、抽象、波浪、线条、对角线、无缝等。
经过一些更新之后,我似乎可以抓取到更多的内容了,但是现在它仍然不会爬取整个网站?以下是抓取完成后输出的统计信息:
INFO: Dumping Scrapy stats:
{'downloader/exception_count': 10746,
'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 10742,
'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 4,
'downloader/request_bytes': 7457136941,
'downloader/request_count': 233585,
'downloader/request_method_count/GET': 233585,
'downloader/response_bytes': 929365695,
'downloader/response_count': 233581,
'downloader/response_status_count/200': 23618,
'downloader/response_status_count/301': 23,
'downloader/response_status_count/302': 6,
'downloader/response_status_count/400': 183088,
'downloader/response_status_count/500': 26806,
'downloader/response_status_count/504': 40,
'dupefilter/filtered': 1797531,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 2, 7, 21, 11, 10, 143192),
'httperror/response_ignored_count': 184160,
'httperror/response_ignored_status_count/400': 183088,
'httperror/response_ignored_status_count/500': 1072,
'item_scraped_count': 2066100,
'log_count/DEBUG': 2311531,
'log_count/INFO': 184277,
'offsite/domains': 30,
'offsite/filtered': 505814,
'request_depth_max': 211,
'response_received_count': 207778,
'retry/count': 25778,
'retry/max_reached': 1072,
'retry/reason_count/500 Internal Server Error': 25734,
'retry/reason_count/504 Gateway Time-out': 40,
'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 4,
'scheduler/dequeued': 244326,
'scheduler/dequeued/memory': 244326,
'scheduler/enqueued': 244326,
'scheduler/enqueued/memory': 244326,
'start_time': datetime.datetime(2019, 2, 7, 19, 21, 28, 203714)}