I am trying to start a Scrapy crawler from a request that carries a few parameters:
msg_req_obj = MessageRequestObject(azureServiceBus=self.azure_service_bus,
                                   sbReqQ=self.sb_request_queue,
                                   sbResQ=self.sb_response_queue,
                                   session=message_body['session'],
                                   studyName=message_body['studyName'],
                                   studyId=message_body['studyId'],
                                   strategyId=message_body['strategyId'],
                                   requestId=message_body['requestId'],
                                   email=message_body['email'],
                                   crawlDepth=message_body['crawlDepth'],
                                   crawlPageCount=message_body['crawlPageCount'],
                                   sites=site_obj_array,
                                   msg=message)
The message essentially carries the first URL used to start the spider, plus two settings that change with every spider that gets created: crawlDepth and crawlPageCount.
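For reference, the incoming message_body looks roughly like this (the values below are made-up placeholders; only crawlDepth and crawlPageCount really vary between requests):
message_body = {
    'session': '<session token>',
    'studyName': '<study name>',
    'studyId': '<study id>',
    'strategyId': '<strategy id>',
    'requestId': '<request id>',
    'email': '<notification email>',
    'crawlDepth': 3,        # example value, set per request
    'crawlPageCount': 100   # example value, set per request
}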
I have the following ways of getting settings. In the project settings.py:
DEPTH_LIMIT = 3
DEPTH_STATS_VERBOSE = True
In a config_settings.py file, which includes some settings meant to override settings.py; the depth limit, for example, is read like this:
def depth_limit(self):
    _default_depth_limit = 4
    if (self._depth_limit):
        try:
            return int(self._depth_limit)
        except Exception as ex:
            logger.error('"DEPTH_LIMIT" is not a number in application settings. Using default value "' + str(_default_depth_limit) + '"')
            return _default_depth_limit
    else:
        print('"DEPTH_LIMIT" not found/empty in application settings. Using default value "' + str(_default_depth_limit) + '"')
        return _default_depth_limit
In custom_settings in the spider, which overrides the settings coming from settings.py and the config_settings.py file:
custom_settings = {
    'ROBOTSTXT_OBEY': configurationSettings.obey_robotstxt,
    'DEPTH_LIMIT': configurationSettings.depth_limit,
    'DOWNLOAD_DELAY': configurationSettings.download_delay_for_requests,
    'CLOSESPIDER_PAGECOUNT': configurationSettings.max_responses_to_crawl
}
And finally through get_project_settings(), which retrieves the default settings; I then update them with the settings.update() method using the values from msg_req_obj, and start the CrawlerRunner with the updated settings:
def crawl_sites(self, blob_config, site_urls, msg_req_obj):
print("SPIDER STARTED")
print(site_urls)
s = get_project_settings()
s.update({
"DEPTH_LIMIT" : msg_req_obj.crawlDepth,
"MAX_RESPONSES_TO_CRAWL" : msg_req_obj.crawlPageCount,
})
self.runner = CrawlerRunner(s)
self.runner.crawl(GenericSpider,
blobConfig=blob_config,
msgReqObj=msg_req_obj,
urls=site_urls)
deferred = self.runner.join()
deferred.addBoth(lambda _: reactor.stop())
reactor.run()
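To confirm that the updated values really land in the Settings object before the runner is created, I can add a quick check right after s.update(...) (the print below is only a debugging sketch, not part of the actual flow):
print("DEPTH_LIMIT =", s.getint("DEPTH_LIMIT"),
      "stored at priority", s.getpriority("DEPTH_LIMIT"))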
This last approach does effectively change the settings that get passed to the CrawlerRunner, but the spider never loads them and starts with DEPTH_LIMIT=1 instead.
I have tried hardcoding different values of DEPTH_LIMIT through every other mechanism (settings.py, config_settings.py and custom_settings), but none of them seem to take effect: the spider always crawls items up to a depth of 1 before stopping and closing. It looks as if the spider is not picking up any of these settings and is instead "defaulting" to DEPTH_LIMIT = 1.
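One way I can see which value the spider actually runs with is to log it from inside GenericSpider.__init__ (debugging sketch only; crawler here is the argument that is already passed into __init__):
# debugging only: log the DEPTH_LIMIT the engine will actually enforce for this spider
logger.info('Effective DEPTH_LIMIT: %s', crawler.settings.getint('DEPTH_LIMIT'))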
What am I missing? Is there some other step I need to take to make this work?
EDIT:
Here is the code for my CrawlProcess class:
class CrawlProcess(object):
    """description of class"""

    def __init__(self, blob_service, blob_service_output_container_name):
        """
        Constructor
        """
        self.blob_service = blob_service
        self.blob_service_output_container_name = blob_service_output_container_name
        settings_file_path = 'scrapy_app.scrapy_app.settings' # The path seen from root, i.e. from crawlProcess.py
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
        self.runner = ''

    def spider_closing(self, spider):
        """Activates on spider closed signal"""
        print("STOPPING SPIDER")
        self.runner.join()

    def crawl_sites(self, blob_config, site_urls, msg_req_obj):
        print("SPIDER STARTED")
        print(site_urls)
        s = get_project_settings()
        s.update({
            "DEPTH_LIMIT" : msg_req_obj.crawlDepth,
            "MAX_RESPONSES_TO_CRAWL" : msg_req_obj.crawlPageCount,
        })
        self.runner = CrawlerRunner(s)
        self.runner.crawl(GenericSpider,
                          blobConfig=blob_config,
                          msgReqObj=msg_req_obj,
                          urls=site_urls)
        deferred = self.runner.join()
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()

    def start_process(self, site_urls, msg_req_obj):
        blob_config = BlobConfig(blob_service=self.blob_service, blob_container_name=self.blob_service_output_container_name,)
        crawl_sites_process = mp.Process(target=self.crawl_sites, args=(blob_config, site_urls, msg_req_obj), daemon=True)
        print("STARTING SPIDER")
        crawl_sites_process.start()
        crawl_sites_process.join()
        print("SPIDER STOPPED")
        print("ENGINE STOPPED")
And here is the code for my GenericSpider:
import scrapy
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
try:
    from scrapy_app.scrapy_app.items import HtmlItem
except ImportError:
    from scrapy_app.items import HtmlItem
import re
import os
import json
from scrapy_splash.response import SplashJsonResponse
from scrapy.spiders import CrawlSpider, Rule
from scrapy_app.scrapy_app.utils import get_domain_from_url, get_subdomain_from_url
import logging
from config_settings import ConfigurationSettings
from scrapy_splash import SplashRequest

logger = logging.getLogger(__name__)


class GenericSpider(CrawlSpider):
    extractor = LinkExtractor()
    crawl_depth = 0
    name = 'generic'

    configurationSettings = ConfigurationSettings.getInstance()
    handle_httpstatus_list = configurationSettings.handle_http_statuses
    handle_httpstatus_all = configurationSettings.handle_all_http_statuses

    custom_settings = {
        'ROBOTSTXT_OBEY': configurationSettings.obey_robotstxt,
        'DEPTH_LIMIT': configurationSettings.depth_limit,
        'DOWNLOAD_DELAY': configurationSettings.download_delay_for_requests,
        'CLOSESPIDER_PAGECOUNT': configurationSettings.max_responses_to_crawl
    }

    logger.setLevel(logging.INFO)
    logging.basicConfig(
        filename='scraping.log',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def __init__(self, crawler, *args, **kwargs):
        self.crawler = crawler
        self.blobConfig = kwargs.get('blobConfig')
        self.msgReqObj = kwargs.get('msgReqObj')
        self.urls = kwargs.get('urls')

        self.allowed_domains = [urlparse(url).netloc for url in self.urls]
        self.start_urls = self.urls

        self.proxy_pool = self.configurationSettings.proxies
        self.suggestedKeywords = self.configurationSettings.suggested_keywords

        self.rules = [Rule(LinkExtractor(allow=(), allow_domains=self.allowed_domains,
                                         canonicalize=True, unique=True,),
                           follow=True, callback="parse_item", process_request="use_splash_request"),]
        self._follow_links = True
        self._domain = ""

        self.failed_urls_dict = {}
        for httpstatus in self.handle_httpstatus_list:
            self.failed_urls_dict[httpstatus] = []

        super(GenericSpider, self).__init__(crawler, *args, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # settings = crawler.settings
        return cls(crawler, *args, **kwargs)

    def parse_item(self, response):
        # if self.handle_httpstatus_all or response.status not in self.handle_httpstatus_list:  # Without this line, ALL HTTP Responses are handled
        item = self._get_item(response)
        yield item

    def _get_item(self, response):
        children = []

        # Get parameters from the Response
        _domain = response.meta['url_domain'] if 'url_domain' in response.meta else get_domain_from_url(response.url)
        _subdomain = get_subdomain_from_url(response.url, _domain)
        _comparableId = response.meta['comparableId'] if 'comparableId' in response.meta else 'NA'
        root = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(response.url))

        _html = response.text
        base_tag = response.css("head base").extract()
        if not base_tag:
            _html = _html.replace("</head>", "<base href=\"" + root + "/\"></head>")

        # Populate Child pages List
        links = self.extractor.extract_links(response)
        [children.append(link.url) for link in links]

        item = HtmlItem(
            url=response.url,
            domain=_domain,
            subdomain=_subdomain,
            html=_html,
            description='',
            title='',
            is_suggested=str(False),
            comparable_id=str(_comparableId),
            is_error=str(False) if 200 <= response.status < 300 else str(True),
            http_status=response.status,
            crawl_depth=response.meta['depth'],
            child_pages=children
        )

        self._set_title(item, response)
        self._set_description(item, response)
        self._is_suggested(item)

        return item

    def _set_title(self, item, response):
        if isinstance(response, SplashJsonResponse) or response.meta['isFirstPage'] == True:
            title = response.css("title::text").extract()
            if title:
                item['title'] = title[0].encode("utf-8")
        else:
            pass

    def _set_description(self, item, response):
        if isinstance(response, SplashJsonResponse):
            meta_description = response.css("meta[name=\"description\"]::attr(content)").extract()
            if meta_description:
                item['description'] = meta_description[0].encode("utf-8")

    def _is_suggested(self, item):
        # logger.info('TITLE-DESCRIPTION:- %(title)s ==> %(desc)s', {'title': item['title'], 'desc': item['description']})
        _title = item['title'].decode("utf-8") if item['title'] else ''
        _description = item['description'].decode("utf-8") if item['description'] else ''
        try:
            if any(re.search(r'\b' + sug_kwd + r'\b', _title, re.IGNORECASE) for sug_kwd in self.suggestedKeywords) \
                    or any(re.search(r'\b' + sug_kwd + r'\b', _description, re.IGNORECASE) for sug_kwd in self.suggestedKeywords):
                item['is_suggested'] = str(True)
        except Exception as ex:
            template = "GenericSpider:- An exception of type {0} occurred. Arguments:\n{1!r}"
            ex_message = template.format(type(ex).__name__, ex.args)
            print(ex_message)