So I built a Scrapy spider that crawls all the internal links of a website. However, when I run it, some sites have large sections that have little to do with the site's actual content. For example, one site runs Jenkins, and my spider spends a lot of time exploring those pages even though they are irrelevant to the site.
One approach would be to build a blacklist and add paths such as the Jenkins ones to it, but I'd like to know whether there is a better way to handle this.
import csv
from urllib.parse import urlparse

import scrapy
from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.item import BaseItem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader


class MappingItem(dict, BaseItem):
    pass


class WebsiteSpider(scrapy.Spider):
    name = "Website"

    def __init__(self):
        item = MappingItem()
        self.loader = ItemLoader(item)
        self.filter_urls = list()

    def start_requests(self):
        filename = "filename.csv"
        try:
            with open(filename, 'r') as csv_file:
                reader = csv.reader(csv_file)
                header = next(reader)
                for row in reader:
                    seed_url = row[1].strip()
                    base_url = urlparse(seed_url).netloc
                    self.filter_urls.append(base_url)
                    request = Request(seed_url, callback=self.parse_seed)
                    request.meta['base_url'] = base_url
                    yield request
        except IOError:
            raise CloseSpider("A list of websites is needed")

    def parse_seed(self, response):
        base_url = response.meta['base_url']
        # handle external redirect while still allowing internal redirect
        if urlparse(response.url).netloc != base_url:
            return

        # collect links that leave the current site but point to one of the seed sites
        external_le = LinkExtractor(deny_domains=base_url)
        external_links = external_le.extract_links(response)
        for external_link in external_links:
            if urlparse(external_link.url).netloc in self.filter_urls:
                self.loader.add_value(base_url, external_link.url)

        # follow links that stay on the current site
        internal_le = LinkExtractor(allow_domains=base_url)
        internal_links = internal_le.extract_links(response)
        for internal_link in internal_links:
            request = Request(internal_link.url, callback=self.parse_seed)
            request.meta['base_url'] = base_url
            request.meta['dont_redirect'] = True
            yield request
Answer 0 (score: 0)
It sounds like the link extractor's deny_domains parameter is exactly the kind of "blacklist" you are after: a list of domains whose links should not be followed:
deny_domains (str or list) – a single value or a list of strings containing domains which won't be considered for extracting the links
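For example, here is a minimal sketch of how the internal link extractor in parse_seed could take such a blacklist into account. The blacklisted_domains and blacklisted_paths values are made up for illustration; in the question's spider they would be populated next to filter_urls and used wherever internal_le is built:

from scrapy.linkextractors import LinkExtractor

# Hypothetical blacklist values, for illustration only.
blacklisted_domains = ["jenkins.example.com"]   # whole hosts to skip
blacklisted_paths = [r"/jenkins/"]              # regex patterns for paths to skip

internal_le = LinkExtractor(
    allow_domains="example.com",       # stay inside the site being crawled
    deny_domains=blacklisted_domains,  # never follow links to these domains
    deny=blacklisted_paths,            # never follow URLs matching these regexes
)

Note that the related deny parameter takes regular expressions that extracted URLs must not match, which covers the case where the unwanted pages (e.g. a Jenkins instance under a /jenkins/ path) live on the same domain as the rest of the site rather than on a separate host.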