I have implemented a custom duplicates filter (dupefilter) to filter out duplicate URLs while crawling with Scrapy, but the request_seen method does not appear to be called for every URL.
The filter reads and writes a txt file so that a single list of unique URLs is maintained across separate crawls. However, it is not invoked for each URL. I picked one such URL and added print statements to trace its path: the print statements in the spider are printed to the console, but the ones in the filter are not. My code is below.
Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import SBItem


class SBSpider(CrawlSpider):
    name = "sales-betty"
    allowed_domains = ["www.med.nyu.edu"]
    start_urls = ["http://www.med.nyu.edu/az-departments-divisions"]
    rules = (Rule(LinkExtractor(), callback='parse_url', follow=True), )

    # Register the custom dupefilter for this spider only
    custom_settings = {
        'DEPTH_LIMIT': 1,
        'DUPEFILTER_CLASS': 'salesbetty_scrapy.duplicate_filter.CustomFilter'
    }

    def parse_url(self, response):
        # Extract every on-domain link from the page and emit one item per URL
        for link in LinkExtractor(allow=self.allowed_domains, unique=True).extract_links(response):
            item = SBItem()
            item["url"] = link.url
            if link.url == "http://www.med.nyu.edu/school/about-us":
                print "FOUND ABOUT US IN CORE SCRAPER"
                print link.url
            yield item
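For completeness: the filter is only enabled through custom_settings above. Assuming spider-level custom_settings and project settings are interchangeable here, I take it the project-wide equivalent in settings.py would be something like:

# settings.py -- assumed project-level equivalent of the spider's custom_settings
DEPTH_LIMIT = 1
DUPEFILTER_CLASS = 'salesbetty_scrapy.duplicate_filter.CustomFilter'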
Dupefilter
import os

from scrapy.dupefilter import RFPDupeFilter


class CustomFilter(RFPDupeFilter):
    """A dupe filter that persists visited URLs to a txt file across crawls."""

    def request_seen(self, request):
        if request.url == "http://www.med.nyu.edu/school/about-us":
            print "FOUND ABOUT US"
        visited_urls = self.get_unique_urls()
        if request.url in visited_urls:
            if request.url == "http://www.med.nyu.edu/school/about-us":
                print "FOUND ABOUT US IN VISITED"
            return True
        else:
            self.add_url(request.url)
            if request.url == "http://www.med.nyu.edu/school/about-us":
                print "FOUND ABOUT US FOR FIRST TIME"
            return False

    def get_unique_urls(self):
        urls = []
        if os.path.exists("visited_urls.txt"):
            with open("visited_urls.txt") as f:
                urls = f.readlines()
        visited = set()
        for url in urls:
            # Strip the trailing newline so entries compare equal to request.url
            visited.add(url.strip())
        return visited

    def add_url(self, url):
        with open("visited_urls.txt", 'a') as f:
            f.write(url + "\n")
        if url == "http://www.med.nyu.edu/school/about-us":
            print "FOUND ABOUT US AND ADDED"