我正在使用scrapy抓取一个大型网站,我正在寻找一种方法来拒绝属性class =“AdvSearchKeyword_clearall”的所有标签。
如果无法拒绝带有class =“AdvSearchKeyword_clearall”的<a>
标记,是否可以解析“AdvSearchKeyword_clearall”以过滤掉具有特定属性的所有链接?
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website
class ComSpider(CrawlSpider):
    """Crawl the target site's /browse/ section and collect page metadata.

    Rule 1 follows /browse/ links and parses them with ``parse_items``.
    Rule 2 follows everything else except URLs matching the ``deny``
    patterns (pagination, sort/facet query parameters, product pages, ...).
    """

    name = "browsepages"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com"]

    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r'/browse/',)),
            callback="parse_items",
            follow=True,
        ),
        Rule(
            SgmlLinkExtractor(
                allow=(),
                unique=True,
                # Raw strings throughout: the originals relied on Python
                # passing unrecognized escapes such as '\?' through, which
                # is deprecated. Also escape the literal dot in
                # 'browse-ng.do' so it no longer matches any character.
                deny=(
                    r'/[1-9]$',
                    r'(bti=)[1-9]+(?:\.[1-9]*)?',
                    r'(sort_by=)[a-zA-Z]',
                    r'(sort_by=)[1-9]+(?:\.[1-9]*)?',
                    r'(ic=32_)[1-9]+(?:\.[1-9]*)?',
                    r'(ic=60_)[0-9]+(?:\.[0-9]*)?',
                    r'(search_sort=)[1-9]+(?:\.[1-9]*)?',
                    r'browse-ng\.do\?',
                    r'/page/',
                    r'/ip/',
                    r'out\+value',
                    r'fn=',
                    r'customer_rating',
                    r'special_offers',
                    r'search_sort=&',
                    r'facet=',
                ),
            ),
        ),
    )

    def parse_items(self, response):
        """Build one Website item per response.

        Records the URL, the Referer header, the canonical <link>, and
        the robots <meta> directive.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        # '//html' matches the document root exactly once, so this loop
        # runs a single iteration; kept for parity with the original.
        for site in hxs.select('//html'):
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            # Use .select() consistently: the original mixed .xpath/.select,
            # and old HtmlXPathSelector objects only guarantee .select().
            item['canonical'] = site.select('//head/link[@rel="canonical"]/@href').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        return items
答案 0（得分：3）
您可以使用以下代码覆盖SgmlLinkExtractor
:
class ExtendedSgmlLinkExtractor(SgmlLinkExtractor):
    """SgmlLinkExtractor that can limit extraction to XPath-selected regions.

    Despite the name, ``deny_xpaths`` holds XPaths that *select the markup
    to keep*: only links found inside the matched fragments are extracted.
    To drop links with a given class, pass an XPath that excludes them,
    e.g. ``//a[not(contains(@class, "AdvSearchKeyword_clearall"))]``.
    When ``deny_xpaths`` is non-empty it takes precedence over
    ``restrict_xpaths`` (same as the original behavior, where the second
    branch silently overwrote the first).
    """

    def __init__(self, deny_xpaths=(), *args, **kwargs):
        # Store before delegating so extract_links() can read it.
        self.deny_xpaths = deny_xpaths
        super(ExtendedSgmlLinkExtractor, self).__init__(*args, **kwargs)

    def extract_links(self, response):
        """Extract links from the response, honoring deny_xpaths/restrict_xpaths.

        Returns the processed list of Link objects.
        """
        from scrapy.utils.response import get_base_url

        base_url = None
        # deny_xpaths wins over restrict_xpaths. The original computed the
        # restrict_xpaths body and then threw it away when deny_xpaths was
        # set; selecting the effective XPath list up front keeps the same
        # result without the wasted work.
        xpaths = self.deny_xpaths or self.restrict_xpaths
        if xpaths:
            sel = Selector(response)
            base_url = get_base_url(response)
            body = u''.join(
                fragment
                for xpath in xpaths
                for fragment in sel.xpath(xpath).extract()
            ).encode(response.encoding)
        else:
            body = response.body

        links = self._extract_links(body, response.url, response.encoding, base_url)
        return self._process_links(links)
然后将其与包含not()
运算符的xpath一起使用,在您的情况下 - //a[not(contains(@class, 'AdvSearchKeyword_clearall'))]
如:
rules = (Rule(ExtendedSgmlLinkExtractor(…, deny_xpaths=("//a[not(contains(@class, 'AdvSearchKeyword_clearall'))]",))),)
虽然该代码尚未经过测试,但无论如何都请反馈。