假设这是我的代码
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from dmoz.items import DmozItem
class DmozSpider(BaseSpider):
    """Spider that turns list entries on two dmoz.org Python pages into items.

    The crawl starts from the URLs in ``start_urls``; each downloaded page is
    handed to :meth:`parse`.
    """

    domain_name = "dmoz.org"
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """Build one ``DmozItem`` per node matched by ``//ul[2]/li``.

        Args:
            response: The downloaded page, wrapped in an ``HtmlXPathSelector``.

        Returns:
            A list of ``DmozItem`` objects. Each item's ``title``, ``link``
            and ``desc`` fields hold whatever ``.extract()`` returns for the
            relative XPaths ``a/text()``, ``a/@href`` and ``text()``.
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[2]/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items
# Module-level instance of DmozSpider.
# NOTE(review): presumably this is how older Scrapy releases located the
# spider in a module -- confirm against the Scrapy version in use.
SPIDER = DmozSpider()
如果我使用 CrawlSpider,就可以通过规则(rules)配合链接提取器来跟进链接;但在 BaseSpider 中(就像上面的例子那样),我该如何使用规则呢?因为规则只适用于 CrawlSpider,而不适用于 BaseSpider。
答案 0 :(得分:0)
也许您可以先在 parse 回调中检查响应是否满足规则条件,然后为满足条件的响应创建新请求,并交给第二个回调处理?伪代码如下:
def parse(self, response):
    """Pseudo-code: follow a link only when the response meets a rule.

    ``rule`` is a placeholder for the caller's own check on ``response``; it
    is not defined in this snippet and must be computed where the ``...``
    stands.

    Returns:
        A ``Request`` for the follow-up URL with ``self.parse2`` as its
        callback when ``rule`` is truthy; otherwise the method falls through
        and returns ``None``.
    """
    # check response for rule criteria
    ...
    if rule:
        # create new request to pass to second callback
        req = Request("http://www.example.com/follow", callback=self.parse2)
        return req
def parse2(self, response):
    """Pseudo-code: second callback, which receives the followed response.

    Wraps ``response`` in an ``HtmlXPathSelector``; the actual extraction
    logic is left to the reader.
    """
    hxs = HtmlXPathSelector(response)
    # do stuff with the successful response