我想使用scrapy抓取完整的网站,但现在只抓取单页
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter
class IzodspiderSpider(scrapy.Spider):
name = 'izodspider'
allowed_domains = ['izod.com']
start_urls = ['http://izod.com/']
rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]
def parse(self, response):
hxs = scrapy.Selector(response)
meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
有没有办法使用portia提取元标记?
答案 0 :(得分:0)
规则定义和回调内部存在错误。
由于您使用的解析函数是 parse_item ,您必须在回调中调用它而不是解析
文档中找到有关回调函数的更多信息class IzodspiderSpider(CrawlSpider):
name = "izod"
depth_limit= 0
bot_name = 'izod'
allowed_domains = ['izod.com']
start_urls = ['http://www.izod.com']
rules = (
Rule(SgmlLinkExtractor(allow=('')), callback='parse_items',follow= True),
)
def parse_items(self, response):
hxs = scrapy.Selector(response)
meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()