I want to use a CrawlSpider with rules so that every page gets crawled, but I can't get any of the data that should be extracted by the rule's callback, and I don't know why. Here is my spider:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem
class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=1"]

    rules = (
        # Extracting pages, allowing only links with page=number to be extracted
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]',), allow=('page=\d+',)), follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        item = DelhiveryItem()
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item['title'] = response.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            # item['subtitle'] = response.xpath('<write xpath>').extract()[0]
            # item['date'] = response.xpath('<write xpath>').extract()[0].strip()
            # item['username'] = response.xpath('<write xpath>').extract()[0]
            # item['link'] = response.url
            # item['data'] = response.xpath('<write xpath>').extract()[0]
            items.append(item)
        return items
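For reference, this is roughly the shape I think the callback should have: each field read relative to the matched table (using `site.xpath` instead of `response.xpath`) and one item yielded per complaint. This is only a minimal sketch under the assumption that `DelhiveryItem` declares `title` and `link` fields; the XPaths for the other fields are still missing.

    def parse_item(self, response):
        # one <table width="100%"> per complaint entry (assumption)
        sites = response.xpath('//table[@width="100%"]')
        for site in sites:
            item = DelhiveryItem()
            # relative XPath, evaluated against this table only, not the whole page
            title = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()
            if not title:
                continue  # skip tables that are not complaint entries
            item['title'] = title[0]
            item['link'] = response.url
            yield item

Is something along these lines correct, or is the problem in the rule itself?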