What's wrong with my spider?

Time: 2015-01-30 05:33:07

Tags: scrapy web-crawler

I want to use a CrawlSpider with rules to crawl all the pages, but I can't get any of the data the rule is supposed to extract, and I don't know why.

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem


class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=1"]

    rules = (
        # Follow pagination links, allowing only hrefs that match page=<number>
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]',), allow=(r'page=\d+',)), follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        item = DelhiveryItem()
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item['title'] = response.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            # item['subtitle'] = response.xpath('<write xpath>').extract()[0]
            # item['date'] = response.xpath('<write xpath>').extract()[0].strip()
            # item['username'] = response.xpath('<write xpath>').extract()[0]
            #item['link'] = response.url
            # item['data'] = response.xpath('<write xpath>').extract()[0]
            items.append(item)
        return items
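
Two things stand out in parse_item. Every field is read with response.xpath(...) instead of site.xpath(...), so each loop iteration re-extracts the same first match from the whole page rather than from the current table; and a single DelhiveryItem is created before the loop, so every entry appended to items references the same object. A minimal sketch of a corrected parse_item (untested, reusing the question's DelhiveryItem and reinstating the commented-out link field) might look like this:

    def parse_item(self, response):
        # Build one fresh item per matching table, querying each field
        # relative to the per-table selector rather than the whole response.
        items = []
        for site in response.xpath('//table[@width="100%"]'):
            item = DelhiveryItem()
            # .// keeps the XPath scoped to this table only
            title = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()
            if not title:
                continue  # skip tables without a complaint cell
            item['title'] = title[0].strip()
            item['link'] = response.url
            items.append(item)
        return items

As a side note, on Scrapy 1.0 and later the scrapy.contrib paths and SgmlLinkExtractor are deprecated; from scrapy.linkextractors import LinkExtractor is the replacement and accepts the same restrict_xpaths and allow arguments.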

0 Answers:

No answers yet.