Scrapy python规则不起作用

时间:2016-11-06 16:39:05

标签: python scrapy

我可以抓取 craigslist 的第一页,但是 LinkExtractor 没有从后续页面获取数据。我在定义规则时做错了什么吗?

import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    """Spider from the question: scrapes the first results page only.

    The single rule is meant to follow the "next" pagination link and
    parse every subsequent page, but no further pages are scraped.
    """

    name = "craiglist"
    # NOTE(review): start_urls below is on "craigslist.org" (with an "s"),
    # but this entry is spelled "craiglist.org". Followed links are
    # presumably dropped as off-site because of the mismatch -- confirm.
    allowed_domains = ["craiglist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    rules = [
         # NOTE(review): the callback is named 'parse'. The Scrapy docs warn
         # that CrawlSpider uses parse() itself to implement the rules, so
         # overriding it (see the method below) stops the rules from working.
         Rule(LinkExtractor(restrict_xpaths='//a[@class="button next"]'),     callback='parse', follow= True)
    ]

    def parse(self, response):
        """Return one craiglistItem per row of the results list."""
        # Each matched <p> is one row under the #sortable-results list.
        titles = response.selector.xpath('//*[@id="sortable-results"]/ul/li/p')
        items = []
        for title in titles:
            item = craiglistItem()
            # extract() returns a list of strings, so both fields hold lists.
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items

1 个答案:

答案 0 :(得分:0)

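我修改了代码,现在可以正常工作了。主要改动有:把 allowed_domains 中的域名更正为 craigslist.org;回调函数不再命名为 parse(CrawlSpider 内部用 parse 来实现规则),而是改为 parse_items;并通过 parse_start_url 处理起始页。以下是可以正常运行的代码。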

import scrapy
from craiglist.items import craiglistItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request


class ExampleSpider(CrawlSpider):
    """Crawl the sfbay craigslist ``/search/npo`` listing page by page.

    The single rule follows the "next" pagination button and hands every
    page it reaches to :meth:`parse_items`. The start page itself is not
    matched by any rule, so it is handled through :meth:`parse_start_url`.
    """

    name = "craiglist"
    allowed_domains = ["craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    rules = [
        # The callback must not be called ``parse``: CrawlSpider implements
        # its rule handling in ``parse`` and overriding it disables the rules.
        Rule(
            LinkExtractor(restrict_xpaths='//a[@class="button next"]'),
            callback="parse_items",
            follow=True,
        ),
    ]

    def parse_start_url(self, response):
        """Scrape the start page with the same logic as every other page.

        The response passed in is already the downloaded start page, so it
        is parsed directly rather than requesting the same URL a second time.

        Args:
            response: The response for one of ``start_urls``.

        Returns:
            The list of items produced by :meth:`parse_items`.
        """
        return self.parse_items(response)

    def parse_items(self, response):
        """Build one ``craiglistItem`` per row of the results list.

        Args:
            response: A listing page response.

        Returns:
            A list of ``craiglistItem`` objects. ``title`` and ``link`` each
            hold the list of strings returned by ``extract()``.
        """
        rows = response.xpath('//*[@id="sortable-results"]/ul/li/p')
        items = []
        for row in rows:
            item = craiglistItem()
            # ``xpath`` replaces the deprecated ``Selector.select`` method.
            item["title"] = row.xpath("a/text()").extract()
            item["link"] = row.xpath("a/@href").extract()
            items.append(item)
        return items