Making 'start_urls' a variable

Time: 2016-07-17 17:13:12

Tags: python scrapy

The following spider, with a fixed start_urls, works:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from funda.items import FundaItem

class PropertyLinksSimpleSpider(CrawlSpider):

    name = "property_links_simple"
    allowed_domains = ["funda.nl"]

    # def __init__(self, place='amsterdam', page='1'):
    #     self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
    #     self.le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % self.start_urls[0])

    start_urls = ["http://www.funda.nl/koop/amsterdam/"]
    le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % start_urls[0])
    # rules = (Rule(le1, callback='parse_item'), )

    def parse(self, response):
        links = self.le1.extract_links(response)
        for link in links:
            # Keep only top-level listing URLs (six '/' characters and a
            # trailing slash); other extracted links are skipped.
            if link.url.count('/') == 6 and link.url.endswith('/'):
                item = FundaItem()
                item['url'] = link.url
                yield item

When I run it with feed output using the command scrapy crawl property_links_simple -o property_links.json, the resulting file contains the expected links:

[
{"url": "http://www.funda.nl/koop/amsterdam/huis-49708477-paul-schuitemahof-27/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49826458-buiksloterdijk-270/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49818887-markiespad-19/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49801910-claus-van-amsbergstraat-86/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49801593-jf-berghoefplantsoen-2/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49800159-breezandpad-8/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49805292-nieuwendammerdijk-21/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49890140-talbotstraat-9/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49879212-henri-berssenbruggehof-15/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49728947-emmy-andriessestraat-374/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49713458-jan-vrijmanstraat-29/"}
]

However, I would like to be able to pass a different start_urls to the spider, e.g. http://www.funda.nl/koop/rotterdam/p2/. To this end, I tried to adapt it as follows:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from funda.items import FundaItem

class PropertyLinksSimpleSpider(CrawlSpider):

    name = "property_links_simple"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam', page='1'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
        self.le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % self.start_urls[0])

    # start_urls = ["http://www.funda.nl/koop/amsterdam/"]
    # le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % start_urls[0])
    # rules = (Rule(le1, callback='parse_item'), )

    def parse(self, response):
        links = self.le1.extract_links(response)
        for link in links:
            if link.url.count('/') == 6 and link.url.endswith('/'):
                item = FundaItem()
                item['url'] = link.url
                yield item

However, if I run it with the command scrapy crawl property_links_simple -a place=amsterdam -a page=1 -o property_links2.json, I get an empty .json file:

[
[

Why does the spider no longer yield any output?

1 Answer:

Answer 0 (score: 1)

It turned out to be a simple human error: in the second example, start_urls[0] is no longer the same, so the pattern handed to the LinkExtractor no longer matches the listing URLs.
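
To see the mismatch, it helps to expand the pattern by hand. A minimal sketch using the standard re module and a listing URL taken from the output above:

import re

# Working version: the pattern is built from the bare city URL.
pattern_old = r'%s+huis-\d{8}' % "http://www.funda.nl/koop/amsterdam/"

# Parametrised version: the page suffix 'p1/' sneaks into the pattern.
pattern_new = r'%s+huis-\d{8}' % "http://www.funda.nl/koop/amsterdam/p1/"

url = "http://www.funda.nl/koop/amsterdam/huis-49708477-paul-schuitemahof-27/"
print(bool(re.search(pattern_old, url)))  # True:  listing URLs match
print(bool(re.search(pattern_new, url)))  # False: they do not contain 'p1/'

I added self.base_url to make the pattern the same again: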

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from funda.items import FundaItem

class PropertyLinksSimpleSpider(CrawlSpider):

    name = "property_links_simple"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam', page='1'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        self.le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % self.base_url)

    def parse(self, response):
        links = self.le1.extract_links(response)
        for link in links:
            if link.url.count('/') == 6 and link.url.endswith('/'):
                item = FundaItem()
                item['url'] = link.url
                yield item

This makes the spider produce the desired .json file.
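
The spider can now be pointed at other cities and pages as well; for example (the output file name here is just an illustration):

scrapy crawl property_links_simple -a place=rotterdam -a page=2 -o property_links_rotterdam.json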