Chaining three functions in Scrapy

Posted: 2018-02-07 19:23:29

Tags: python web-scraping scrapy

I am writing a Scrapy spider with three functions. The first function collects links; the second function follows each of those links and extracts further links from the pages they lead to; and the third function extracts data from the pages reached via the links collected by the second function. Please help me with this.

# -*- coding: utf-8 -*-
import scrapy
#from scrapy.http import Request

class ScotlandSpider(scrapy.Spider):
    name = 'scotland'
    allowed_domains = ['www.whoownsscotland.org.uk']
    start_urls = ['http://www.whoownsscotland.org.uk/search.php']

    def parse(self, response):
        links = response.xpath('//p/a/@href').extract()
        for link in links:
            absoulute_url = response.urljoin(link)
            yield scrapy.Request(absoulute_url , callback=self.parse_links)


    def parse_links(self , response):
        cities = response.xpath('//*[@id="layout-right"]/table/tr/td/p/a/@href').extract()
        for city in cities:
            absoulute_url_new = response.urljoin(city)
            yield scrapy.Request(absoulute_url_new , callback=self.parse_cities)

    def parse_cities(self , response):
        record = response.xpath('//*[@id="layout-left"]/table/tr')

        estate =  record[0].xpath('.//th/text()').extract()
        courty =  record[1].xpath('.//th/text()').extract()
        grid_ref =  record[2].xpath('.//th/text()').extract()
        acreage =  record[3].xpath('.//th/text()').extract()
        os_15 =  record[4].xpath('.//th/text()').extract()
        owner  = record[5].xpath('.//th/text()').extract() 
        owner_address = record[6].xpath('.//th/text()').extract()
        property_address = record[7].xpath('.//th/text()').extract()
        website  = record[8].xpath('.//th/text()').extract()
        further_info = record[9].xpath('.//td//text()').extract()
        contacts = record[10].xpath('.//td//text()').extract()
        regsiters_sheet = record[11].xpath('.//td//text()').extract()
        regsiters_certificate = record[12].xpath('.//td//text()').extract()
        currency_of_data = record[13].xpath('.//td//text()').extract()



        yield {
            "Estate": estate,
            "County": courty,
            "Grid Reference": grid_ref,
            "Acreage": acreage,
            "OS 1:50k Sheet": os_15,
            "Owner": owner,
            "Owner Address": owner_address,
            "Property Address": property_address,
            "Website": website,
            "Further Information": further_info,
            "Contacts": contacts,
            "Registers of Scotland Sasines Search Sheet No": regsiters_sheet,
            "Registers of Scotland Land Certificate No": regsiters_certificate,
            "Currency of Data": currency_of_data
        }

1 answer:

Answer 0 (score: 0)

Below is the complete code showing how to pass data between the various functions; note the meta argument of Request().

# -*- coding: utf-8 -*-
import scrapy
#from scrapy.http import Request

class ScotlandSpider(scrapy.Spider):
    name = 'scotland'
    allowed_domains = ['www.whoownsscotland.org.uk']
    start_urls = ['http://www.whoownsscotland.org.uk/search.php']

    def parse(self, response):
        links = response.xpath('//p/a/@href').extract()
        for link in links:
            absoulute_url = response.urljoin(link)

            # bundle the data to forward to the next callback via meta
            data_to_pass = {"absoulute_url": absoulute_url}

            yield scrapy.Request(absoulute_url, callback=self.parse_links, meta={'data': data_to_pass})


    def parse_links(self , response):

        # data forwarded from parse() via meta
        passed_data = response.meta['data']

        cities = response.xpath('//*[@id="layout-right"]/table/tr/td/p/a/@href').extract()
        for city in cities:

            # record which city link this request came from
            passed_data['city'] = city

            absoulute_url_new = response.urljoin(city)
            # pass a copy so each city request carries its own snapshot of the data
            yield scrapy.Request(absoulute_url_new, callback=self.parse_cities, meta={'data': passed_data.copy()})

    def parse_cities(self , response):

        # data forwarded from parse_links() via meta
        passed_data = response.meta['data']

        record = response.xpath('//*[@id="layout-left"]/table/tr')

        estate =  record[0].xpath('.//th/text()').extract()
        courty =  record[1].xpath('.//th/text()').extract()
        grid_ref =  record[2].xpath('.//th/text()').extract()
        acreage =  record[3].xpath('.//th/text()').extract()
        os_15 =  record[4].xpath('.//th/text()').extract()
        owner  = record[5].xpath('.//th/text()').extract() 
        owner_address = record[6].xpath('.//th/text()').extract()
        property_address = record[7].xpath('.//th/text()').extract()
        website  = record[8].xpath('.//th/text()').extract()
        further_info = record[9].xpath('.//td//text()').extract()
        contacts = record[10].xpath('.//td//text()').extract()
        regsiters_sheet = record[11].xpath('.//td//text()').extract()
        regsiters_certificate = record[12].xpath('.//td//text()').extract()
        currency_of_data = record[13].xpath('.//td//text()').extract()

        passed_data["Estate"] = estate
        passed_data["County"] = courty
        passed_data["Grid Reference"] = grid_ref
        passed_data["Acreage"] = acreage
        passed_data["OS 1:50k Sheet"] = os_15
        passed_data["Owner"] = owner
        passed_data["Owner Address"] = owner_address
        passed_data["Property Address"] = property_address
        passed_data["Website"] = website
        passed_data["Further Information"] = further_info
        passed_data["Contacts"] = contacts
        passed_data["Registers of Scotland Sasines Search Sheet No"] = regsiters_sheet
        passed_data["Registers of Scotland Land Certificate No"] = regsiters_certificate
        passed_data["Currency of Data"] = currency_of_data