用Scrapy建造一棵树

时间:2018-04-05 13:42:05

标签: python web-scraping scrapy

我是网络抓取的新手。

我基本上想要做的是抓取以下网站:https://www.admin.ch/opc/fr/classified-compilation/national.html,它收录了所有瑞士联邦法律。 网站的结构是树状的(你从类别开始:Etat - Peuple - Autorités,Défense nationale 等等;每个类别下包含子类别,直到到达作为叶子节点的法律文本),我希望在JSON文件中得到相同的结构。

这是我刚才写的蜘蛛:

1

辅助函数只是做正则表达式检查,在这里并不重要(因为它们确实能正常工作):

import scrapy
from scrapy.spiders import XMLFeedSpider

from admin_ch.helpers.helper_functions import is_article_number, is_int, is_final_article

class Spider(XMLFeedSpider):
    """Crawl the classified compilation of Swiss federal law on admin.ch.

    The crawl has three levels, one callback per level:

    * ``parse`` reads the top-level listing and follows the first usable
      category only (the early ``return`` is deliberate, for demonstration).
    * ``parse_category`` reads a category page, records leaf entries and
      follows the remaining sub-categories.
    * ``get_article`` reads a sub-category page and records its entries.

    Each callback yields one ``{name: {child_label: value}}`` dict per page.
    The dicts of different pages are NOT merged into one nested tree: child
    pages are fetched asynchronously after the parent item has been emitted.

    NOTE(review): this class overrides ``parse`` and never reads ``itertag``,
    so none of the XML-feed iteration of ``XMLFeedSpider`` appears to be
    used; a plain ``scrapy.Spider`` may be the intended base -- confirm
    against the Scrapy version in use.
    """

    name = "spider_droit_interne"
    allowed_domains = ["admin.ch"]
    start_urls = [
        'https://www.admin.ch/opc/fr/classified-compilation/national.html'
    ]
    itertag = 'item'

    # Rows of the listing table; the same layout is used on every level.
    # NOTE(review): browsers insert <tbody> when rendering; confirm that the
    # raw HTML served to Scrapy actually contains it, otherwise this matches
    # nothing.
    _ROWS_XPATH = '/html/body/div/div[2]/div/div[2]/table/tbody/tr'

    # Marker stored for entries whose URL is_final_article() accepts.
    _LEAF_MARKER = 'leaf reached!!!!!'

    def parse(self, response):
        """Follow the first usable top-level category.

        Rows without a number text, link text or href (e.g. header or
        spacer rows) are skipped instead of raising ``TypeError`` on the
        string concatenation.
        """
        for category in response.xpath(self._ROWS_XPATH):
            cat_number = category.xpath('td/text()').extract_first()
            cat_title = category.xpath('td/a/text()').extract_first()
            href = category.xpath('td/a/@href').extract_first()
            if cat_number is None or cat_title is None or href is None:
                continue
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_category,
                meta={'name': cat_number + ' ' + cat_title},
            )
            return  # Just do the 1st category as an example

    def parse_category(self, response):
        """Yield ``{name: leaves}`` for a category page and follow the rest.

        ``name`` comes from ``response.meta['name']``. Only rows whose
        number passes ``is_int`` and is shorter than three characters are
        considered. Entries accepted by ``is_final_article`` are stored in
        the result dict; every other entry is requested with
        ``get_article`` as callback and produces its own item later.

        The result dict is yielded exactly once, after the loop, so no
        partially filled (and later mutated) dict is emitted as an item.
        """
        name = response.meta['name']
        node = {}
        for category in response.xpath(self._ROWS_XPATH):
            cat_number = category.xpath('td/text()').extract_first()
            cat_title = category.xpath('td/a/text()').extract_first()
            if cat_number is None or cat_title is None:
                continue
            if not (is_int(cat_number) and len(cat_number) < 3):
                continue
            href = category.xpath('td/a/@href').extract_first()
            if href is None:
                # Nothing to classify or follow without a link.
                continue
            cat_name = cat_number + ' ' + cat_title
            url = response.urljoin(href)
            if is_final_article(url):
                node[cat_name] = self._LEAF_MARKER
            else:
                yield scrapy.Request(
                    url,
                    callback=self.get_article,
                    meta={'name': cat_name},
                )
        yield {name: node}

    def get_article(self, response):
        """Yield ``{name: entries}`` for a sub-category page.

        The entry label is the second text node of the row's cells with
        spaces, ``\\n`` and ``\\r`` removed; rows with fewer than two text
        nodes are skipped instead of raising ``IndexError``. Labels that
        fail ``is_article_number`` are ignored. Deeper levels are not
        followed: the content is not important for the moment, only the
        structure.
        """
        name = response.meta['name']
        node = {}
        for category in response.xpath(self._ROWS_XPATH):
            texts = category.xpath('td/text()').extract()
            if len(texts) < 2:
                continue
            cat_number = texts[1]
            for element in (' ', '\n', '\r'):
                cat_number = cat_number.replace(element, '')
            if not is_article_number(cat_number):
                continue
            url = response.urljoin(category.xpath('td/a/@href').extract_first())
            if is_final_article(url):
                node[cat_number] = self._LEAF_MARKER
            else:
                # Placeholder for a non-leaf entry; its page is not crawled.
                node[cat_number] = cat_number
        yield {name: node}

我从中得到的结果是非常错误的,你能帮我解决这个问题吗?

0 个答案:

没有答案