Scrapy - 每个项目抓取4个级别的页面,不能先深入

时间:2017-09-26 00:50:05

标签: python web-scraping scrapy

我想抓取township directory of China。该网站分为4个级别,分别为省页面,城市页面,县页面和乡镇页面。例如,在省页面上,列出了所有省份。如果我们点击一个省的链接,则会将我们带到城市页面,并显示该省的城市列表。

我希望我的每个项目都是一个乡镇。它包括town_name,town_id(gbcode)和相应的county_name,city_name,prov_name。所以当蜘蛛进入乡镇页面时,它应该沿途收集信息。但是,我目前使用for循环的方法似乎不起作用。 prov_name没有问题。但是城市和县名大多不正确,它们始终是相应页面列表中的最后一个城市/县。我认为问题是蜘蛛不够深入,只能在循环结束时转到parse_county请求。但是,在设置中更改深度优先级并不能解决问题。

---------- Sample Result --------
town_name, year, gbcode, city, province, county  
建国门街道办事处,2016,110101008000,市辖区,北京市,延庆区
东直门街道办事处,2016,110101009000,市辖区,北京市,延庆区
和平里街道办事处,2016,110101010000,市辖区,北京市,延庆区
前门街道办事处,2016,110101011000,市辖区,北京市,延庆区
崇文门外街道办事处,2016,110101012000,市辖区,北京市,延庆区



import scrapy
import re
from scrapy.spiders import Spider
from admincode.items import AdmincodeItem

class StatsSpider(Spider):
    """Crawl the 4-level administrative directory (province -> city ->
    county -> town) on stats.gov.cn and yield one item per town.

    Fix for the bug described above: ``response.meta['scraped']`` is a
    single dict shared by every request spawned from one page.  The old
    code mutated it in place inside the row loop, so by the time the
    scheduled requests ran they all saw the LAST row's city/county name.
    Each row now works on its own copy of the accumulated data.
    """

    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        """Entry point: parse the province index page."""
        for item in self.parse_provincetr(response, response.selector.css(".provincetr")):
            yield item

    def get_text_href(self, td):
        """Return ``(text, href)`` for a table cell.

        ``href`` is ``None`` when the cell carries plain text instead of
        a link (leaf rows have no further page to follow).
        """
        if not td.xpath('a'):
            return td.xpath('text()').extract()[0], None
        return td.xpath('a/text()').extract()[0], td.xpath('a/@href').extract()[0]

    def parse_provincetr(self, response, trs):
        """Yield one city-page request per province cell."""
        # The directory year is encoded in the URL path, e.g. .../2009/index.html
        year_pattern = re.compile(r'(tjyqhdmhcxhfdm/)([0-9]{4})')
        year = year_pattern.search(response.url).group(2)
        for td in trs.xpath('td'):
            scraped = {'year': year}
            scraped['prov_name'], href = self.get_text_href(td)
            if href is None:
                # Padding/link-less cell: nothing to follow.
                continue
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_citytr,
                                 meta={'scraped': scraped})

    def parse_2td(self, response, trs, var_name, nextparse):
        """Generic row handler for the 2-column city/county/town tables.

        Stores the second cell's text under ``var_name`` and either
        follows the link (``nextparse`` given) or emits the final item.
        """
        for tr in trs:
            # CRITICAL: copy per row.  meta['scraped'] is shared across all
            # requests from this page; mutating it in place would leave every
            # pending request holding the values of the last row processed.
            scraped = dict(response.meta['scraped'])
            scraped[var_name], href = self.get_text_href(tr.xpath('td')[1])
            if nextparse:
                url = response.urljoin(href)
                yield scrapy.Request(url, callback=nextparse, meta={'scraped': scraped})
            else:
                # Leaf (town) level: assemble the final item.
                item = AdmincodeItem()
                item['year'] = scraped['year']
                item['prov_name'] = scraped['prov_name']
                item['city_name'] = scraped['city_name']
                item['county_name'] = scraped['county_name']
                item['town_name'] = scraped['town_name']
                item['gbcode'], href = self.get_text_href(tr.xpath('td')[0])
                yield item

    def parse_citytr(self, response):
        for city in self.parse_2td(response, response.selector.css(".citytr"), 'city_name', self.parse_countytr):
            yield city

    def parse_countytr(self, response):
        for county in self.parse_2td(response, response.selector.css(".countytr"), 'county_name', self.parse_towntr):
            yield county

    def parse_towntr(self, response):
        for town in self.parse_2td(response, response.selector.css(".towntr"), 'town_name', None):
            yield town

1 个答案:

答案 0(得分:0)

我认为你让事情变得有点复杂。这是一个简单的刮刀,您需要做的是使用meta将信息从一个页面传递到另一个页面。由于meta是内存中的字典,我们需要确保为后续项目创建信息的副本。为此,我们使用copy.deepcopy。这将确保在产生项目之前不会覆盖数据

下面是执行该操作的刮刀

class StatsSpider(Spider):
    """Simplified spider: carry accumulated names down through ``meta``.

    ``meta['item']`` is shared by every request spawned from one page,
    so each row must copy it before adding its own fields.  The dicts
    hold only flat string values, so a plain ``dict()`` copy is
    equivalent to a deep copy — the original snippet called
    ``deepcopy`` without ever importing it, which raises ``NameError``.
    """

    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        """Province index: one request per province link."""
        for item in response.css(".provincetr a"):
            name = item.xpath("./text()").extract_first().strip()
            link = item.xpath("./@href").extract_first().strip()
            yield response.follow(link, callback=self.parse_province, meta={'item': {'province': name}})

    def parse_province(self, response):
        """City list of one province: extend meta with city name/code."""
        meta = response.meta['item']

        for cityrow in response.css(".citytr"):
            city_link = cityrow.xpath("./td[2]/a/@href").extract_first()
            city_name = cityrow.xpath("./td[2]/a/text()").extract_first()
            city_code = cityrow.xpath("./td[1]/a/text()").extract_first()

            # Flat dict of strings: a shallow copy fully isolates this row.
            meta_new = dict(meta)

            meta_new['city_name'] = city_name
            meta_new['city_code'] = city_code

            yield response.follow(city_link, callback=self.parse_city, meta={'item': meta_new})

    def parse_city(self, response):
        """County list of one city: extend meta with county name/code."""
        meta = response.meta['item']

        for countyrow in response.css(".countytr"):
            county_link = countyrow.xpath("./td[2]/a/@href").extract_first()
            county_name = countyrow.xpath("./td[2]/a/text()").extract_first()
            county_code = countyrow.xpath("./td[1]/a/text()").extract_first()

            meta_new = dict(meta)

            meta_new['county_name'] = county_name
            meta_new['county_code'] = county_code

            yield response.follow(county_link, callback=self.parse_county, meta={"item": meta_new})

    def parse_county(self, response):
        """Town list of one county (leaf level): yield the finished dict."""
        meta = response.meta['item']

        for townrow in response.css(".towntr"):
            town_link = townrow.xpath("./td[2]/a/@href").extract_first()
            town_name = townrow.xpath("./td[2]/a/text()").extract_first()
            town_code = townrow.xpath("./td[1]/a/text()").extract_first()

            meta_new = dict(meta)

            meta_new['town_name'] = town_name
            meta_new['town_code'] = town_code

            yield meta_new