How to crawl second-level or nested pagination in Scrapy

Date: 2017-08-16 19:08:53

Tags: python scrapy scrapy-spider

I am scraping a paginated list of Catalogs, and that part works fine.

However, for the paginated DataSet list inside each Catalog, only the first page shows up in the results. I am trying to get output like the example below, except that all 24 nodes should be present, corresponding to the catalog's 24 datasets spread across pages of 6 items each.

[{'data_sets_count': 24,
  'description': 'The catalog contains data regarding various indicators of '
                 'HMIS like Health, Abortions, Immunisation, AEFI, Adolescent, '
                 'Bite, Sting, Disease, Diarrhoeal, Hypertension, HIV, AIDS, '
                 'Malaria, Neurological, Stroke, Fever, Respiratory, '
                 'Infection, suicide, Trauma, Accident, Burn, Tuberculosis, '
                 'VHND, ASHA, JSY, CHC, PHC, SDH, DH, Hospital.',
  'last_updated': '11/08/17',
  'ministry_department': 'Ministry of Health and Family Welfare, Department of '
                         'Health and Family Welfare',
  'nodes': [{'node': '3183861',
             'title': 'Item-wise report for North Goa of Goa upto '
                      'April-2014-15'},
            {'node': '3183881',
             'title': 'Item-wise report for North Goa of Goa upto May-2014-15'},
            {'node': '3183981',
             'title': 'Item-wise report for North Goa of Goa upto '
                      'October-2014-15'},
            {'node': '3184021',
             'title': 'Item-wise report for North Goa of Goa upto '
                      'December-2014-15'},
            {'node': '3184061',
             'title': 'Item-wise report for North Goa of Goa upto '
                      'February-2014-15'},
            {'node': '3183961',
             'title': 'Item-wise report for North Goa of Goa upto '
                      'September-2014-15'}],
  'state_department': None,
  'title': 'HMIS sub district level item-wise monthly report of Goa',
  'url': '/catalog/hmis-sub-district-level-item-wise-monthly-report-goa'}]

import scrapy

class Category(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    ministry_department = scrapy.Field()
    description = scrapy.Field()
    state_department = scrapy.Field()
    last_updated = scrapy.Field()
    data_sets_count = scrapy.Field()
    data_sets = scrapy.Field()
    item = scrapy.Field()
    nodes = scrapy.Field()

class CatalogSpider(scrapy.Spider):
    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            request = scrapy.Request(response.urljoin(category['url']), callback=self.parseDataSets)
            request.meta['item'] = category
            yield request

        for next_page in response.css('li.pager-next > a'):
            yield response.follow(next_page, self.parse)


    def parseDataSets(self, response):
        item = response.meta['item']

        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
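            # the first CSS class token on the .csv link is the numeric node id
            # (e.g. '3183861', as seen in the expected output above)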
            item['nodes'].append({
                'node' : dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0],
                'title' : dataset.css('.views-field-title .field-content .title-content::text').extract_first()
                })

        for next_page in response.css('li.pager-next'):
            print('here')
            request = scrapy.Request(response.urljoin(next_page.css('a::attr(href)').extract_first()), callback=self.parseDataSets)
            request.meta['item'] = item
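            # note: the request above is built but never yielded, so pages
            # of datasets beyond the first are never fetched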

        yield item

1 Answer:

Answer 0 (score: 0)

I got it working with the code below, though I am not sure it is the correct approach. I append each DataSet to the category item carried in the request's meta, yield None for intermediate pages, and finally yield the category once the last page is reached. It feels a bit hacky, but it works for now.

import scrapy

class Category(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    ministry_department = scrapy.Field()
    description = scrapy.Field()
    state_department = scrapy.Field()
    last_updated = scrapy.Field()
    data_sets_count = scrapy.Field()
    data_sets_actual_count = scrapy.Field()
    data_sets = scrapy.Field()
    item = scrapy.Field()
    nodes = scrapy.Field()

class CatalogSpider(scrapy.Spider):
    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            request = scrapy.Request(response.urljoin(category['url']), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request

        #for next_page in response.css('li.pager-next > a'):
        #    yield response.follow(next_page, self.parse)


    def parse_data_sets(self, response):
        category = response.meta['category']
        datasets = response.css('.view-resource-detail-popup > div > .views-row')
        if datasets:
            for dataset in datasets:
                node = dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0]
                title = dataset.css('.views-field-title .field-content .title-content::text').extract_first()
                url = 'https://data.gov.in/node/' + node + '/download'
                category['nodes'].append({
                    'node' : node,
                    'title' : title,
                    'url' : url
                    })
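                # Scrapy silently ignores None results; the finished
                # category is only yielded from the last page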
                yield None
        else:
            yield category

        if len(response.css('li.pager-next').extract()) == 0:
            category['data_sets_actual_count'] = len(category['nodes'])
            yield category

        #pagination
        for next_page in response.css('li.pager-next'):
            request = scrapy.Request(response.urljoin(next_page.css('a::attr(href)').extract_first()), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request
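
For reference, a slightly tidier variant of the same idea is to yield the follow-up request while a next page exists and the accumulated category only otherwise. This is an untested sketch against the same selectors; it avoids both the yield None placeholder and the chance of emitting the item twice when a page has no datasets and no pager:

    def parse_data_sets(self, response):
        category = response.meta['category']

        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
            node = dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0]
            category['nodes'].append({
                'node': node,
                'title': dataset.css('.views-field-title .field-content .title-content::text').extract_first(),
                'url': 'https://data.gov.in/node/' + node + '/download',
            })

        next_page = response.css('li.pager-next a::attr(href)').extract_first()
        if next_page:
            # more dataset pages: keep accumulating on the same category item
            request = scrapy.Request(response.urljoin(next_page), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request
        else:
            # last page reached: emit the finished item exactly once
            category['data_sets_actual_count'] = len(category['nodes'])
            yield category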

One of my problems was that I had set the depth limit too low in my command; I later changed it to a bigger number. On a domain I did not know well, this caused seemingly random issues:

 scrapy parse --spider=catalogspider -d 60 'https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1'
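
The -d flag of scrapy parse caps the crawl depth for that one command. Assuming a regular Scrapy project, the equivalent for normal scrapy crawl runs is the DEPTH_LIMIT setting, which can be pinned on the spider itself so deep pagination is not cut short there either:

    class CatalogSpider(scrapy.Spider):
        name = 'catalogspider'
        # 0 means unlimited depth; each followed pagination link counts as
        # one level of depth, so deep pagination needs a generous limit
        custom_settings = {'DEPTH_LIMIT': 60}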