Speed up a scrapy python3 script

Posted: 2018-05-13 20:48:32

Tags: php pdf download scrapy python-3.6

I would like to bulk-download the freely downloadable PDFs (copies of an old newspaper called Gaceta, from 1843 to 1900) from this website of the Nicaraguan National Assembly with Python3/Scrapy (see my earlier question here), using the following script:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A scrapy script to download issues of the Gaceta de Nicaragua (1843-1961)

# virtualenv -p python3 envname
# source envname/bin/activate
# scrapy runspider gaceta_downloader.py

import errno
import json
import os

import scrapy
from scrapy import FormRequest, Request

pwd="/Downloads"
os.chdir(pwd) # this will change directory to pwd path.
print((os.getcwd()))


class AsambleaSpider(scrapy.Spider):
    name = 'asamblea'
    allowed_domains = ['asamblea.gob.ni']
    start_urls = ['http://digesto.asamblea.gob.ni/consultas/coleccion/']

    papers = {
        "Diario Oficial": "28",
    }

    def parse(self, response):
        for key, value in list(self.papers.items()):
            yield FormRequest(url='http://digesto.asamblea.gob.ni/consultas/util/ws/proxy.php',
                  headers= {
                      'X-Requested-With': 'XMLHttpRequest'
                  }, formdata= {
                        'hddQueryType': 'initgetRdds',
                        'cole': value
                    }
                    , meta={'paper': key},
                    callback=self.parse_rdds
                )
        pass

    def parse_rdds(self, response):
        data = json.loads(response.body_as_unicode())
        for r in data["rdds"]:
            r['paper'] = response.meta['paper']
            rddid = r['rddid']
            yield Request("http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=" + rddid,
                          callback=self.download_pdf, meta=r)

    def download_pdf(self, response):
        filename = "{paper}/{anio}/".format(**response.meta) + "{titulo}-{fecPublica}.pdf".format(**response.meta).replace("/", "_")
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        with open(filename, 'wb') as f:
            f.write(response.body)

The script does its job, i.e. it gets the direct links from the file and subsequently downloads the PDFs, but two things still bother me:

  1. I would like to be able to set a time range for the Gacetas I want to download, i.e. all issues (that are available) between 01/01/1844 and 01/01/1900. I tried to figure this out myself, to no avail, since I am a programming novice.
  2. I would like to speed up the script. Maybe with PHP? As of now, even though I have not measured it, it feels rather slow in execution.

1 Answer:

Answer 0: (score: 1)

Disclaimer: I did not test the script, because on my machine scrapy requires Microsoft Visual C++ 14.0, which takes a while to download and install :(

Here is an updated script. I added the date range as start and end, and modified the parse_rdds method so that it only downloads files within that time range.

As for optimizing it: scrapy is a non-blocking library and, as far as I know, it should already be downloading several files in parallel. Keep in mind that you are downloading what looks like a lot of files, which naturally takes a while.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A scrapy script to download issues of the Gaceta de Nicaragua (1843-1961)

# virtualenv -p python3 envname
# source envname/bin/activate
# scrapy runspider gaceta_downloader.py

import errno
import json
import os
from datetime import datetime

import scrapy
from scrapy import FormRequest, Request

pwd="/Downloads"
os.chdir(pwd) # this will change directory to pwd path.
print((os.getcwd()))


# date range, format DD/MM/YYYY
start = '16/01/1844'
end = '01/01/1900'

date_format = '%d/%m/%Y'
start = datetime.strptime(start, date_format)
end = datetime.strptime(end, date_format)


class AsambleaSpider(scrapy.Spider):
    name = 'asamblea'
    allowed_domains = ['asamblea.gob.ni']
    start_urls = ['http://digesto.asamblea.gob.ni/consultas/coleccion/']

    papers = {
        "Diario Oficial": "28",
    }

    def parse(self, response):

        for key, value in list(self.papers.items()):
            yield FormRequest(url='http://digesto.asamblea.gob.ni/consultas/util/ws/proxy.php',
                  headers= {
                      'X-Requested-With': 'XMLHttpRequest'
                  }, formdata= {
                        'hddQueryType': 'initgetRdds',
                        'cole': value
                    }
                    , meta={'paper': key},
                    callback=self.parse_rdds
                )
        pass

    def parse_rdds(self, response):
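        # Parse the JSON listing of issues and only queue a PDF request for
        # issues whose publication date falls inside the start/end range.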
        data = json.loads(response.body_as_unicode())
        for r in data["rdds"]:
            if not r['fecPublica']:
                continue

            r_date = datetime.strptime(r['fecPublica'], date_format)

            if start <= r_date <= end:
                r['paper'] = response.meta['paper']
                rddid = r['rddid']
                yield Request("http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=" + rddid,
                              callback=self.download_pdf, meta=r)

    def download_pdf(self, response):
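        # Save the PDF as <paper>/<anio>/<titulo>-<fecPublica>.pdf, replacing any
        # slashes in the file name itself with underscores.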
       filename = "{paper}/{anio}/".format(**response.meta) + "{titulo}-{fecPublica}.pdf".format(**response.meta).replace("/", "_")
       if not os.path.exists(os.path.dirname(filename)):
           try:
               os.makedirs(os.path.dirname(filename))
           except OSError as exc:  # guard against race condition
               if exc.errno != errno.EEXIST:
                   raise

       with open(filename, 'wb') as f:
           f.write(response.body)
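
If it still feels slow after that, the usual knobs are Scrapy's standard concurrency settings. Below is a minimal sketch of how they could be raised on this spider via the custom_settings class attribute; the setting names are standard Scrapy settings, but the values are only example guesses, not part of the original answer:

import scrapy

class AsambleaSpider(scrapy.Spider):
    name = 'asamblea'
    # custom_settings lets a single spider override project-wide settings.
    custom_settings = {
        'CONCURRENT_REQUESTS': 32,             # global request cap (Scrapy default: 16)
        'CONCURRENT_REQUESTS_PER_DOMAIN': 16,  # per-domain cap (Scrapy default: 8)
        'DOWNLOAD_DELAY': 0,                   # seconds to wait between requests (default: 0)
    }

The same settings can also be overridden from the command line, e.g. scrapy runspider gaceta_downloader.py -s CONCURRENT_REQUESTS=32. Be considerate of the server when raising these limits.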