使用PyQt4从URL下载数千个PDF

时间:2017-01-15 20:30:23

标签: python pdf pyqt4

我正在尝试从给定的网站下载数千个PDF。但是,出于某种原因,它甚至无法下载100个PDF文件。我不知道为什么。这是代码:

#!/usr/bin/env python
import time
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from xhtml2pdf import pisa
import sys 
from PyQt4.QtCore import *
from PyQt4.QtGui import*
from PyQt4.QtWebKit import *

class Foo(QWidget):
    """Render a list of property pages to PDF using a pool of QWebViews.

    The identifiers to fetch are read from ``input.txt`` (one per line).
    Each view in the pool loads one page at a time; when a view reports
    ``loadFinished`` the page it just loaded is printed to its PDF file and
    the view is handed the next unprocessed identifier.

    Attributes:
        count: Index of the most recently dispatched entry of ``params``
            (starts at -1, i.e. nothing dispatched yet).
        params: Non-blank identifiers read from ``input.txt``.
        url: URL template; ``{0}`` is replaced by the identifier.
        mapper: QSignalMapper translating a view's ``loadFinished`` signal
            into that view's integer slot number.
        grabbers: The QWebView pool, kept so the views stay referenced.
        pending: Maps a view's slot number to the PDF file name of the page
            that view is currently loading.
    """

    # Number of QWebView workers created up front.
    POOL_SIZE = 100

    def __init__(self, parent=None):
        """Read the work list and start every view in the pool.

        Args:
            parent: Optional parent widget, forwarded to QWidget.
        """
        super(Foo, self).__init__(parent)

        self.count = -1
        # Close the file deterministically, and drop blank lines (such as the
        # one produced by a trailing newline) so they do not become jobs.
        with open("input.txt", "r") as text_file:
            self.params = [line.strip() for line in text_file if line.strip()]
        self.url = 'http://www.asdfasdfasdf.com/Property.aspx?mode=details&pin={0}'

        self.gridLayout = QGridLayout(self)

        self.mapper = QSignalMapper(self)
        self.mapper.mapped.connect(self.on_mapper_mapped)

        # Hold a Python reference to every view for the lifetime of this
        # widget; the views are created without a parent.
        self.grabbers = []
        self.pending = {}

        for i in range(self.POOL_SIZE):
            grabber = QWebView()
            grabber.loadFinished.connect(self.mapper.map)
            self.mapper.setMapping(grabber, i)
            self.grabbers.append(grabber)

            # Fake a "finished" notification so the view picks up its first job.
            grabber.loadFinished.emit(True)

    @pyqtSlot(int)
    def on_mapper_mapped(self, gNumber):
        """Print the page a view just finished, then give it the next job.

        Args:
            gNumber: Slot number of the view whose ``loadFinished`` fired.
        """
        opener = self.mapper.mapping(gNumber)

        # Print the finished page *before* starting the next load, with a
        # printer dedicated to that page.  Connecting a new print callback to
        # loadFinished on every call would accumulate callbacks and re-print
        # each later page over the earlier output files.
        finished_name = self.pending.pop(gNumber, None)
        if finished_name is not None:
            # NOTE(review): the mapper drops the bool carried by
            # loadFinished, so a failed load is printed as well — confirm
            # whether that is acceptable.
            printer = QPrinter()
            printer.setPageSize(QPrinter.A4)
            printer.setOutputFormat(QPrinter.PdfFormat)
            printer.setOutputFileName(finished_name)
            opener.print_(printer)

        self.count += 1
        if self.count < len(self.params):
            gParam = self.params[self.count]

            # Output name is the identifier split as XX-XX-XXX-XXX-XXXX.
            PIDString = gParam[:2] + '-' + gParam[2:4] + '-' + gParam[4:7] + '-' + gParam[7:10] + '-' + gParam[10:14]
            # Record the target file before loading so it is in place by the
            # time loadFinished is delivered.
            self.pending[gNumber] = PIDString + '.pdf'
            opener.load(QUrl(self.url.format(gParam)))
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(str(self.count) + ' of ' + str(len(self.params)))



if __name__ == "__main__":
    import sys

    app = QApplication(sys.argv)
    # Keep a reference so the widget (and its web views) outlive this line.
    # main.show() is left disabled, as in the original: the widget is never
    # displayed.
    main = Foo()
    # The original ended with a bare `sys.exit`, which only names the function
    # without calling it; pass the event loop's return code to the caller.
    sys.exit(app.exec_())

理想情况下,我还想添加一个页脚,但只要我尝试这样做,它就会出错。input.txt 中有100个数字(仅用于测试,我需要它能处理85000个)。对于5或10这样很小的数量,它可以正常工作,但100个就不行。QWebView 实例的数量有限制吗?我是否应该对其进行管理才能让它正常工作?

当我针对100个PDF运行此代码时,它只会输出20个PDF。如果我把 range 改成小于100的值,就会出现重复的PDF,但数量肯定超过20个。

另外,我收到此错误:

  

QPainter::begin(): Returned false

1 个答案:

答案 0 :(得分:1)

我认为既然您正在尝试制作“PyQt应用程序”,那么首先应该使用它提供的功能。

请尝试查看 QNetworkAccessManager,用它来管理您的所有请求;与之配套的还有 QNetworkRequest 和 QNetworkReply。

它甚至会照顾你的线程,并根据需要保持并行性。它在下载时不会冻结您的应用。

请仔细看看我的这段代码。它并不完全符合您的需求,但作为示例非常合适,您可以从中筛选出完成整套设置所需的内容。

# Subclass QNetworkAccessManager Here
from PyQt5.QtCore import QByteArray
from PyQt5.QtCore import QFile, pyqtSlot
from PyQt5.QtCore import QIODevice
from PyQt5.QtCore import QUrl
from PyQt5.QtCore import pyqtSignal
from PyQt5.QtNetwork import QNetworkAccessManager
from PyQt5.QtNetwork import QNetworkRequest


class NetworkAccessManager(QNetworkAccessManager):
    """QNetworkAccessManager that downloads one image at a time to disk.

    ``request_image`` starts a download; incoming data is accumulated in
    ``message_buffer`` and, once the reply finishes, written to
    ``resources/browser_images/image_required_browser``.
    """

    # Emitted after the downloaded data has been written to disk.
    signal_add_image = pyqtSignal()
    # Dialog passed to request_image(); closed when the download finishes.
    dialog = None
    # True once the current reply has finished (whether or not it was saved).
    download_finished = False
    # QByteArray collecting the body of the current reply.
    message_buffer = None
    # The QNetworkReply currently being tracked.
    reply = None

    def __init__(self):
        QNetworkAccessManager.__init__(self)
        # Placeholder reply so request_image() and del_reply() always have an
        # object to call deleteLater() on, even before the first download.
        self.reply = self.get(QNetworkRequest(QUrl("")))

    @pyqtSlot()
    def slot_finished(self):
        """Write the buffered data to disk and mark the download finished.

        Emits ``signal_add_image`` only when the file could be opened and
        written.
        """
        image_file = QFile("resources/browser_images/image_required_browser")
        # WriteOnly truncates an existing file. ReadWrite does not, so a
        # download shorter than the previous one would leave stale bytes at
        # the end of the file.
        if image_file.open(QIODevice.WriteOnly):
            image_file.write(self.message_buffer)
            image_file.close()
            self.signal_add_image.emit()
        # On failure nothing is written and no signal is emitted; the
        # download is still marked as finished below.
        self.download_finished = True
        # The dialog is optional: request_image() may have been given None.
        if self.dialog is not None:
            self.dialog.close()

    @pyqtSlot()
    def slot_read_data(self):
        """Append newly available reply data to the buffer (on readyRead)."""
        self.message_buffer += self.reply.readAll()

    def request_image(self, url, progress_bar, dialog):
        """Start downloading ``url``, replacing any previous reply.

        Args:
            url: Address of the image, in any form accepted by ``QUrl(url)``.
            progress_bar: Slot connected to the reply's ``downloadProgress``
                signal.
            dialog: Object whose ``close()`` is called when the download
                finishes; may be None.
        """
        self.reply.deleteLater()
        self.download_finished = False
        self.dialog = dialog
        self.message_buffer = QByteArray()
        url = QUrl(url)
        req = QNetworkRequest(url)
        # Send a desktop-browser User-Agent with the request.
        req.setRawHeader(b'User-Agent',
                                    b'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36')
        self.reply = self.get(req)
        self.reply.readyRead.connect(self.slot_read_data)
        self.reply.finished.connect(self.slot_finished)
        self.reply.downloadProgress.connect(progress_bar)

    def get_reply(self):
        """Return the reply currently being tracked."""
        return self.reply

    def done(self):
        """Return True once the current download has finished."""
        return self.download_finished

    def set_reply(self, reply):
        """Replace the tracked reply with ``reply``."""
        self.reply = reply

    def del_reply(self):
        """Schedule the tracked reply for deletion."""
        self.reply.deleteLater()

# Module-level shared instance, created as soon as this module is imported.
# NOTE(review): the constructor issues a request immediately, so this
# presumably needs a QApplication/QCoreApplication to exist first — confirm.
NETWORK_ACCESS_MANAGER = NetworkAccessManager()

希望这能给你一些启发 :)