我正在尝试从给定的网站下载数千个PDF。但是,出于某种原因,它甚至无法下载100个PDF文件。我不知道为什么。这是代码:
#!/usr/bin/env python
import time
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from xhtml2pdf import pisa
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import*
from PyQt4.QtWebKit import *
class Foo(QWidget):
def __init__(self, parent=None):
super(Foo, self).__init__(parent)
self.count = -1
text_file = open("input.txt", "r")
self.params = text_file.read().split('\n')
self.url = 'http://www.asdfasdfasdf.com/Property.aspx?mode=details&pin={0}'
self.gridLayout = QGridLayout(self)
#self.tabWidget = QTabWidget(self)
#self.gridLayout.addWidget(self.tabWidget, 0, 0, 1, 1)
self.mapper = QSignalMapper(self)
self.mapper.mapped.connect(self.on_mapper_mapped)
for i in range(100):
grabber = QWebView()
grabber.loadFinished.connect(self.mapper.map)
self.mapper.setMapping(grabber, i)
#self.tabWidget.addTab(grabber, "opener {0}".format(str(i)))
grabber.loadFinished.emit(True)
@pyqtSlot(int)
def on_mapper_mapped(self, gNumber):
self.count += 1
if self.count < len(self.params):
#gParam = self.params[self.count]
gParam = self.params[self.count]
opener = self.mapper.mapping(gNumber)
opener.load(QUrl(self.url.format(gParam)))
printer = QPrinter()
#setting format
printer.setPageSize(QPrinter.A4)
printer.setOutputFormat(QPrinter.PdfFormat)
#export file as c:\tem_pdf.pdf
PIDString = gParam[:2] + '-' + gParam[2:4] + '-' + gParam[4:7] + '-' + gParam[7:10] + '-' + gParam[10:14]
printer.setOutputFileName(PIDString + '.pdf')
def convertIt():
opener.print_(printer)
QObject.connect(opener, SIGNAL("loadFinished(bool)"), convertIt)
print str(self.count) + ' of ' + str(len(self.params))
if __name__ == "__main__":
import sys
app = QApplication(sys.argv)
main = Foo()
#main.show()
app.exec_()
sys.exit
理想情况下,我想添加一个页脚,但如果我尝试这样做,它就会出错。 Input.txt有100个数字(仅用于测试,我需要它在85000上工作)。它适用于像5或10这样的真实小数字,但不会做100.QwebView实例有限制吗?我应该管理它以使其有效吗?
当我为100 PDF运行此代码时,它将打印出20个PDFS。如果我将范围更改为小于100的范围,它将具有重复的PDF,但肯定超过20。
另外,我收到此错误:
qpainter :: begin:return false
答案 0 :(得分:1)
我认为既然您正在尝试制作“PyQt应用程序”,那么首先应该使用它提供的功能。
请尝试查看QNetworkAccessMAnager以控制您的所有请求及其他所有请求,QNetworkRequest,QNetworkReply。
它甚至会照顾你的线程,并根据需要保持并行性。它在下载时不会冻结您的应用。
仔细研究我的这段代码。它并没有完全符合您的要求,但对于您需要完成所有设置的内容,它是一个非常好的过滤示例。
# Subclass QNetworkAccessManager Here
from PyQt5.QtCore import QByteArray
from PyQt5.QtCore import QFile, pyqtSlot
from PyQt5.QtCore import QIODevice
from PyQt5.QtCore import QUrl
from PyQt5.QtCore import pyqtSignal
from PyQt5.QtNetwork import QNetworkAccessManager
from PyQt5.QtNetwork import QNetworkRequest
class NetworkAccessManager(QNetworkAccessManager):
signal_add_image = pyqtSignal()
dialog = None
download_finished = False
message_buffer = None
reply = None
def __init__(self):
QNetworkAccessManager.__init__(self)
self.reply = self.get(QNetworkRequest(QUrl("")))
# Save image data in QByteArray buffer to the disk (google_image_logo.png
# in the same directory)
@pyqtSlot()
def slot_finished(self):
image_file = QFile("resources/browser_images/image_required_browser")
if image_file.open(QIODevice.ReadWrite):
image_file.write(self.message_buffer)
image_file.close()
self.signal_add_image.emit()
# QMessageBox.information(None, "Hello!", "File has been saved!")
else:
pass
# QMessageBox.critical(None, "Hello!", "Error saving file!")
self.download_finished = True
self.dialog.close()
# Append current data to the buffer every time readyRead() signal is
# emitted
@pyqtSlot()
def slot_read_data(self):
self.message_buffer += self.reply.readAll()
def request_image(self, url, progress_bar, dialog):
self.reply.deleteLater()
self.download_finished = False
self.dialog = dialog
self.message_buffer = QByteArray()
url = QUrl(url)
req = QNetworkRequest(url)
req.setRawHeader(b'User-Agent',
b'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36')
self.reply = self.get(req)
self.reply.readyRead.connect(self.slot_read_data)
self.reply.finished.connect(self.slot_finished)
self.reply.downloadProgress.connect(progress_bar)
def get_reply(self):
return self.reply
def done(self):
return self.download_finished
def set_reply(self, reply):
self.reply = reply
def del_reply(self):
self.reply.deleteLater()
NETWORK_ACCESS_MANAGER = NetworkAccessManager()
希望它带给你一些亮点“)