PyQt使用gzip编码加载网页

时间:2018-06-10 08:21:32

标签: javascript python web-scraping pyqt gzip

我想抓一些网站。为了做到这一点,我选择了PyQt4,因为QtWebKit模块是基于WebKit构建的,而且我还有可能做到无头,并且我需要安装的模块少得多。一切都按预期工作,但如果我想要请求网站和所有资源进行压缩("接受编码"," gzip")我遇到了麻烦。

加载html页面不是问题,但是在尝试加载.js或.css文件时会出现问题(因为.css并不重要,因为我将在无头模式下进行大部分报废)。但是对于.js文件很重要,因为内容未加载。主要问题是我想以压缩格式接收.js文件(如果网站支持),但我不知道如何将它们添加为主网页的资源/脚本。如果我不处理压缩,PyQt不会自动执行,因此.js脚本以.gzip格式加载而不被解释。

下面是一个示例,并使用随机网站以gzip格式发送hmtl和资源。使用PyQt的检查器实用程序(右键单击 - > Inspect),我们可以在选项卡" Resources - >中看到。框架 - >脚本/样式表"这些是压缩的而不是解释的。

在解压缩后,是否可以将.js / .css文件添加为网页资源?我该怎么办?我尝试了很多事情,但我没有想法。

如果问题出在PyQt4上,我可以接受它并将我的代码迁移到其他地方。

import sys
import gzip
import re

from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import QNetworkRequest, QNetworkAccessManager, QNetworkReply


class Window(QWidget):
    def __init__(self, url):
        super(Window, self).__init__()
        self.view = QWebView(self)
        self.page = self.view.page()
        self.main_frame = self.page.mainFrame()
        self.view.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
        self.view.settings().setAttribute(QWebSettings.PluginsEnabled, True)
        self.view.settings().setAttribute(QWebSettings.LocalContentCanAccessRemoteUrls, True)
        self.view.settings().setAttribute(QWebSettings.LocalContentCanAccessFileUrls, True)
        self.url = url
        self.manager = MyNetworkAccessManager(url, self.view)
        self.page.setNetworkAccessManager(self.manager)

        # using this method to load the page will bypass my implementation to uncompress and just display
        # the compress content - which is just garbage
        # self.view.load(QUrl(url))

        self.main_frame.loadFinished.connect(self.loadFinished)

        self.setupInspector()

        self.splitter = QSplitter(self)
        self.splitter.setOrientation(Qt.Vertical)

        layout = QVBoxLayout(self)
        layout.setMargin(0)
        layout.addWidget(self.splitter)

        self.splitter.addWidget(self.view)
        self.splitter.addWidget(self.webInspector)

    def loadFinished(self, *args, **kwargs):
        print('Size: {}'.format(self.page.bytesReceived()))

    def setupInspector(self):
        page = self.view.page()
        page.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
        self.webInspector = QWebInspector(self)
        self.webInspector.setPage(page)

        shortcut = QShortcut(self)
        shortcut.setKey(Qt.Key_F12)
        shortcut.activated.connect(self.toggleInspector)
        self.webInspector.setVisible(False)

    def toggleInspector(self):
        self.webInspector.setVisible(not self.webInspector.isVisible())


match_all = re.compile('^((?!safaribooksonline).)*$|.+(image|data|img|twitter|vpn|facebook).+')

class MyNetworkAccessManager(QNetworkAccessManager):
    def __init__(self, url, view):
        QNetworkAccessManager.__init__(self)
        self.url = QUrl(url)
        self.request = QNetworkRequest(self.url)
        self.view = view
        # making the request from here will use my implementation to decompress the data
        self.get(self.request)
        self.finished.connect(self.readFinishGzip)

    def createRequest(self, operation, request, data=None):
        # ignore links of no interest
        if not match_all.match(request.url().toString()):
            print("mymanager handles ", request.url())
            request.setRawHeader('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            request.setRawHeader('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0')
            request.setRawHeader("Accept-Language", "en-GB,en;q=0.5")
            request.setRawHeader("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7")
            request.setRawHeader("Content-Type", "application/x-www-form-urlencoded")
            request.setRawHeader("Accept-Encoding", "gzip")
            request.setRawHeader("Connection", "keep-alive")
            # here the reply should be received, but not a lot of things can be done
            # first of all here you can't know when the reply is ready, unless you use a specific wait and retry read
            # but then it is too late, the page javaScript will still not be loaded
            # it is also mandatory to return a QNetworkReply
            return QNetworkAccessManager.createRequest(self, operation, request, data)
        return QNetworkAccessManager.createRequest(self, QNetworkAccessManager.GetOperation, QNetworkRequest(QUrl()), None)

    def readFinishGzip(self, reply):
        url = reply.url()
        content = reply.rawHeader(b'Content-Type').data().decode('UTF-8').split(';')[0]
        encode = reply.rawHeader(b'Content-Encoding')
        data = reply.readAll()
        # print('{} - {} - {}'.format(url, content, encode))
        if encode.data().decode('UTF-8') == 'gzip':
            print('Gzip page: checking content.')
            if content == 'text/javascript':
                if data == b'':
                    # if the page is set using "setHtml" method with the "baseUrl" keyword then the content
                    # of js scripts is empty because most probably is set automatically by PyQt and after
                    # a "readAll" the reply is empty. Because of this I am making a second request in order
                    # to get the data. But even so I don't know how to add it as resource/scripts to the mainFrame
                    pass
                    # self.get(QNetworkRequest(reply.url())).readAll().data()
                else:
                    js = gzip.decompress(data)
                    # Evaluating the code as javaScript doesn't do anything and it doesn't set it as decompressed
                    # resource for the page. using "setContent" just overrides the current page content and displays
                    # the javaScript as plain text
                    # What can be done to add the javaScript as source???
                    # print(self.view.page().mainFrame().evaluateJavaScript(js.decode('UTF-8')))
                    # self.view.page().mainFrame().setContent(js, 'text/javascript')
            elif content == 'text/css':
                if data == b'':
                    pass
                    # the same scenario as in case of javaScript content
                    # self.get(QNetworkRequest(reply.url()))
                else:
                    style = gzip.decompress(data)
                    # from my research it is not possible to set a style unless you are formatting it as expected (as shown below)
                    # and load it from a file
                    # What can be done to add style as source???
                    # styleB64 = E(style, 'base64').decode('UTF-8')
                    # stylePyQt = "data:text/css;charset=utf-8;base64," + styleB64
                    # self.view.settings().setUserStyleSheetUrl(stylePyQt)
                    # self.view.page().main_frame().setContent()
            elif content == 'text/html':
                page = gzip.decompress(data)
                self.view.page().mainFrame().setHtml(page.decode('UTF-8'), self.url)
                # setting also the baseUrl will automatically load all the javaScripts and other links but these are received
                # in gzip format as well and are not decompressed because I can't alter the behaviour. As stated above, the reply
                # is empty when I attempt to read it so it is already loaded
                # Also using the inspector I can see that the source of the scripts is in compressed format
                # self.view.page().mainFrame().setHtml(page.decode('UTF-8'), self.url)
        else:
            if data:
                print('Page not in gzip format. Load it as usual')
                self.view.page().mainFrame().setHtml(str(data, encoding='UTF-8'), baseUrl=self.url)


def main():
    app = QApplication(sys.argv)
    url_ = 'https://www.safaribooksonline.com/'
    window = Window(url_)
    window.show()
    app.exec_()

if __name__ == "__main__":
    main()

0 个答案:

没有答案