我想抓取一些网站。为此我选择了 PyQt4,因为 QtWebKit 模块基于 WebKit 构建,可以以无头(headless)方式运行,而且需要安装的模块少得多。一切都按预期工作,但当我要求网站及其所有资源以压缩格式返回(请求头 "Accept-Encoding": "gzip")时,就遇到了麻烦。
加载 HTML 页面没有问题,但在加载 .js 或 .css 文件时会出问题(.css 并不重要,因为大部分抓取工作都会在无头模式下进行)。.js 文件则很重要,因为缺少它们页面内容就无法加载。主要问题是:我希望以压缩格式接收 .js 文件(如果网站支持),但不知道如何在解压后把它们添加为主页面的资源/脚本。如果我不自己处理压缩,PyQt 也不会自动解压,于是 .js 脚本以 gzip 压缩形式被加载,而不会被解释执行。
下面是一个示例,它使用了一个以 gzip 格式发送 HTML 和资源的随机网站。使用 PyQt 的检查器工具(右键单击 -> Inspect),可以在 "Resources -> Frames -> Scripts/Stylesheets" 选项卡中看到,这些资源仍是压缩数据,没有被解释。
解压缩之后,是否可以将 .js/.css 文件添加为网页资源?应该怎么做?我尝试了很多方法,但已经没有思路了。
如果问题出在PyQt4上,我可以接受它并将我的代码迁移到其他地方。
import sys
import gzip
import re
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import QNetworkRequest, QNetworkAccessManager, QNetworkReply
class Window(QWidget):
    """Widget that stacks a QWebView above a hideable QWebInspector.

    The view's page is wired to a ``MyNetworkAccessManager`` built for
    ``url``; that manager, not the view, issues the initial page request.
    """

    def __init__(self, url):
        """Build the web view, network manager, inspector and layout.

        Args:
            url: Address of the page to display, as a string.
        """
        super(Window, self).__init__()
        self.url = url

        self.view = QWebView(self)
        self.page = self.view.page()
        self.main_frame = self.page.mainFrame()

        # Switch on every web setting this demo relies on.
        web_settings = self.view.settings()
        for flag in (
            QWebSettings.JavascriptEnabled,
            QWebSettings.PluginsEnabled,
            QWebSettings.LocalContentCanAccessRemoteUrls,
            QWebSettings.LocalContentCanAccessFileUrls,
        ):
            web_settings.setAttribute(flag, True)

        # The page is intentionally NOT loaded with self.view.load(QUrl(url)):
        # per the original author's notes, doing so bypasses the manager's
        # manual decompression and displays the compressed bytes as garbage.
        self.manager = MyNetworkAccessManager(url, self.view)
        self.page.setNetworkAccessManager(self.manager)

        self.main_frame.loadFinished.connect(self.loadFinished)
        self.setupInspector()

        # Vertical splitter: web view on top, inspector underneath.
        self.splitter = QSplitter(self)
        self.splitter.setOrientation(Qt.Vertical)
        box = QVBoxLayout(self)
        box.setMargin(0)
        box.addWidget(self.splitter)
        for pane in (self.view, self.webInspector):
            self.splitter.addWidget(pane)

    def loadFinished(self, *args, **kwargs):
        """Print how many bytes the page has received; signal args are ignored."""
        received = self.page.bytesReceived()
        print('Size: {}'.format(received))

    def setupInspector(self):
        """Create the initially hidden web inspector and bind F12 to toggle it."""
        page = self.view.page()
        page.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)

        inspector = QWebInspector(self)
        inspector.setPage(page)
        self.webInspector = inspector

        hotkey = QShortcut(self)
        hotkey.setKey(Qt.Key_F12)
        hotkey.activated.connect(self.toggleInspector)

        inspector.setVisible(False)

    def toggleInspector(self):
        """Flip the inspector between shown and hidden."""
        currently_shown = self.webInspector.isVisible()
        self.webInspector.setVisible(not currently_shown)
# URL filter used by the network manager: a URL that MATCHES is ignored.
# First alternative: the URL does not contain "safaribooksonline" at all.
# Second alternative: the URL contains one of the listed keywords with at
# least one character on either side of it.
match_all = re.compile(
    '^((?!safaribooksonline).)*$'
    '|.+(image|data|img|twitter|vpn|facebook).+'
)
class MyNetworkAccessManager(QNetworkAccessManager):
    """Network manager that fetches one page itself and gunzips it by hand.

    On construction the manager requests ``url``. When that reply finishes,
    its body is decompressed if the server sent it gzip-encoded and is then
    installed in ``view`` via ``setHtml`` with ``url`` as base URL. All other
    requests made through this manager (the page's sub-resources) get the same
    custom headers, or are replaced by an empty request when the URL filter
    ``match_all`` matches them.
    """

    def __init__(self, url, view):
        """Start the request for the main page.

        Args:
            url: Address of the main page, as a string.
            view: The QWebView whose main frame receives the decoded HTML.
        """
        QNetworkAccessManager.__init__(self)
        self.url = QUrl(url)
        self.request = QNetworkRequest(self.url)
        self.view = view
        self._main_reply = None
        # Connect before issuing the request so the handler is guaranteed to
        # be in place however early the reply completes.
        self.finished.connect(self.readFinishGzip)
        # Issuing the request from here (rather than through the view) routes
        # the main page through readFinishGzip. The reply is remembered so the
        # handler can tell it apart from sub-resource replies owned by WebKit.
        self._main_reply = self.get(self.request)

    def createRequest(self, operation, request, data=None):
        """Add the custom headers to wanted requests and neutralise the rest.

        Args:
            operation: The QNetworkAccessManager operation being performed.
            request: The outgoing QNetworkRequest.
            data: Optional outgoing payload device.

        Returns:
            The QNetworkReply produced by the base implementation. A reply
            must always be returned, so filtered URLs get a reply for a GET
            on an empty QUrl instead of the real request.
        """
        if match_all.match(request.url().toString()):
            # Link of no interest: substitute a request that goes nowhere.
            return QNetworkAccessManager.createRequest(
                self,
                QNetworkAccessManager.GetOperation,
                QNetworkRequest(QUrl()),
                None,
            )

        print("mymanager handles ", request.url())
        # NOTE(review): Qt's HTTP backend appears to negotiate and decode gzip
        # transparently only when the caller has NOT set Accept-Encoding; the
        # explicit header below is presumably why WebKit receives scripts and
        # stylesheets still compressed. Confirm against the Qt documentation
        # before removing it together with the manual decompression.
        headers = (
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'),
            ("Accept-Language", "en-GB,en;q=0.5"),
            ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7"),
            ("Content-Type", "application/x-www-form-urlencoded"),
            ("Accept-Encoding", "gzip"),
            ("Connection", "keep-alive"),
        )
        for name, value in headers:
            request.setRawHeader(name, value)
        # The reply is not ready at this point, so it cannot be decoded here;
        # decoding happens later in readFinishGzip.
        return QNetworkAccessManager.createRequest(self, operation, request, data)

    def readFinishGzip(self, reply):
        """Handle a finished reply; install the main page in the view.

        Connected to the manager's ``finished`` signal, so it runs for every
        reply. Only the reply to the manager's own initial request has its
        body read and loaded. Sub-resource replies belong to WebKit: reading
        them here could consume bytes the engine still needs, and loading
        them with ``setHtml`` would replace the page, so they are only logged.

        Args:
            reply: The finished QNetworkReply.
        """
        # Header values are compared case-insensitively and without padding.
        content = reply.rawHeader(b'Content-Type').data().decode('UTF-8')
        content = content.split(';')[0].strip().lower()
        encoding = reply.rawHeader(b'Content-Encoding').data().decode('UTF-8')
        is_gzip = encoding.strip().lower() == 'gzip'

        if is_gzip:
            print('Gzip page: checking content.')

        if reply is not self._main_reply:
            # Script, stylesheet or other sub-resource requested by WebKit.
            # Per the original author's experiments its body has normally been
            # consumed by the engine already, and neither evaluateJavaScript
            # nor setContent registers decompressed data as a page resource,
            # so there is nothing useful to do with it here.
            return

        # This is the manager's own request: take the body and release the
        # reply, which the caller of get() is responsible for deleting.
        self._main_reply = None
        payload = reply.readAll().data()
        reply.deleteLater()

        frame = self.view.page().mainFrame()
        if is_gzip:
            if content == 'text/html' and payload:
                html = gzip.decompress(payload).decode('UTF-8')
                # Passing the base URL makes WebKit fetch the page's scripts
                # and other links through this manager.
                frame.setHtml(html, self.url)
        elif payload:
            print('Page not in gzip format. Load it as usual')
            frame.setHtml(payload.decode('UTF-8'), baseUrl=self.url)
def main():
    """Create the Qt application, show the browser window, run the event loop."""
    app = QApplication(sys.argv)
    start_url = 'https://www.safaribooksonline.com/'
    browser = Window(start_url)
    browser.show()
    app.exec_()


# Launch the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()