如何使用pyqt5获取网页?

时间:2019-04-07 12:20:05

标签: javascript python pyqt5

我想使用pyqt5获取该网页。

URL为https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html

该网页将使用javascript生成两个值。

只需在文本框中输入5,然后按红色按钮即可。

将返回两个红色的值。

请参考图片。

下面的代码用于获取网页。

但是,我等了很长时间,没有任何反应。

我应该更改我的代码吗?

非常感谢您。

mockito tutorial

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup
import pandas as pd

class Render(QWebEngineView):
    """Web view that loads a URL, runs JavaScript in the page, and stores the HTML.

    Constructing an instance creates the QApplication, starts loading *url*
    and enters the Qt event loop, so ``Render(url)`` does not return until
    ``callable`` calls ``self.app.quit()``.
    """

    def __init__(self, url):
        """Create the application and view, start loading *url*, run the event loop."""
        # Filled in by callable() once toHtml delivers the page source.
        self.html = None
        # True until the first loadFinished signal has been handled.
        self.first_pass = True
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        # Blocks here; the only quit() call in this class is in callable().
        self.app.exec_()

    def _load_finished(self, result):
        """Dispatch on loadFinished: first emission runs the JS, later ones grab the HTML.

        ``result`` is received from the signal but not inspected.
        """
        if self.first_pass:
            self._first_finished()
            self.first_pass = False
        else:
            # Only reached if loadFinished is emitted a second time.
            self._second_finished()

    def _first_finished(self):
        """Set the ``txtDistance`` field to "5" and call the page's ``CheckUserWhere()``."""
        self.page().runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.page().runJavaScript("void(0)")
        self.page().runJavaScript("CheckUserWhere();")

    def _second_finished(self):
        """Request the page HTML; it is handed to ``callable`` when ready."""
        self.page().toHtml(self.callable)

    def callable(self, data):
        """Store the HTML passed by ``toHtml`` and quit the event loop."""
        self.html = data
        self.app.quit()

# Page to scrape.
url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
# Render.__init__ loads the page and runs the Qt event loop before returning.
web = Render(url)
# Parse whatever HTML the Render instance stored.
soup = BeautifulSoup(web.html, 'html.parser')
# Select the <div id="divResult"> element.
element = soup.find('div', {'id':"divResult"})
# Hand that element's markup to pandas to parse its tables.
df = pd.read_html(str(element))

(此处原为网页截图,图片未能保留。)

1 个答案:

答案 0 :(得分:1)

您似乎有一些误解:

  • 通过 runJavaScript 执行 JS 并不会重新加载页面,loadFinished 信号不会再次发出,因此 _second_finished 函数永远不会被调用。
  • 如果您不想显示该窗口,那么最好使用QWebEnginePage。

考虑到以上内容,获得的html是:

<div class="p8-5" id="divResult" style="display:block;">
<div align="center" display="block" id="rsloading" style="display: block;">
<img src="//img2.soufunimg.com/qyb/loading.gif"/>
                        正在为您加载数据...
                    </div>
<table border="0" cellpadding="0" cellspacing="0" class="tablebox01" display="none" id="tbResult" style="display: none;" width="600">
<tbody><tr>
<td style="width:260px;"><span class="gray8">建设用地面积:</span>14748平方米</td>
<td style="width:340px;"><span class="gray8">所在城市:</span>山西省 长治市 </td>
</tr>
<tr>
<td><span class="gray8">规划建筑面积:</span>51617平方米</td>
<td><span class="gray8">土地评估楼面价:</span><b class="redc00 font14" id="_bpgj">867.61</b> 元/平方米</td>
</tr>
<tr>
<td><span class="gray8">容积率:</span>大于1并且小于或等于3.5</td>
<td><span class="gray8">土地评估总价:</span><b class="redc00 font14" id="_bSumPrice">4478.34</b> 万元</td>
</tr>
<tr>
<td><span class="gray8">规划用途:</span>住宅用地</td>
<td><span class="gray8">推出楼面价:</span>27.51元/平方米</td>
</tr>
</tbody></table>
</div>

所以最简单的方法是按 id “_bpgj” 和 “_bSumPrice” 查找对应的元素:

import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
from bs4 import BeautifulSoup

class Render(QtWebEngineWidgets.QWebEnginePage):
    """Load a page without showing a window, run its JavaScript, keep the HTML.

    Constructing an instance blocks: ``__init__`` starts the Qt event loop and
    returns only after the HTML has been captured or the page failed to load.

    Attributes:
        html: Page source delivered by ``toHtml``; remains ``""`` when the
            load fails.
        first_pass: Set to ``True`` on construction; not read by this class.
        app: The ``QApplication`` whose event loop drives the page.
    """

    def __init__(self, url):
        """Start loading *url* and run the event loop until a result is ready.

        Args:
            url: Address of the page to load, as a string.
        """
        self.html = ""
        self.first_pass = True
        # Only one QApplication may exist per process, so reuse a live one
        # instead of failing when Render is created a second time.
        self.app = (
            QtWidgets.QApplication.instance()
            or QtWidgets.QApplication(sys.argv)
        )
        super(Render, self).__init__()
        self.loadFinished.connect(self._load_finished)
        # Echo the load percentage to stdout as the page loads.
        self.loadProgress.connect(print)
        self.load(QtCore.QUrl(url))
        # Blocks until quit() is called from callable() or _load_finished().
        self.app.exec_()

    def _load_finished(self, result):
        """Run the page scripts on a successful load, otherwise stop waiting.

        Args:
            result: ``True`` if the page loaded successfully.
        """
        if result:
            self.call_js()
        else:
            # Without this the event loop would never end on a failed load
            # and the constructor would hang; ``html`` is left as "".
            self.app.quit()

    def call_js(self):
        """Fill in the distance field, trigger the page's check, request the HTML."""
        self.runJavaScript('document.getElementById("txtDistance").value = "5";')
        # Evaluates to undefined; kept from the original sequence.
        self.runJavaScript("void(0)")
        self.runJavaScript("CheckUserWhere();")
        # NOTE(review): the HTML is requested right after the scripts are
        # queued. If CheckUserWhere() updates the page asynchronously, this
        # snapshot may predate that update -- confirm against the page.
        self.toHtml(self.callable)

    def callable(self, data):
        """Store the HTML handed over by ``toHtml`` and end the event loop.

        Args:
            data: The page source as a string.
        """
        self.html = data
        self.app.quit()

# Render's constructor runs the Qt event loop and returns once it has quit.
url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')

# Both figures are the text of <b> elements, looked up by their id.
_bpgj, _bSumPrice = [
    soup.find('b', {'id': wanted_id}).string
    for wanted_id in ("_bpgj", "_bSumPrice")
]
print(_bpgj, _bSumPrice)

输出:

867.61 4478.34