Question

我想使用pyqt5获取该网页。

URL 为： https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html

该网页将使用javascript生成两个值。

只需在文本框中输入5，然后按红色按钮即可。

将返回两个红色的值。

请参考图片。

下面的代码用于获取网页。

但是，我等了很长时间，没有任何反应。

我应该更改我的代码吗？

非常感谢您。

mockito tutorial

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup
import pandas as pd

class Render(QWebEngineView):
    """Web view that loads a URL, drives its form via JavaScript and stores the HTML.

    Constructing an instance starts the Qt event loop and does not return
    until ``callable`` stops it. The captured markup is kept in ``html``
    (``None`` until a snapshot has been delivered).
    """

    def __init__(self, url):
        # The application object is created before the base-class
        # initialiser runs, because the view is a widget.
        self.app = QApplication(sys.argv)
        self.html = None
        self.first_pass = True
        QWebEngineView.__init__(self)

        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        # Blocks here until ``self.app.quit()`` is called.
        self.app.exec_()

    def _load_finished(self, result):
        """Slot for ``loadFinished``: route to the first- or later-pass handler."""
        # NOTE(review): the later-pass branch only runs if ``loadFinished`` is
        # emitted a second time; the scripts issued on the first pass do not
        # obviously start a new page load -- confirm this branch is reachable.
        if not self.first_pass:
            self._second_finished()
            return
        self._first_finished()
        self.first_pass = False

    def _first_finished(self):
        """Fill in the distance field and invoke the page's own submit function."""
        page = self.page()
        scripts = (
            'document.getElementById("txtDistance").value = "5";',
            "void(0)",
            "CheckUserWhere();",
        )
        for script in scripts:
            page.runJavaScript(script)

    def _second_finished(self):
        """Request the page HTML; it is delivered asynchronously to ``callable``."""
        self.page().toHtml(self.callable)

    def callable(self, data):
        """Store the HTML snapshot and stop the event loop started in ``__init__``."""
        self.html = data
        self.app.quit()

# Render the page (the constructor runs the Qt event loop), then parse
# whatever markup was captured.
url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
web = Render(url)
soup = BeautifulSoup(web.html, features='html.parser')
# Locate the result container by id and hand its markup to pandas,
# which parses the <table> elements it contains.
element = soup.find('div', id="divResult")
df = pd.read_html(str(element))

Answer 1

您似乎有一些误解：

1. 执行 JavaScript 并不会重新加载页面，loadFinished 信号也就不会再次触发，因此 _second_finished 函数永远不会被调用。
2. 如果您不需要显示窗口，那么最好使用 QWebEnginePage，而不是 QWebEngineView。

考虑到以上内容，获得的html是：

<div class="p8-5" id="divResult" style="display:block;">
<div align="center" display="block" id="rsloading" style="display: block;">
<img src="//img2.soufunimg.com/qyb/loading.gif"/>
                        正在为您加载数据...
                    </div>
<table border="0" cellpadding="0" cellspacing="0" class="tablebox01" display="none" id="tbResult" style="display: none;" width="600">
<tbody><tr>
<td style="width:260px;"><span class="gray8">建设用地面积：</span>14748平方米</td>
<td style="width:340px;"><span class="gray8">所在城市：</span>山西省 长治市 </td>
</tr>
<tr>
<td><span class="gray8">规划建筑面积：</span>51617平方米</td>
<td><span class="gray8">土地评估楼面价：</span><b class="redc00 font14" id="_bpgj">867.61</b> 元/平方米</td>
</tr>
<tr>
<td><span class="gray8">容积率：</span>大于1并且小于或等于3.5</td>
<td><span class="gray8">土地评估总价：</span><b class="redc00 font14" id="_bSumPrice">4478.34</b> 万元</td>
</tr>
<tr>
<td><span class="gray8">规划用途：</span>住宅用地</td>
<td><span class="gray8">推出楼面价：</span>27.51元/平方米</td>
</tr>
</tbody></table>
</div>

所以最简单的方法是按 id “_bpgj” 和 “_bSumPrice” 查找对应的元素：

import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
from bs4 import BeautifulSoup

class Render(QtWebEngineWidgets.QWebEnginePage):
    """Window-less page that loads a URL, submits its form via JavaScript and captures the HTML.

    Constructing an instance runs the Qt event loop and returns only after
    the HTML snapshot has been delivered or the page load has failed.

    Attributes:
        html: The captured page markup, or ``""`` if the load failed.
        first_pass: Unused by this class; kept so existing callers that read
            it keep working.
        app: The ``QApplication`` whose event loop is run.
    """

    def __init__(self, url):
        self.html = ""
        self.first_pass = True
        # Qt permits only one application object per process, so reuse an
        # existing one instead of unconditionally constructing a second.
        self.app = QtWidgets.QApplication.instance() or QtWidgets.QApplication(sys.argv)
        super(Render, self).__init__()
        self.loadFinished.connect(self._load_finished)
        # Echo the load progress values to stdout as a liveness indicator.
        self.loadProgress.connect(print)
        self.load(QtCore.QUrl(url))
        # Blocks until ``self.app.quit()`` is called from one of the slots.
        self.app.exec_()

    def _load_finished(self, result):
        """Slot for ``loadFinished``.

        Args:
            result: ``True`` if the page loaded successfully.
        """
        if result:
            self.call_js()
        else:
            # Without this the event loop would never be stopped and the
            # constructor would block forever on a failed load.
            self.app.quit()

    def call_js(self):
        """Fill in the distance field, trigger the page's lookup and request the HTML."""
        self.runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.runJavaScript("void(0)")
        self.runJavaScript("CheckUserWhere();")
        # NOTE(review): the snapshot is requested right after the scripts are
        # queued; if CheckUserWhere() fills the result asynchronously the
        # HTML may be captured before it has finished -- confirm on the live page.
        self.toHtml(self.callable)

    def callable(self, data):
        """Store the HTML snapshot and stop the event loop started in ``__init__``."""
        self.html = data
        self.app.quit()

# Render the page (the constructor runs the Qt event loop), then parse
# whatever markup was captured.
url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
web = Render(url)
soup = BeautifulSoup(web.html, features='html.parser')
# Both values live in <b> elements identified by id; read them in order.
_bpgj, _bSumPrice = (
    soup.find('b', {'id': element_id}).string
    for element_id in ("_bpgj", "_bSumPrice")
)
print(_bpgj, _bSumPrice)

输出：

867.61 4478.34

如何使用pyqt5获取网页？

1 个答案: