如何使用pyqt5抓取数据

时间:2019-02-08 09:25:46

标签: python pyqt5

我想从一个网站上获取弹出窗口中的数据。

如第一幅图所示,我需要单击一个链接。

此后,将出现一个弹出窗口,如第二幅图所示。

此弹出窗口的内容正是我想要的。

我参照一个示例,尝试使用PyQt5来获取这些数据。

但是,该程序会一直运行下去,永远不会结束。

如何解决这个问题?

非常感谢您。

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup


class Render(QWebEngineView):
    """Web view that loads ``url`` and stores the page's HTML in ``self.html``.

    The constructor creates the QApplication, starts loading the page and
    then enters the Qt event loop, so ``Render(url)`` does not return until
    ``self.app.quit()`` is called. In this class that only happens in
    ``callable``.
    """

    def __init__(self, url):
        # Set by callable() once toHtml() has delivered the markup.
        self.html = None
        # True until the first loadFinished signal has been handled.
        self.first_pass = True
        # The application object is created before the view is initialised.
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        # Blocks here until self.app.quit() is called.
        self.app.exec_()

    def _load_finished(self, result):        
        # First loadFinished signal: run the JavaScript step.
        # Any later loadFinished signal: request the HTML.
        if self.first_pass:
            self._first_finished()
            self.first_pass = False
        else:
            self._second_finished()

    def _first_finished(self):
        # NOTE(review): this script only looks up the element; it neither
        # clicks it nor navigates. Presumably no second loadFinished signal
        # follows, so _second_finished() -- and therefore app.quit() -- is
        # never reached, which would explain the program never terminating.
        self.page().runJavaScript("document.getElementById('auto-header-citypop-citylist');")


    def _second_finished(self):
        # toHtml() is asynchronous: the markup is handed to self.callable.
        self.page().toHtml(self.callable)

    def callable(self, data):
        # Receives the HTML from toHtml(), stores it and stops the event
        # loop so that the constructor can return.
        self.html = data
        self.app.quit()

# Render the page; the Render constructor runs the Qt event loop and only
# returns after that loop has been quit.
url = r'https://www.autohome.com.cn'
web = Render(url)

# Save the captured markup; the 'utf-8-sig' codec writes a BOM first.
with open('data2.html', mode='w', encoding='utf-8-sig') as html_file:
    html_file.write(web.html)

enter image description here enter image description here

1 个答案:

答案 0 :(得分:1)

页面上有这个JavaScript:

       if (rf === "" || rf.toLocaleLowerCase().indexOf(".autohome.com.cn") === -1) {
            if (screen == undefined || screen.width < 810) {
                if (browser.versions.mobile == true || browser.versions.iPhone == true || browser.versions.ucweb == true || browser.versions.android == true || browser.versions.Symbian == true) {
                    window.location.href = "//m.autohome.com.cn/?from=pc";
                    return
                }
            }
        }  

通过打印 self.url() 可以看到,上面这段脚本把您重定向到了 https://m.autohome.com.cn/?from=pc。为了解决这个问题,我按如下方式设置了 Referer 请求头:

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QByteArray
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from bs4 import BeautifulSoup


class Render(QWebEngineView):
    """Load ``url`` with a fixed Referer header and capture the page HTML.

    The constructor creates the QApplication, issues the request and then
    runs the Qt event loop, so ``Render(url)`` returns only after
    ``callable`` has stored the markup in ``self.html`` and quit the loop.

    Args:
        url: Address of the page to load.
    """

    def __init__(self, url):
        # Set by callable() once toHtml() has delivered the markup.
        self.html = None
        # The application object is created before the view is initialised.
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        # Send a Referer header with the request instead of loading the
        # bare URL.
        self.request = QWebEngineHttpRequest(QUrl(url))
        self.request.setHeader(
            QByteArray().append('Referer'),
            QByteArray().append('https://www.autohome.com.cn/beijing/'),
        )
        self.load(self.request)
        # Blocks here until callable() calls self.app.quit().
        self.app.exec_()

    def _load_finished(self, result):
        """Click the area switcher, then request the page's HTML."""
        self.page().runJavaScript("document.getElementById('auto-header-switcharea').click();")
        # toHtml() is asynchronous: the markup is handed to self.callable.
        self.page().toHtml(self.callable)

    def callable(self, data):
        """Store the HTML delivered by toHtml() and stop the event loop."""
        self.html = data
        self.app.quit()

# Render the page, parse the captured HTML and print every city link.
url = 'https://www.autohome.com.cn/beijing/'
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')

# The city entries are <a> tags whose name attribute has this value.
city_links = soup.find_all('a', {'name': 'auto-header-citypop-city'})
for city in city_links:
    print(city)

输出:

<a data-info="[110100, 646, '北京', 'beijing']" data-key="110100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">北京</a>
<a data-info="[440100, 62, '广州', 'guangzhou']" data-key="440100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">广州</a>
<a data-info="[440300, 670, '深圳', 'shenzhen']" data-key="440300" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">深圳</a>
<a data-info="[320100, 335, '南京', 'nanjing']" data-key="320100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">南京</a>
<a data-info="[310100, 649, '上海', 'shanghai']" data-key="310100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">上海</a>

....

click 事件之后不会再触发新的页面加载,因此不需要把 _load_finished 拆成两步来处理(即不需要 _first_finished 和 _second_finished 两个方法)。