PyQt5 Web抓取不能在for循环中工作

时间:2017-11-19 21:01:12

标签: python web-scraping pyqt pyqt5 qtwebengine

我是Python的新手,试图从this上的第二个网页抓取一个javascript渲染网页中的一些数据。当我尝试将此代码应用于for循环时,它仅返回列表中包含50个项目的2个结果,并给出"Process finished with exit code -1073740940 (0xC0000374)"错误。有人可以解释原因吗?

我的样本在这里:

class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()


def main():
    global linklist
    for iurl in linklist:
        page = Page(iurl)
        soup = bs.BeautifulSoup(page.html, 'html.parser')
        data = soup.find('div', class_='tablo_dual_board')
        data = data.text
        data = data.splitlines()
        print(data)

我也尝试了这个,它仅为第一个列表项提供结果。除了这些之外还有其他方法可以为列表项应用函数吗?

for iurl in linklist:
    iurl=main()

我的整个代码在这里:

import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
import requests
from bs4 import BeautifulSoup
import bs4 as bs


class WebPage(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext

    @property
    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        url = self.url().toString()
        # do stuff with html...
        soup = bs.BeautifulSoup(html, 'html.parser')
        veri = soup.find('div', class_='tablo_dual_board')
        veri = veri.text
        veri = veri.splitlines()
        print(veri)
        if not self.fetchNext:
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args):
        # disable javascript error output
        pass

if __name__ == '__main__':

    # generate some test urls

    onexurl = "https://1xbahis1.com/en/live/Football/"
    r = requests.get(onexurl)
    soup = BeautifulSoup(r.content, "html.parser")
    income = soup.find_all("ul", {"id":"games_content"})
    links = soup.find_all("a", {"class": "c-events__name"})

    urls = []
    for matchlink in links:
        urls.append("https://1xbahis1.com/en/"+(matchlink.get("href")))

    # only try 3 urls for testing
    urls = urls[:3]

    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    sys.exit(app.exec_())

0 个答案:

没有答案