在类的外部循环调用PyQt5渲染类,逐个处理给定的URL

时间:2018-04-07 12:37:41

标签: python-3.x beautifulsoup pyqt5

我刚接触Python,想编写一个脚本从网站上抓取一些信息。如你所知,有些网站使用JavaScript来生成页面内容,所以我先尝试了PhantomJS,但没有成功,于是改用PyQt5。现在我遇到了一个问题,希望你们能帮我解决。

我的Python脚本只执行一次时可以正常工作,但如果把它放进循环里,Python就会崩溃。

我试过time.sleep(),但它似乎无法发挥作用。

我的问题被标记为与“Scrape multiple urls using QWebPage”重复,但那个问题的答案并不符合我的需求:该答案是在类的内部遍历预先给定的URL列表,而在我的场景中,循环应该位于类的外部。

这是我的脚本:

"""Render HTML for scraping"""
# -*- coding: utf-8 -*-

import os
import sys
import requests
from contextlib import contextmanager
from multiprocessing import Pool
import bs4
import time

# Compatibility shim: if the name TimeoutError is not defined (the original
# author notes this is the Python 2 case), import the class of the same
# name from multiprocessing instead.
try:
    TimeoutError
except NameError:
    from multiprocessing import TimeoutError  # Python 2


def _render(url):
    """Return the HTML of *url* after the page has been rendered by Qt.

    QtWebEngine is used when it can be imported; otherwise the function
    falls back to the older QtWebKit classes.  An already existing
    ``QApplication`` is reused, so the function can be called repeatedly,
    for example from a loop over several URLs.

    :param url: address of the page to load, as a string.
    :return: the page markup reported by Qt once loading has finished.
    """
    try:
        from PyQt5.QtCore import QEventLoop, QUrl
        from PyQt5.QtWebEngineWidgets import QWebEngineView
        from PyQt5.QtWidgets import QApplication

        class Render(QWebEngineView):
            """Load a URL in a web-engine view and capture its HTML."""

            def __init__(self, url):
                self.html = None
                # Reuse the application object if one was created earlier
                # (e.g. by a previous call); otherwise create it now.  It
                # has to exist before the widget itself is initialised.
                self.app = QApplication.instance()
                if self.app is None:
                    self.app = QApplication(sys.argv)
                else:
                    # The original referenced the undefined name ``app``
                    # here, raising NameError on every call but the first.
                    print('QApplication instance already exists: %s'
                          % str(self.app))
                QWebEngineView.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                self.load(QUrl(url))
                # toHtml() is asynchronous: keep processing events until
                # the callback has stored the markup in self.html.
                while self.html is None:
                    self.app.processEvents(
                        QEventLoop.ExcludeUserInputEvents |
                        QEventLoop.ExcludeSocketNotifiers |
                        QEventLoop.WaitForMoreEvents)
                self.app.quit()

            def _callable(self, data):
                """Store the HTML string handed over by ``toHtml``."""
                self.html = data

            def _loadFinished(self, result):
                """Ask the page for its HTML once loading has finished."""
                self.page().toHtml(self._callable)
    except ImportError:
        from PyQt5.QtCore import QUrl
        from PyQt5.QtWebKitWidgets import QWebPage
        from PyQt5.QtWidgets import QApplication

        class Render(QWebPage):
            """QtWebKit fallback: load a URL and capture its HTML."""

            def __init__(self, url):
                self.html = None
                # Same reuse rule as the web-engine variant above.
                self.app = QApplication.instance()
                if self.app is None:
                    self.app = QApplication(sys.argv)
                QWebPage.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                # Load the address itself; the original passed the URL
                # string to setHtml(), which treats it as page markup.
                self.mainFrame().load(QUrl(url))
                self.app.exec_()

            def _loadFinished(self, result):
                """Capture the frame's HTML and leave the event loop."""
                self.html = self.mainFrame().toHtml()
                self.app.quit()

    with devnull():
        return Render(url).html


@contextmanager
def devnull():
    """Temporarily redirect stdout and stderr to the null device.

    The redirection is applied to the underlying file descriptors with
    ``os.dup2``.  The original descriptors are restored, and every
    descriptor opened here is closed again, when the ``with`` block exits,
    whether or not it raised.
    """
    # Bind everything the cleanup touches up front so the ``finally``
    # clause is safe even if one of the setup calls fails part-way through.
    original_stderr = original_stdout = null = None
    try:
        # Write out text Python has buffered so far, so that it still
        # reaches the real streams rather than the null device.
        sys.stderr.flush()
        sys.stdout.flush()
        original_stderr = os.dup(sys.stderr.fileno())
        original_stdout = os.dup(sys.stdout.fileno())
        null = open(os.devnull, 'w')
        os.dup2(null.fileno(), sys.stderr.fileno())
        os.dup2(null.fileno(), sys.stdout.fileno())
        yield
    finally:
        if original_stderr is not None:
            # Discard text buffered inside the block before switching back.
            sys.stderr.flush()
            os.dup2(original_stderr, sys.stderr.fileno())
            # The saved duplicate is no longer needed; not closing it
            # would leak one descriptor per call.
            os.close(original_stderr)
        if original_stdout is not None:
            sys.stdout.flush()
            os.dup2(original_stdout, sys.stdout.fileno())
            os.close(original_stdout)
        if null is not None:
            null.close()

# Demo driver: render every URL in turn and print the page title.
urllist = [
    "https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/",
    "https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/",
    "https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/",
    "https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/",
]
for url in urllist:
    # One-second pause before each page load.
    time.sleep(1)
    rendered_html = _render(url)
    # Parse the rendered markup with BeautifulSoup.
    soup = bs4.BeautifulSoup(rendered_html, 'html.parser')
    # select_one() returns None when the page has no <title>; report that
    # instead of failing with AttributeError on ``None.text``.
    title_tag = soup.select_one('title')
    print('title is %r' % (title_tag.text if title_tag is not None else None))
  

虽然这个问题被标记为重复(感谢各位的处理),但我还是想分享我最终可以正常运行的脚本,也许将来对别人有用。另外,我现在想知道如何为这个 QWebEnginePage 设置一个虚拟的窗口大小,希望能得到一些帮助。

import bs4
import os
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
from contextlib import contextmanager
from multiprocessing import Pool

def _render(url):
    """Return the HTML of *url* after rendering it with QtWebEngine.

    The function processes events through ``QtWidgets.qApp`` and never
    creates an application object itself, so a ``QApplication`` must
    already exist when it is called.

    :param url: address of the page to load, as a string.
    :return: the markup delivered to the ``toHtml`` callback after loading.
    """
    class WebPage(QtWebEngineWidgets.QWebEnginePage):
        """Page that stores its own HTML once loading has finished."""

        def __init__(self):
            super(WebPage, self).__init__()
            self.loadFinished.connect(self.handleLoadFinished)
            self.html = None

        def start(self, url):
            """Load *url* and block until its HTML has been captured."""
            self.load(QtCore.QUrl(url))
            # The flag combination does not change between iterations.
            wait_flags = (
                QtCore.QEventLoop.ExcludeUserInputEvents |
                QtCore.QEventLoop.ExcludeSocketNotifiers |
                QtCore.QEventLoop.WaitForMoreEvents)
            # toHtml() is asynchronous: keep processing events until the
            # callback has stored the markup in self.html.
            while self.html is None:
                QtWidgets.qApp.processEvents(wait_flags)
            QtWidgets.qApp.quit()

        def processCurrentPage(self, data):
            """Store the HTML string handed over by ``toHtml``."""
            self.html = data

        def handleLoadFinished(self):
            """Ask the page for its HTML once loading has finished."""
            self.toHtml(self.processCurrentPage)

    with devnull():
        webpage = WebPage()
        webpage.start(url)
        return webpage.html

@contextmanager
def devnull():
    """Temporarily redirect stdout and stderr to the null device.

    The redirection is applied to the underlying file descriptors with
    ``os.dup2``.  The original descriptors are restored, and every
    descriptor opened here is closed again, when the ``with`` block exits,
    whether or not it raised.
    """
    # Bind everything the cleanup touches up front so the ``finally``
    # clause is safe even if one of the setup calls fails part-way through.
    original_stderr = original_stdout = null = None
    try:
        # Write out text Python has buffered so far, so that it still
        # reaches the real streams rather than the null device.
        sys.stderr.flush()
        sys.stdout.flush()
        original_stderr = os.dup(sys.stderr.fileno())
        original_stdout = os.dup(sys.stdout.fileno())
        null = open(os.devnull, 'w')
        os.dup2(null.fileno(), sys.stderr.fileno())
        os.dup2(null.fileno(), sys.stdout.fileno())
        yield
    finally:
        if original_stderr is not None:
            # Discard text buffered inside the block before switching back.
            sys.stderr.flush()
            os.dup2(original_stderr, sys.stderr.fileno())
            # The saved duplicate is no longer needed; not closing it
            # would leak one descriptor per call.
            os.close(original_stderr)
        if original_stdout is not None:
            sys.stdout.flush()
            os.dup2(original_stdout, sys.stdout.fileno())
            os.close(original_stdout)
        if null is not None:
            null.close()


# Demo driver: render every URL in turn and print the page title.
urllist = [
    "https://www.tutorialspoint.com/python/index.htm",
    "https://www.tutorialspoint.com/python/index.htm",
    "https://www.tutorialspoint.com/python/index.htm",
    "https://www.tutorialspoint.com/python/index.htm",
]
# _render() relies on an existing application object, so it is created
# once here, before the loop, rather than once per URL.
app = QtWidgets.QApplication(sys.argv)
for url in urllist:
    rendered_html = _render(url)
    soup = bs4.BeautifulSoup(rendered_html, 'html.parser')
    # select_one() returns None when the page has no <title>; report that
    # instead of failing with AttributeError on ``None.text``.
    title_tag = soup.select_one('title')
    print('title is %r' % (title_tag.text if title_tag is not None else None))

0 个答案:

没有答案