Python WebScraping 未完成即关闭且未给出任何错误

时间:2018-11-10 00:40:47

标签: python web-scraping pyqt pyqt5

我正在做一个简单的 WebScraping,下载站点上某些英雄(champion)的物品图像。我写了一个 for 循环(列表里有若干个英雄),但它只处理了其中的 2 个就关闭了,而且没有给出任何错误!

import bs4 as bs
import sys,os
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    """Load *url* in an off-screen QWebEnginePage and capture its HTML.

    The constructor blocks (via ``app.exec_()``) until the page has
    finished loading and its source has been delivered to ``Callable``;
    afterwards the markup is available in ``self.html``.

    NOTE(review): every ``Page()`` creates its *own* ``QApplication`` and
    runs a fresh event loop.  Qt expects a single QApplication per
    process, so constructing this repeatedly (once per champion) is the
    likely cause of the silent early exit described in the question —
    compare with the accepted answer below, which reuses one page.
    """

    def __init__(self, url):
        # A QApplication must exist before any Qt web/widget object.
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        print("#1 __init__")
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls self.app.quit().
        self.app.exec_()

    def _on_load_finished(self):
        # toHtml() is asynchronous: it returns immediately (None) and later
        # invokes Callable with the page source, so the assignment on this
        # line does NOT store the HTML — Callable below does.
        self.html = self.toHtml(self.Callable)
        print('#2 On Load finished')

    def Callable(self, html_str):
        print("#3 Callable\n")
        self.html = html_str
        self.app.quit()  # unblocks app.exec_() in __init__

def already_exist(image_name):
    """Return True when *image_name* should be downloaded.

    Despite its name, this returns True when the file is absent from the
    local ``Images`` tree and False when it is already present (the
    callers use it as a "needs download" predicate — the name is kept for
    backward compatibility).

    Bug fixed: the original returned from inside the first ``os.walk``
    iteration, so only the top-level directory was ever checked, and a
    missing ``Images`` directory made it fall through and return ``None``
    (falsy), which callers misread as "already downloaded".
    """
    for _, _, files in os.walk('Images'):
        if image_name in files:
            return False  # found somewhere under Images/ -> skip download
    # Not found anywhere (or Images/ does not exist yet) -> download it.
    return True

def ImageDownload(url):
    """Download the image at *url* into the local ``Images`` directory.

    The download is skipped when a file with the same basename is already
    present (as reported by ``already_exist``).  Failures are reported and
    swallowed so a batch of downloads keeps going.

    Fixes over the original: prints the basename rather than the whole
    ``split('/')`` list, creates ``Images/`` if missing, and catches only
    the download-related exceptions (URLError is an OSError subclass)
    while reporting what actually went wrong.
    """
    image_name = url.split("/")[-1]  # basename of the URL path
    try:
        if already_exist(image_name):
            os.makedirs("Images", exist_ok=True)  # urlretrieve needs the dir
            full_path = "Images/" + image_name
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % image_name)
        else:
            print("Image already Downloaded >: %s" % image_name)
    except (OSError, ValueError) as exc:
        print("Error Download: %s" % exc)

def main():
    """Scrape each champion's ProBuilds page and download its item images.

    Fix over the original: the bare ``except`` printed only "Shi..." and
    discarded the exception — which is precisely why the asker saw the
    loop stop early "without any error".  The handler now reports what
    failed while still letting the remaining champions proceed.
    """
    champions = ['Amumu', 'Akali', 'Zed', 'Nunu']  # champions
    for champ in champions:
        try:
            print("\nDownloading Images >: %s" % champ)
            # NOTE(review): each Page() spins up its own QApplication and
            # event loop; reusing a single page (see the answer's code
            # below) is the recommended structure.
            data = Page('https://www.probuilds.net/champions/details/%s' % champ.strip())
            soup = bs.BeautifulSoup(data.html, 'html.parser')
            for container in soup.find_all('div', {'class': 'items'}):
                for image in container.find_all('img'):
                    ImageDownload(image['src'])
        except Exception as exc:  # keep the batch going, but say why it failed
            print("Shi... %s" % exc)

main()

我没有收到任何错误,但程序只处理了 2 个英雄就退出了,这就是问题所在,请各位帮帮我!

1 个答案:

答案 0 :(得分:0)

看起来 QWebEnginePage 没有被正确地关闭,而且推荐的做法是重用同一个 QWebEnginePage,而不是每次都创建一个新的。因此,我以 old answer 为基础,实现了以下解决方案:

import os
import sys
import bs4 as bs
import urllib.request
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Off-screen page that works through a queue of URLs one at a time.

    Call ``start()`` with an iterable of URLs.  Each finished load is
    handed to ``process()`` (override it in a subclass), and the Qt
    application is quit once the queue is exhausted.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Store the queue and kick off the first load."""
        self._pending = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next queued URL; return False when none remain."""
        nxt = next(self._pending, None)
        if nxt is None:
            return False
        self.load(QtCore.QUrl(nxt))
        return True

    def processCurrentPage(self, html):
        # Hand the finished page to the subclass hook, then advance the
        # queue; quit the application when everything has been visited.
        self.process(self.url(), html)
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        # toHtml is asynchronous; processCurrentPage receives the source.
        self.toHtml(self.processCurrentPage)

    def process(self, url, html):
        """Default hook: just report what was loaded."""
        print('loaded: [%d chars] %s' % (len(html), url.toString()))

class ScrapePage(WebPage):
    """WebPage that collects every item-image URL found on each page."""

    def __init__(self):
        super(ScrapePage, self).__init__()
        self.results = set()  # de-duplicated image src URLs

    def process(self, url, html):
        # Harvest <img src> from every <div class="items"> container.
        soup = bs.BeautifulSoup(html, 'html.parser')
        for container in soup.find_all('div', {'class': 'items'}):
            self.results.update(
                img['src'] for img in container.find_all('img')
            )

def already_exist(image_name):
    """Return True when *image_name* should be downloaded.

    Despite its name, this returns True when the file is absent from the
    local ``Images`` tree and False when it is already present (the
    callers use it as a "needs download" predicate — the name is kept for
    backward compatibility).

    Bug fixed: the original returned from inside the first ``os.walk``
    iteration, so only the top-level directory was ever checked, and a
    missing ``Images`` directory made it fall through and return ``None``
    (falsy), which callers misread as "already downloaded".
    """
    for _, _, files in os.walk('Images'):
        if image_name in files:
            return False  # found somewhere under Images/ -> skip download
    # Not found anywhere (or Images/ does not exist yet) -> download it.
    return True

def ImageDownload(url):
    """Download the image at *url* into the local ``Images`` directory.

    The download is skipped when a file with the same basename is already
    present (as reported by ``already_exist``).  Failures are reported and
    swallowed so a batch of downloads keeps going.

    Fixes over the original: prints the basename rather than the whole
    ``split('/')`` list, creates ``Images/`` if missing, and catches only
    the download-related exceptions (URLError is an OSError subclass)
    while reporting what actually went wrong.
    """
    image_name = url.split("/")[-1]  # basename of the URL path
    try:
        if already_exist(image_name):
            os.makedirs("Images", exist_ok=True)  # urlretrieve needs the dir
            full_path = "Images/" + image_name
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % image_name)
        else:
            print("Image already Downloaded >: %s" % image_name)
    except (OSError, ValueError) as exc:
        print("Error Download: %s" % exc)

if __name__ == '__main__':

    # One QApplication for the whole process (contrast with the question's
    # code, which created a new one per page load).
    app = QtWidgets.QApplication(sys.argv)
    webpage = ScrapePage()

    champions = ['Amumu','Akali','Zed','Nunu']
    base_url = 'https://www.probuilds.net/champions/details/'

    # Build one absolute detail-page URL per champion.
    urls = []
    for champ in champions:
        url = QtCore.QUrl(base_url).resolved(QtCore.QUrl(champ))
        urls.append(url)
    webpage.start(urls)
    app.exec_()  # blocks until WebPage quits after the last page
    # The event loop has finished: every harvested <img src> is in results.
    for url in webpage.results:
        ImageDownload(url)