使用 PyQt5 抓取 IMDb 网页

时间:2018-04-17 13:02:01

标签: python pyqt5

我刚开始使用 Python 进行网页抓取(Web Scraping),想从这个链接(link)中抓取图像。这是浏览器“检查(Inspect)”面板的截图(screenshot)。由于该页面涉及 JavaScript,下面是我尝试的代码。

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    """Load a URL in a QWebEnginePage and capture the resulting HTML.

    Constructing the object is blocking: it starts the Qt event loop and
    only returns after the page's HTML has been delivered to ``Callable``
    (which quits the loop). The captured markup is then available as
    ``self.html``.
    """

    def __init__(self, url):
        """Start loading *url* and run the event loop until the HTML arrives.

        Args:
            url: Address of the page to load, as a string.
        """
        # Only one QApplication may exist per process; reuse it when present
        # so that more than one Page can be created in the same run.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page's HTML once the load has finished."""
        # toHtml() is asynchronous: it returns None and hands the markup to
        # the callback, so its return value must not be stored in self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the HTML delivered by toHtml() and stop the event loop.

        Args:
            html_str: The page's HTML as a string.
        """
        self.html = html_str
        self.app.quit()


def main():
    """Load the IMDb media-viewer page and print its photo container div."""
    url = 'https://www.imdb.com/name/nm0005683/mediaviewer/rm2073384192'
    rendered = Page(url)
    document = bs.BeautifulSoup(rendered.html, 'html.parser')
    # Look up the <div id="photo-container"> element in the captured HTML.
    container = document.find('div', attrs={'id': 'photo-container'})
    print(container)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

此代码实际上来自here,我只更改了链接

我得到的错误

js: Uncaught TypeError: Cannot read property 'x' of undefined
Load finished
<div id="photo-container"></div>

我不知道这个错误究竟是什么,而且 photo-container 这个 div 的内容没有出现。我尝试过用谷歌搜索这个错误,但找不到任何对这种情况有帮助的内容。此外,如果有比这更合适的抓取图像的方法,我也乐于接受建议。

PS:我也是StackOverFlow的新手,所以如果这里的任何内容不符合规则,我可以根据需要编辑问题。

1 个答案:

答案 0 :(得分:0)

实际工作中您可能需要使用 Web Channel(Qt 的 QWebChannel),但下面的代码将向您展示如何访问您要查找的图像。关于 Web Channel 的研究就留给您自己了。

import sys
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QTimer

class Page(QWebEnginePage):
    """Web engine page that logs the image URLs found in ``#photo-container``.

    After the load finishes, a script is run in the page that writes each
    matching image's ``src`` to the JavaScript console; console messages
    are echoed with ``print``. The application is asked to quit five
    seconds after the load has finished.
    """

    def __init__(self, parent):
        """Create the page under *parent* and hook the load-finished signal."""
        super().__init__(parent)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)

    def javaScriptConsoleMessage(self, level: QWebEnginePage.JavaScriptConsoleMessageLevel, message: str, lineNumber: int, sourceID: str):
        """Echo every JavaScript console message to standard output."""
        print(message)

    def _on_load_finished(self):
        """Schedule the JavaScript probe and the application shutdown."""
        print('Load finished')
        # A finished load does not mean the page is rendered, so the probe
        # is delayed; a longer wait may be needed here.
        QTimer.singleShot(1000, self._after_loading)
        QTimer.singleShot(5000, self._exit)

    def _after_loading(self):
        """Run the script that logs the ``src`` of each image in the container."""
        print('_after_loading')
        script = '''console.log('javascript...');
        var images = document.querySelectorAll('#photo-container img');
        console.log('images ' + images);
        console.log('images ' + images.length);
        for (var i = 0; i < images.length; i++)
        {
            var image = images[i];
            console.log(image.src);
        }        
        var element = document.querySelector('body');
        //console.log(element.innerHTML);  // If you uncomment this you'll see the the photo-container is still empty
        '''
        self.runJavaScript(script)
        print('_after_loading...done')

    def _exit(self):
        """Ask the running QApplication to quit."""
        print('_exit')
        app = QApplication.instance()
        app.quit()

def main():
    """Show the IMDb media-viewer page in a QWebEngineView backed by Page."""
    app = QApplication(sys.argv)
    view = QWebEngineView()
    # The view is passed to Page as its parent.
    page = Page(view)
    view.setPage(page)
    target = QUrl('https://www.imdb.com/name/nm0005683/mediaviewer/rm2073384192')
    view.load(target)
    view.show()
    app.exec_()

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()