PyQt4 与 BeautifulSoup:设置浏览器窗口大小

时间:2017-03-24 20:47:35

标签: beautifulsoup pyqt4

我尝试使用 PyQt4 进行网页抓取,但我要抓取的网站一直认为我是移动设备,因此没有返回面向台式机或笔记本电脑的那套数据(即使我使用的是 Mozilla/5.0 用户代理)。

为了找出原因,我把网址设置为 "whatsmyuseragent.com"。我注意到它显示:虽然我的屏幕分辨率是 1920px x 1080px,但我的浏览器窗口大小却是 0px x 0px。这会是问题所在吗?

以下是我的代码。如果有人能建议我需要做哪些修改,才能让网站相信我使用的是台式电脑或笔记本电脑(而不是手机),我将不胜感激。谢谢。

import sys
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest

import bs4 as bs
import urllib.request

class Client(QWebPage):
    """Web page that loads a URL and blocks until loading has finished.

    Constructing an instance starts a Qt event loop; the constructor only
    returns after ``loadFinished`` has fired, at which point the rendered
    document is available through ``mainFrame()``.
    """

    def __init__(self, url):
        """Load ``url`` with a ``Mozilla/5.0`` User-Agent header and wait.

        Args:
            url: Address of the page to load, as a string.
        """
        # The application object is created before the page itself.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)

        # Leave the event loop as soon as the page reports it is done.
        self.loadFinished.connect(self.on_page_load)

        outgoing = QNetworkRequest(QUrl(url))
        outgoing.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.request = outgoing

        frame = self.mainFrame()
        frame.load(outgoing)

        # Blocks here until on_page_load() calls quit().
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop started in ``__init__``."""
        self.app.quit()

# Page that echoes back what it detects about the visiting client.
url = 'http://www.whatsmyuseragent.com'

# Constructing the client returns only once the page has finished loading.
client_response = Client(url)
main_frame = client_response.mainFrame()
source = main_frame.toHtml()
soup = bs.BeautifulSoup(markup=source, features='lxml')

# Dump the whole rendered document.
print(soup.prettify())

1 个答案:

答案 0 :(得分:1)

尝试设置视口大小(Viewport Size):

// NOTE(review): this JavaScript snippet does not relate to the PyQt4 answer
// around it, and it is not valid JavaScript as written (the
// "<-- with out .name" text sits inside the console.log call). It looks like
// a mis-extracted block; the viewport change itself is the setViewportSize
// call in the Python listing that follows.
for(customer in customerArr){
   console.log(customerArr[customer].name <-- with out .name);    
}

这为我产生了以下内容:

import sys
import re
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QSize
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest

import bs4 as bs

class Client(QWebPage):
    """Web page that loads a URL with a real viewport and waits for it.

    Constructing an instance starts a Qt event loop; the constructor only
    returns after ``loadFinished`` has fired, at which point the rendered
    document is available through ``mainFrame()``.
    """

    def __init__(self, url, viewport_width=640, viewport_height=480):
        """Load ``url`` and block until the page has finished loading.

        Args:
            url: Address of the page to load, as a string.
            viewport_width: Width in pixels given to ``setViewportSize``.
            viewport_height: Height in pixels given to ``setViewportSize``.
        """
        # The application object is created before the page itself.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)

        # Give the page an explicit viewport; the defaults keep the
        # original "good ol' size" of 640x480.
        self.setViewportSize(QSize(viewport_width, viewport_height))

        # Leave the event loop as soon as the page reports it is done.
        self.loadFinished.connect(self.on_page_load)
        self.request = QNetworkRequest()
        self.request.setUrl(QUrl(url))
        self.request.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.mainFrame().load(self.request)

        # Blocks here until on_page_load() calls quit().
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop started in ``__init__``."""
        self.app.quit()

# Page that echoes back what it detects about the visiting client.
url = 'http://www.whatsmyuseragent.com'

# Constructing the client returns only once the page has finished loading.
client_response = Client(url)
source = client_response.mainFrame().toHtml()
soup = bs.BeautifulSoup(source, 'lxml')

# some meat from the soup
browser_window = soup.find(class_='browser-window')
if browser_window is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on ``.text``.
    sys.exit("no element with class 'browser-window' found in the page")

# Raw string so that ``\s`` reaches the regex engine as written; collapse
# every run of whitespace into a single space.
print(re.sub(r'\s+', ' ', browser_window.text))