我使用PyQT4(第一次)刮掉一些页面。因为我尝试刮掉多个页面,所以我使用了QEventloop。但是我无法将loadFinished信号添加到代码中。以下是我的代码如下所示:
# Imports
import requests
from bs4 import BeautifulSoup
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import QNetworkRequest
from PyQt4.QtGui import *
from lxml import html
import csv
import win_unicode_console
import time
# Main setting
DIR = "data"
URL = "https://addons.mozilla.org"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
def Render(url):
page = QWebPage()
loop = QEventLoop() # Create event loop
page.mainFrame().loadFinished.connect(loop.quit) # Connect loadFinished to loop quit
page.mainFrame().load(QUrl(url))
loop.exec_() # Run event loop, it will end on loadFinished
return page.mainFrame().toHtml()
app = QApplication(sys.argv)
def pagination(page):
page_url = "https://addons.mozilla.org/en-US/firefox/extensions/?sort=users&page=" + str(page)
response = requests.get(page_url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")
items = soup.findAll("div", class_="item addon")
for item in items:
time.sleep(2)
item = URL + item.h3.select('a')[0].get('href')
print(item)
addon_scraper(item)
def addon_scraper(url):
time.sleep(7)
result = Render(url)
print(result)
soup = BeautifulSoup(result, "lxml")
addon_name = soup.select("#addon > hgroup > h1 > span")[0].get_text()
print(addon_name)
addon_author = soup.select("#addon > hgroup > h4 > a")[0].get_text()
category = soup.select("#related > ul")[0].get_text().strip()
with open("category_list.csv", "a", newline="", encoding="utf-16") as f:
writer = csv.writer(f, dialect="excel-tab")
writer.writerow([addon_name, addon_author, category])
# Run the scraper
if __name__ == "__main__":
win_unicode_console.enable() # Enable unicode support in command line interface
for i in range(1, 100):
print(i)
pagination(i)
app.exit()
最后它只是重新启动脚本而什么都不做。我试图在这里实现用户Mip提供的解决方案:Web Scraping Multiple Links with PyQt / QtWebkit 我认为将用户代理添加到上面的应用程序和隐式睡眠(类似于selenium情况)将解决我的问题。但我无法做到这一点。 现在我收到以下错误。我认为这是因为PyQt4在加载源内容之前退出循环:
Traceback (most recent call last): File "main.py", line 56, in <module> pagination(i) File "mozilla_file.py", line 36, in pagination addon_scraper(item) File "mozilla_file.py", line 46, in addon_scraper category = soup.select("#related > ul")[0].get_text().strip() IndexError: list index out of range