我一直没能找到一种有效的方法来(使用 PyQt)获取渲染后的 HTML,并用 Beautiful Soup 解析该 HTML。也许是我没有正确保存 HTML。我也尝试过使用 requests 库,但到目前为止收效甚微。
提前致谢!
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
from bs4 import BeautifulSoup
import requests
import codecs
import time
import unicodecsv
# Python 2 only: re-import sys so that setdefaultencoding is reachable, then
# make UTF-8 the process-wide default for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding("utf8")

# Cells of the CSV row that the script writes out at the end.
row = []

# Input file handle; opened here but not read anywhere in the visible code.
title_strings = open("title_test.txt", "rb")

# Pipe-delimited, double-quote-quoted, UTF-8 CSV writer over the output file.
output = "title_string_out.txt"
output_handle = open(output, "wb")
csv_writer = unicodecsv.writer(
    output_handle,
    delimiter="|",
    quotechar='"',
    encoding="utf-8",
)
class Render(QWebPage):
    """Load a URL in a QtWebKit page and block until loading has finished.

    Constructing an instance starts the Qt event loop and returns only after
    the page's ``loadFinished`` signal has fired. Afterwards:

    * ``frame`` is the page's main frame (``None`` if the event loop ended
      before ``loadFinished`` was delivered), so callers can read the
      rendered markup with ``frame.toHtml()``.
    * ``load_ok`` is the boolean success flag reported by ``loadFinished``.
    """

    def __init__(self, url):
        """Start loading *url* and run the event loop until it completes.

        :param url: address of the page to render, passed to ``QUrl``.
        """
        # Qt allows only one QApplication per process; reuse an existing one
        # so that Render can be instantiated more than once.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        # Defined up front so the attributes exist even if the load callback
        # never runs.
        self.frame = None
        self.load_ok = False
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the frame and stop the event loop.

        :param result: success flag emitted by the ``loadFinished`` signal.
        """
        self.load_ok = bool(result)
        self.frame = self.mainFrame()
        self.app.quit()
# Render the page, pull the link text out of every div.name inside
# div#soupstuff, and write the collected texts as one CSV row.
titleURL = "URL_goes_here.html"
r = Render(titleURL)

# Convert the rendered markup to a unicode object rather than a byte str, so
# non-ASCII page content is not pushed through an implicit byte encoding
# (toHtml() presumably returns a QString under PyQt4's default Python 2 API).
result = unicode(r.frame.toHtml())

# Name the parser explicitly so the parse tree does not depend on which
# parsers happen to be installed; lxml is already imported by this file.
soup = BeautifulSoup(result, "lxml")

# Explicit None checks replace the former blanket `except AttributeError`,
# which hid missing elements and stopped at the first entry without a link.
container = soup.find("div", id="soupstuff")
newtitle = container.find_all("div", class_="name") if container is not None else []
for n in newtitle:
    link = n.find("a")
    if link is not None:
        # Append this entry's link text; the earlier code appended
        # str(newtitle), i.e. the repr of the whole result set.
        row.append(link.text)

csv_writer.writerow(row)
# NOTE(review): the reason for this pause is not visible here; presumably it
# throttles requests when the script is run repeatedly.
time.sleep(5)