用 Beautiful Soup 解析 PyQt 输出(经 JS 渲染)

时间:2015-03-22 20:02:47

标签: python parsing pyqt beautifulsoup python-requests

我一直找不到一种有效的方法来捕获(使用 PyQt)渲染后的 HTML,并用 Beautiful Soup 解析该 HTML。也许是我没有正确存储 HTML。我也尝试过使用 requests 库,但到目前为止收效甚微。

提前致谢!

import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *  
from lxml import html 
from bs4 import BeautifulSoup
import requests
import codecs
import time
import unicodecsv

# Python 2 only: reload ``sys`` and set the interpreter-wide default string
# encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf8")

# Input file handle, opened in binary mode.
title_strings = open("title_test.txt", "rb")

# Pipe-delimited, double-quote-quoted, UTF-8 encoded CSV output.
output = "title_string_out.txt"
csv_writer = unicodecsv.writer(
    open(output, "wb"),
    delimiter="|",
    quotechar='"',
    encoding="utf-8",
)

# Cells collected for the row that is written once scraping is finished.
row = list()

class Render(QWebPage):
    """Load a URL in a headless QtWebKit page and block until it has loaded.

    Constructing an instance starts the Qt event loop and returns only after
    the page emits ``loadFinished``.

    Attributes:
        app: The QApplication driving the event loop.
        frame: The page's main frame once loading has finished, otherwise
            ``None``. Call ``frame.toHtml()`` to obtain the rendered markup.
        load_ok: ``True`` when ``loadFinished`` reported success.
    """

    def __init__(self, url):
        """Start loading *url* and run the event loop until it finishes.

        Args:
            url: Address of the page to render, as a string.
        """
        # Qt permits only one application object per process, so reuse an
        # existing one instead of constructing a second instance.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        # Define the result attributes up front so callers can always read
        # them, even if the load never completes.
        self.frame = None
        self.load_ok = False
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Record the load outcome, keep the main frame, and stop the loop.

        Args:
            result: Success flag delivered by the ``loadFinished`` signal.
        """
        self.load_ok = bool(result)
        self.frame = self.mainFrame()
        self.app.quit()

# Render the page (including JavaScript) and parse the resulting markup.
titleURL = "URL_goes_here.html"
r = Render(titleURL)
# Under PyQt4 on Python 2, toHtml() presumably returns a QString; unicode()
# converts it without forcing the text through a byte-string encoding.
result = unicode(r.frame.toHtml())
# Name the parser explicitly so the parse does not depend on whichever
# parser Beautiful Soup happens to pick; lxml is already imported above.
soup = BeautifulSoup(result, "lxml")

# Collect the link text of every <div class="name"> inside the container.
# Missing elements are skipped explicitly instead of being hidden behind a
# blanket ``except AttributeError``, so one entry without a link no longer
# discards all the entries after it.
container = soup.find("div", id="soupstuff")
newtitle = container.find_all("div", class_="name") if container is not None else []
for n in newtitle:
    link = n.find("a")
    if link is None:
        continue
    # Append the extracted text of this entry, not the repr of the whole
    # result list.
    row.append(link.text)

csv_writer.writerow(row)

time.sleep(5)

0 个答案:

没有答案