我已将HTML加载到pyqt中,并希望创建页面上所有内容的列表。
然后,我需要能够使用.geometry()
我想要一个对象列表,其中可以使用以下内容:
for i in list_of_content_in_html:
print i.toPlainText(), i.geometry() #prints the text, and the position.
如果我不清楚,“内容”我的意思是在下面的HTML中,内容是 'c','r1 c1','r1,c2','row2 c2','more contents' - 网络用户在浏览器中看到的文本,基本上。
c
<table border="1">
<tr>
<td>r1 c1</td>
<td>r1 c2</td>
</tr>
<tr>
<td></td>
<td>row2 c2</td>
</tr>
</table>
more contents
答案 0 :(得分:2)
这似乎不可能使用QtWebKit和类似这样的页面嵌套对象但不使用<p>...</p>
用于表格之外的其他文本。结果c
和more contents
不会进入单独的QWebElements。它们只能在BODY级别块中找到。作为解决方案,可以通过解析器运行该页面。只需遍历currentFrame documentElement的子节点即可显示以下元素:
# position in element tree, bounding box, tag, text:
(0, 0) [0, 0, 75, 165] HTML - u'c\nr1 c1\tr1 c2\nrow2 c2\nmore contents'
(1, 1) [8, 8, 67, 157] BODY - u'c\nr1 c1\tr1 c2\nrow2 c2\nmore contents'
(2, 0) [8, 27, 75, 119] TABLE - u'r1 c1\tr1 c2\nrow2 c2'
(3, 0) [9, 28, 74, 118] TBODY - u'r1 c1\tr1 c2\nrow2 c2'
(4, 0) [9, 30, 74, 72] TR - u'r1 c1\tr1 c2'
(5, 0) [11, 30, 32, 72] TD - u'r1 c1'
(5, 1) [34, 30, 72, 72] TD - u'r1 c2'
(4, 1) [9, 74, 74, 116] TR - u'row2 c2'
(5, 1) [34, 74, 72, 116] TD - u'row2 c2'
此代码:
import sys
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
class WebPage(QObject):
finished = Signal()
def __init__(self, data, parent=None):
super(WebPage, self).__init__(parent)
self.output = []
self.data = data
self.page = QWebPage()
self.page.loadFinished.connect(self.process)
def start(self):
self.page.mainFrame().setHtml(self.data)
@Slot(bool)
def process(self, something=False):
self.page.setViewportSize(self.page.mainFrame().contentsSize())
frame = self.page.currentFrame()
elem = frame.documentElement()
self.gather_info(elem)
self.finished.emit()
def gather_info(self, elem, i=0):
if i > 200: return
cnt = 0
while cnt < 100:
s = elem.toPlainText()
rect = elem.geometry()
name = elem.tagName()
dim = [rect.x(), rect.y(),
rect.x() + rect.width(), rect.y() + rect.height()]
if s: self.output.append(dict(pos=(i, cnt), dim=dim, tag=name, text=s))
child = elem.firstChild()
if not child.isNull():
self.gather_info(child, i+1)
elem = elem.nextSibling()
if elem.isNull():
break
cnt += 1
webpage = None
def print_strings():
for s in webpage.output:
print s['pos'], s['dim'], s['tag'], '-', repr(s['text'])
if __name__ == '__main__':
app = QApplication(sys.argv)
data = open(sys.argv[1]).read()
webpage = WebPage(data)
webpage.finished.connect(print_strings)
webpage.start()
另一种方法
期望的行动取决于您想要达到的目标。您可以使用QWebPage
从webpage.currentFrame().documentElement().toPlainText()
获取所有字符串,但这只是将整个页面显示为字符串,没有与所有标记相关的定位信息。浏览QWebElement
树会为您提供所需的信息,但它有一些缺点,我在上面提到过。
如果您真的想知道所有文本的位置,那么唯一准确的方法(除了渲染页面和使用OCR之外)将文本分解为字符并保存其各自的边界框 。我是这样做的:
首先,我使用BeautifulSoup4解析页面,并将X
中的每个非空格文本字符<span class="Nd92KSx3u2">X</span>
括起来。然后我运行了一个PyQt脚本(实际上是一个PySide脚本),在我使用findAllElements('span[class="Nd92KSx3u2"]')
查找它们之后加载已更改的页面并用它们的边界框打印出字符。
parser.py:
import sys, cgi, re
from bs4 import BeautifulSoup, element
magical_class = "Nd92KSx3u2"
restricted_tags="title script object embed".split()
re_my_span = re.compile(r'<span class="%s">(.+?)</span>' % magical_class)
def no_nl(s): return str(s).replace("\r", "").replace("\n", " ")
if len(sys.argv) != 3:
print "Usage: %s <input_html_file> <output_html_file>" % sys.argv[0]
sys.exit(1)
def process(elem):
for x in elem.children:
if isinstance(x, element.Comment): continue
if isinstance(x, element.Tag):
if x.name in restricted_tags:
continue
if isinstance(x, element.NavigableString):
if not len(no_nl(x.string).strip()):
continue # it's just empty space
print '[', no_nl(x.string).strip(), ']', # debug output of found strings
s = ""
for c in x.string:
if c in (' ', '\r', '\n', '\t'): s += c
else: s += '<span class="%s">%s</span>' % (magical_class, c)
x.replace_with(s)
continue
process(x)
soup = BeautifulSoup(open(sys.argv[1]))
process(soup)
output = re_my_span.sub(r'<span class="%s">\1</span>' % magical_class, str(soup))
with open(sys.argv[2], 'w') as f:
f.write(output)
charpos.py:
import sys
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
magical_class = "Nd92KSx3u2"
class WebPage(QObject):
def __init__(self, data, parent=None):
super(WebPage, self).__init__(parent)
self.output = []
self.data = data
self.page = QWebPage()
self.page.loadFinished.connect(self.process)
def start(self):
self.page.mainFrame().setHtml(self.data)
@Slot(bool)
def process(self, something=False):
self.page.setViewportSize(self.page.mainFrame().contentsSize())
frame = self.page.currentFrame()
elements = frame.findAllElements('span[class="%s"]' % magical_class)
for e in elements:
s = e.toPlainText()
rect = e.geometry()
dim = [rect.x(), rect.y(),
rect.x() + rect.width(), rect.y() + rect.height()]
if s and rect.width() > 0 and rect.height() > 0: print dim, s
if __name__ == '__main__':
app = QApplication(sys.argv)
data = open(sys.argv[1]).read()
webpage = WebPage(data)
webpage.start()
input.html(略有改动以显示简单字符串转储的更多问题:
a<span>b<span>c</span></span>
<table border="1">
<tr><td>r1 <font>c1</font> </td><td>r1 c2</td></tr>
<tr><td></td><td>row2 & c2</td></tr>
</table>
more <b>contents</b>
和测试运行:
$ python parser.py input.html temp.html
[ a ] [ b ] [ c ] [ r1 ] [ c1 ] [ r1 c2 ] [ row2 & c2 ] [ more ] [ contents ]
$ charpos.py temp.html
[8, 8, 17, 26] a
[17, 8, 26, 26] b
[26, 8, 34, 26] c
[13, 48, 18, 66] r
[18, 48, 27, 66] 1
[13, 67, 21, 85] c
[21, 67, 30, 85] 1
[36, 48, 41, 66] r
[41, 48, 50, 66] 1
[36, 67, 44, 85] c
[44, 67, 53, 85] 2
[36, 92, 41, 110] r
[41, 92, 50, 110] o
[50, 92, 61, 110] w
[61, 92, 70, 110] 2
[36, 111, 47, 129] &
[51, 111, 59, 129] c
[59, 111, 68, 129] 2
[8, 135, 21, 153] m
[21, 135, 30, 153] o
[30, 135, 35, 153] r
[35, 135, 44, 153] e
[8, 154, 17, 173] c
[17, 154, 27, 173] o
[27, 154, 37, 173] n
[37, 154, 42, 173] t
[42, 154, 51, 173] e
[51, 154, 61, 173] n
[61, 154, 66, 173] t
[66, 154, 75, 173] s
查看边界框,如果您愿意,它(在这个简单的情况下不会改变字体大小和下标等内容)很容易将它们粘合成单词。
答案 1 :(得分:1)
我解决了。
for elem in QWebView().page().currentFrame().documentElement().findAll('*'):
print unicode(elem.toPlainText()), unicode(elem.geometry().getCoords()), '\n'
匹配任何内容,然后遍历找到的内容 - 从而迭代DOM树。