这是我为Python 3.3运行的代码:
from html.parser import HTMLParser
class TableParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.in_table = False
self.in_table_header = False
self.in_table_header_field = False
self.table_fields = []
def handle_starttag(self, tag, attributes):
if tag == 'table':
for name, value in attributes:
if name == 'id' and value == 'data_table':
self.in_table = True
if self.in_table == True:
if tag == 'thead':
self.in_table_header = True
if self.in_table_header == True and tag == 'th':
self.in_table_header_field = True
def handle_endtag(self, tag):
if tag == 'table':
self.in_table = False
if tag == 'thead':
self.in_table_header = False
if tag == 'th':
self.in_table_header_field = False
def handle_data(self, data):
if self.in_table_header_field == True:
self.table_fields.append(data)
parser = TableParser()
parser.feed('<table id="data_table"><thead><tr><th>Company</th><th>Rapport</th><th>Price</th><th>Development 1år</th></thead></table>')
print(parser.table_fields)
这是输出:
['Company', 'Rapport', 'Price', 'Development 1', 'r']
我期待着:
['Company', 'Rapport', 'Price', 'Development 1år']
或更好:
['Company', 'Rapport', 'Price', 'Development 1år']
我做错了什么?
答案 0 :(得分:2)
您还需要为HTMLParser.handle_charref()
method添加处理程序:
def handle_charref(self, name):
self.handle_data(self.unescape('&#{};'.format(name)))
答案 1 :(得分:0)
使用lxml:
>>> import lxml.html
>>> root = lxml.html.fromstring('<table id="data_table"><thead><tr><th>Company</th><th>Rapport</th><th>Price</th><th>Development 1år</th></thead></table>')
>>> root.xpath('.//thead//th/text()')
['Company', 'Rapport', 'Price', 'Development 1år']