# Sample HTML fragment from the question; passed to unescape() further down.
string = """<div property="gr:description" content="<P><TABLE border=0 cellSpacing=0
cellPadding=0 width=605><COLGROUP><COL width=605></COL></COLGROUP><TBODY><TR height=121><TD class=xl66 height=121 width=605>• Open Dimensions: 29.25”H x 71.71”W x 29.38”D<BR>• Black blow mold top<BR>• Durable, easy-to-clean top surface<BR>• Table has centerfold and convenient handle for easy transport and storage. <BR>• Compact flat fold<BR>• Strong steel frame.<BR>• Multi purpose – can be used for dining, work, crafts, etc.<BR>• Great for parties, weddings and holidays – indoors & out.</TD></TR></TBODY></TABLE></P>" xml:lang="en"></div>"""
解码这段内容的最佳方法是什么？它在 Stack Overflow 的输出中显示正常。
答案 0 :(得分:0)
from htmlentitydefs import name2codepoint
import re
# Matches one character reference: decimal (&#8226;), hexadecimal (&#x2022;)
# or named (&bull;).  Compiled once so the pattern is not rebuilt from every
# known entity name on each call.
_CHAR_REF_RE = re.compile(
    r'&(#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);')


def unescape(s):
    """Replace HTML character references in *s* with the characters they name.

    Handles named references found in ``name2codepoint`` (``&amp;``,
    ``&bull;``) as well as numeric ones in decimal (``&#8226;``) or
    hexadecimal (``&#x2022;``) form.  References that cannot be resolved --
    an unknown name, or a code point the interpreter cannot turn into a
    character -- are left in the text unchanged.  The input is scanned once,
    so text produced by a replacement is not unescaped again
    (``&amp;lt;`` becomes ``&lt;``, not ``<``).

    :param s: text that may contain HTML character references.
    :returns: *s* with every resolvable reference replaced.
    """
    try:
        to_char = unichr  # Python 2 spelling of "code point -> text"
    except NameError:
        to_char = chr  # Python 3: unichr is gone, chr() returns text

    def _replace(match):
        ref = match.group(1)
        try:
            if ref[0] != '#':
                codepoint = name2codepoint[ref]
            elif ref[1] in 'xX':
                codepoint = int(ref[2:], 16)
            else:
                codepoint = int(ref[1:])
            return to_char(codepoint)
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or out-of-range code point: keep the
            # original text rather than raising or dropping it.
            return match.group(0)

    return _CHAR_REF_RE.sub(_replace, s)
>>> unescape(string)
u'<div property="gr:description" content="<P><TABLE border=0 cellSpacing=0 \ncellPadding=0 width=605><COLGROUP><COL width=605></COL></COLGROUP><TBODY><TR height=121><TD class=xl66 height=121 width=605>\u2022 Open Dimensions:\xa0 29.25\u201dH x 71.71\u201dW x 29.38\u201dD<BR>\u2022 Black blow mold top<BR>\u2022 Durable, easy-to-clean top surface<BR>\u2022 Table has centerfold and convenient handle for easy transport and storage.\xa0 <BR>\u2022 Compact flat fold<BR>\u2022 Strong steel frame.<BR>\u2022 Multi purpose \u2013 can be used for dining, work, crafts, etc.<BR>\u2022 Great for parties, weddings and holidays \u2013 indoors & out.</TD></TR></TBODY></TABLE></P>" xml:lang="en"></div>'
答案 1 :(得分:0)
我使用html5lib来解析HTML字符串。
# Parse an HTML string with html5lib into a DOM tree, then walk that tree.
import html5lib  # needed for html5lib.HTMLParser below
from html5lib import treebuilders, treewalkers

_html = "<your html string>"
# Parser and walker are both configured for the same "dom" tree type.
parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
walker = treewalkers.getTreeWalker("dom")
dom = parser.parse(_html)
stream = walker(dom)
for element in stream:
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(element)  # just a debug output