这是我的代码:
# -*- coding: utf-8 -*-
from lxml import etree
from StringIO import StringIO
def str_repr(el, enc='utf-8'):
text = etree.tostring(el, pretty_print=True, method='xml', encoding=enc)
return text
if __name__ == "__main__":
txt = u'''<img width="280" style= "margin: 10px; float:left;" alt ="привет мир" src = "/[[template]]/image.jpg"><p>[[body]]</p>'''
#tree = etree.fromstring(txt)
parser = etree.HTMLParser()
tree = etree.parse(StringIO(txt), parser)
print str_repr(tree.getroot())#GOOD
path = "//img[@src]"
items = tree.xpath(path)
for l in items:
#y0 = l.attrib['alt']
#print y0
x0 = str_repr(l)#BAD
print x0
这部分:print str_repr(tree.getroot())#GOOD
打印出良好的编码结果(utf8)。
此部分:x0 = str_repr(l)#BAD
以未知编码打印结果。它打印:
<img width="280" style="margin: 10px;float:left;"alt="привет мир" src="
为什么呢?如何以相同的编码获得结果?