我正在尝试使用lxml读取MathML文件:
import lxml.etree as et
# Read the XHTML file and return the root element of the parsed document.
def getData(fname):
    """Parse the file *fname* and return the root element.

    The file is read in full, newline characters are removed, and the
    resulting string is handed to ``et.fromstring``.

    Raises:
        lxml.etree.XMLSyntaxError: if the text is not well-formed XML or
            references an entity the parser cannot resolve.
    """
    # Context manager guarantees the file handle is closed, even on error.
    with open(fname) as f:
        d = f.read()
    data = d.replace('\n', '')
    root = et.fromstring(data)
    return root
def main():
    """Parse ``integral_example.xhtml`` and print its serialized form."""
    r = getData('integral_example.xhtml')
    # Parenthesized print is valid syntax under both Python 2 and Python 3.
    print(et.tostring(r))


if __name__ == '__main__':
    main()
integral_example.xhtml:
<!-- begin MathToWeb -->
<!-- (your LaTeX) $ \int_a^b f(x)\,dx. $ -->
<!DOCTYPE math SYSTEM "http://www.w3.org/Math/DTD/mathml1/mathml.dtd">
<math>
<mrow>
<mstyle displaystyle="true">
<munderover>
<mo>&int;</mo>
<mi>a</mi>
<mi>b</mi>
</munderover>
</mstyle>
<mi>f</mi>
<mrow>
<mo maxsize="1.00em" form="prefix">(</mo>
<mi>x</mi>
<mo maxsize="1.00em" form="postfix">)</mo>
</mrow>
<mspace width="0.167em" />
<mo>&dd;</mo>
<mi>x</mi>
<mo>.</mo>
</mrow>
</math>
<!-- end MathToWeb -->
so_question.py是上面给出的Python代码。运行时的输出如下:
Venkateshs-MacBook-Pro:mathml_examples venkatesh $ python so_question.py
Traceback (most recent call last):
File "so_question.py", line 14, in <module>
main()
File "so_question.py", line 11, in main
r = getData('integral_example.xhtml')
File "so_question.py", line 7, in getData
root = et.fromstring(data)
File "lxml.etree.pyx", line 3003, in lxml.etree.fromstring (src/lxml/lxml.etree.c:67277)
File "parser.pxi", line 1785, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:101626)
File "parser.pxi", line 1673, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:100455)
File "parser.pxi", line 1074, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:95637)
File "parser.pxi", line 582, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:90446)
File "parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:91632)
File "parser.pxi", line 622, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:90928)
lxml.etree.XMLSyntaxError: Entity 'int' not defined, line 1, column 197
我哪里错了?
答案 0 :(得分:0)
要从DTD加载实体,请在创建解析器时使用load_dtd=True选项。同时设置no_network=False,以便在查找外部文档(在本例中为http://www.w3.org/Math/DTD/mathml1/mathml.dtd)时允许网络访问。
import lxml.etree as ET
# Parse the XHTML file with DTD loading enabled and return the document root.
def getData(fname):
    """Return the root element of the document stored in *fname*.

    The parser is built with ``load_dtd=True`` and ``no_network=False`` and
    is passed to ``ET.parse`` together with the opened file object.
    """
    with open(fname) as source:
        dtd_parser = ET.XMLParser(load_dtd=True, no_network=False)
        tree = ET.parse(source, parser=dtd_parser)
        return tree.getroot()
def main():
    """Parse ``integral_example.xhtml`` and print its serialized form."""
    r = getData('integral_example.xhtml')
    # Parenthesized print is valid syntax under both Python 2 and Python 3.
    print(ET.tostring(r))


if __name__ == '__main__':
    main()
输出
<!DOCTYPE math SYSTEM "http://www.w3.org/Math/DTD/mathml1/mathml.dtd">
<math>
<mrow>
<mstyle displaystyle="true">
<munderover>
<mo>&#8747;</mo>
<mi>a</mi>
<mi>b</mi>
</munderover>
</mstyle>
<mi>f</mi>
<mrow>
<mo maxsize="1.00em" form="prefix">(</mo>
<mi>x</mi>
<mo maxsize="1.00em" form="postfix">)</mo>
</mrow>
<mspace width="0.167em"/>
<mo></mo>
<mi>x</mi>
<mo>.</mo>
</mrow>
</math><!-- end MathToWeb -->
如果您不想解析实体,则可以使用
parser = ET.XMLParser(resolve_entities=False)
代替;这样更快,但
<mo>&int;</mo>
不会被转换为
<mo>&#8747;</mo>