我使用Python3中的fitz库将XPS文件中的文本信息提取到XML中,以保留每个字符在文档中的位置。 我正在尝试将XML转换为Excel电子表格以供查看。
数据看起来像:
<page width="612" height="792">
<block bbox="122.28 93.597667 154.99869 157.9107">
<line bbox="124.799999 93.597667 154.99869 103.6707" wmode="0" dir="1 0">
<font name="Arial Regular" size="9.016425">
<char quad="124.799999 93.597667 127.305049 93.597667 124.799999 103.6707 127.305049 103.6707" x="124.799999" y="101.759998" c=" "/>
<char quad="127.30505 93.597667 129.8101 93.597667 127.30505 103.6707 129.8101 103.6707" x="127.30505" y="101.759998" c=" "/>
<char quad="129.8101 93.597667 132.31516 93.597667 129.8101 103.6707 132.31516 103.6707" x="129.8101" y="101.759998" c=" "/>
<char quad="132.31516 93.597667 134.8202 93.597667 132.31516 103.6707 134.8202 103.6707" x="132.31516" y="101.759998" c=" "/>
<char quad="134.82022 93.597667 137.32527 93.597667 134.82022 103.6707 137.32527 103.6707" x="134.82022" y="101.759998" c=" "/>
<char quad="137.32527 93.597667 142.33979 93.597667 137.32527 103.6707 142.33979 103.6707" x="137.32527" y="101.759998" c="1"/>
<char quad="142.33977 93.597667 147.35428 93.597667 142.33977 103.6707 147.35428 103.6707" x="142.33977" y="101.759998" c="8"/>
<char quad="147.47913 93.597667 152.49364 93.597667 147.47913 103.6707 152.49364 103.6707" x="147.47913" y="101.759998" c="2"/>
<char quad="152.49364 93.597667 154.99869 93.597667 152.49364 103.6707 154.99869 103.6707" x="152.49364" y="101.759998" c=" "/>
</font>
</line>
<line bbox="127.31999 104.39767 154.93863 114.4707" wmode="0" dir="1 0">
<font name="Arial Regular" size="9.016425">
<char quad="127.31999 104.39767 129.82505 104.39767 127.31999 114.4707 129.82505 114.4707" x="127.31999" y="112.56" c=" "/>
<char quad="129.82505 104.39767 132.3301 104.39767 129.82505 114.4707 132.3301 114.4707" x="129.82505" y="112.56" c=" "/>
<char quad="132.33011 104.39767 134.83516 104.39767 132.33011 114.4707 134.83516 114.4707" x="132.33011" y="112.56" c=" "/>
<char quad="134.83516 104.39767 137.34021 104.39767 134.83516 114.4707 137.34021 114.4707" x="134.83516" y="112.56" c=" "/>
<char quad="137.34021 104.39767 139.84526 104.39767 137.34021 114.4707 139.84526 114.4707" x="137.34021" y="112.56" c=" "/>
<char quad="139.84528 104.39767 142.35033 104.39767 139.84528 114.4707 142.35033 114.4707" x="139.84528" y="112.56" c=" "/>
<char quad="142.46004 104.39767 147.47455 104.39767 142.46004 114.4707 147.47455 114.4707" x="142.46004" y="112.56" c="2"/>
<char quad="147.41907 104.39767 152.43358 104.39767 147.41907 114.4707 152.43358 114.4707" x="147.41907" y="112.56" c="0"/>
<char quad="152.43358 104.39767 154.93863 104.39767 152.43358 114.4707 154.93863 114.4707" x="152.43358" y="112.56" c=" "/>
</font>
</line>
<line bbox="124.799999 115.317668 154.99869 125.3907" wmode="0" dir="1 0">
<font name="Arial Regular" size="9.016425">
<char quad="124.799999 115.317668 127.305049 115.317668 124.799999 125.3907 127.305049 125.3907" x="124.799999" y="123.479999" c=" "/>
<char quad="127.30505 115.317668 129.8101 115.317668 127.30505 125.3907 129.8101 125.3907" x="127.30505" y="123.479999" c=" "/>
<char quad="129.8101 115.317668 132.31516 115.317668 129.8101 125.3907 132.31516 125.3907" x="129.8101" y="123.479999" c=" "/>
<char quad="132.31516 115.317668 134.8202 115.317668 132.31516 125.3907 134.8202 125.3907" x="132.31516" y="123.479999" c=" "/>
<char quad="134.82022 115.317668 137.32527 115.317668 134.82022 125.3907 137.32527 125.3907" x="134.82022" y="123.479999" c=" "/>
<char quad="137.32527 115.317668 142.33979 115.317668 137.32527 125.3907 142.33979 125.3907" x="137.32527" y="123.479999" c="1"/>
<char quad="142.33977 115.317668 147.35428 115.317668 142.33977 125.3907 147.35428 125.3907" x="142.33977" y="123.479999" c="4"/>
<char quad="147.47913 115.317668 152.49364 115.317668 147.47913 125.3907 152.49364 125.3907" x="147.47913" y="123.479999" c="1"/>
<char quad="152.49364 115.317668 154.99869 115.317668 152.49364 125.3907 154.99869 125.3907" x="152.49364" y="123.479999" c=" "/>
</font>
</line>
<line bbox="122.28 126.11767 154.93267 136.1907" wmode="0" dir="1 0">
<font name="Arial Regular" size="9.016425">
<char quad="122.28 126.11767 124.78505 126.11767 122.28 136.1907 124.78505 136.1907" x="122.28" y="134.28" c=" "/>
<char quad="124.78505 126.11767 127.2901 126.11767 124.78505 136.1907 127.2901 136.1907" x="124.78505" y="134.28" c=" "/>
<char quad="127.2901 126.11767 129.79515 126.11767 127.2901 136.1907 129.79515 136.1907" x="127.2901" y="134.28" c=" "/>
<char quad="129.79517 126.11767 132.30022 126.11767 129.79517 136.1907 132.30022 136.1907" x="129.79517" y="134.28" c=" "/>
<char quad="132.30022 126.11767 137.31473 126.11767 132.30022 136.1907 137.31473 136.1907" x="132.30022" y="134.28" c="1"/>
<char quad="137.31472 126.11767 142.32923 126.11767 137.31472 136.1907 142.32923 136.1907" x="137.31472" y="134.28" c="4"/>
<char quad="142.45407 126.11767 147.46858 126.11767 142.45407 136.1907 147.46858 136.1907" x="142.45407" y="134.28" c="7"/>
<char quad="147.41312 126.11767 152.42763 126.11767 147.41312 136.1907 152.42763 136.1907" x="147.41312" y="134.28" c="2"/>
<char quad="152.42761 126.11767 154.93267 126.11767 152.42761 136.1907 154.93267 136.1907" x="152.42761" y="134.28" c=" "/>
</font>
</line>
<line bbox="122.28 136.91767 154.93267 146.9907" wmode="0" dir="1 0">
<font name="Arial Regular" size="9.016425">
<char quad="122.28 136.91767 124.78505 136.91767 122.28 146.9907 124.78505 146.9907" x="122.28" y="145.08" c=" "/>
<char quad="124.78505 136.91767 127.2901 136.91767 124.78505 146.9907 127.2901 146.9907" x="124.78505" y="145.08" c=" "/>
<char quad="127.2901 136.91767 129.79515 136.91767 127.2901 146.9907 129.79515 146.9907" x="127.2901" y="145.08" c=" "/>
<char quad="129.79517 136.91767 132.30022 136.91767 129.79517 146.9907 132.30022 146.9907" x="129.79517" y="145.08" c=" "/>
<char quad="132.30022 136.91767 137.31473 136.91767 132.30022 146.9907 137.31473 146.9907" x="132.30022" y="145.08" c="2"/>
<char quad="137.31472 136.91767 142.32923 136.91767 137.31472 146.9907 142.32923 146.9907" x="137.31472" y="145.08" c="9"/>
<char quad="142.45407 136.91767 147.46858 136.91767 142.45407 146.9907 147.46858 146.9907" x="142.45407" y="145.08" c="3"/>
<char quad="147.41312 136.91767 152.42763 136.91767 147.41312 146.9907 152.42763 146.9907" x="147.41312" y="145.08" c="4"/>
<char quad="152.42761 136.91767 154.93267 136.91767 152.42761 146.9907 154.93267 146.9907" x="152.42761" y="145.08" c=" "/>
</font>
</line>
<line bbox="124.799999 147.83768 154.99869 157.9107" wmode="0" dir="1 0">
<font name="Arial Regular" size="9.016425">
<char quad="124.799999 147.83768 127.305049 147.83768 124.799999 157.9107 127.305049 157.9107" x="124.799999" y="156" c=" "/>
<char quad="127.30505 147.83768 129.8101 147.83768 127.30505 157.9107 129.8101 157.9107" x="127.30505" y="156" c=" "/>
<char quad="129.8101 147.83768 132.31516 147.83768 129.8101 157.9107 132.31516 157.9107" x="129.8101" y="156" c=" "/>
<char quad="132.31516 147.83768 134.8202 147.83768 132.31516 157.9107 134.8202 157.9107" x="132.31516" y="156" c=" "/>
<char quad="134.82022 147.83768 137.32527 147.83768 134.82022 157.9107 137.32527 157.9107" x="134.82022" y="156" c=" "/>
<char quad="137.32527 147.83768 142.33979 147.83768 137.32527 157.9107 142.33979 157.9107" x="137.32527" y="156" c="7"/>
<char quad="142.33977 147.83768 147.35428 147.83768 142.33977 157.9107 147.35428 157.9107" x="142.33977" y="156" c="6"/>
<char quad="147.47913 147.83768 152.49364 147.83768 147.47913 157.9107 152.49364 157.9107" x="147.47913" y="156" c="4"/>
<char quad="152.49364 147.83768 154.99869 147.83768 152.49364 157.9107 154.99869 157.9107" x="152.49364" y="156" c=" "/>
</font>
</line>
</block>
我需要将其格式化为(当解析为excel电子表格时):
182 3%
20 0%
141 3%
1472 27%
2934 53%
764 14%
我尝试使用 Python3 的 xml.etree.ElementTree 库遍历页面中的块。问题是,我不知道如何排版文本,才能让相邻的块准确地出现在各自对应的 x、y 坐标处,因为页面上还有其他块,它们的 x、y 坐标位于相距很远的页面角落。有谁知道如何保留这种版面格式?
答案 0 :(得分:0)
您可以使用 ET.findall() 函数遍历每条记录(block 内的 line,以及 line 内 font 下的 char)。
在第二个(内层)循环中,您可以把 char.attrib['c'] 给出的字符(字符串形式的数字)拼接起来,并将结果保存到列表中。
要将值写入excel,您可以尝试openpyxl。
# Convert character-level XML text output (one <char c="..."/> element per
# glyph, grouped under <line> elements) into a two-column Excel sheet:
# column A holds the integer found on each line, column B holds that value's
# share of the column total, expressed in percent.

XMLPATH = "data.xml"
XLSXPATH = "data.xlsx"

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


def read_line_values(root):
    """Return the integer written on each <line> element below *root*.

    The ``c`` attributes of all <char> descendants of a line are joined in
    document order and surrounding whitespace is removed, e.g. the characters
    ``' ', '1', '8', '2', ' '`` become ``182``.

    Lines whose joined text is empty or is not an integer are skipped, so
    unrelated text elsewhere in the document does not abort the conversion.

    Args:
        root: An ElementTree element that contains the <line> elements at any
            depth (wrapper elements between root, line and char are ignored).

    Returns:
        A list of ints, one per numeric line, in document order.
    """
    values = []
    for line in root.iter("line"):
        # A missing ``c`` attribute contributes nothing instead of raising.
        text = "".join(char.get("c", "") for char in line.iter("char")).strip()
        try:
            values.append(int(text))  # ' 182 ' -> 182, ' 20 ' -> 20, ...
        except ValueError:
            # Blank or non-numeric line: not part of the numeric column.
            continue
    return values


def percent_shares(values):
    """Return each value's share of ``sum(values)`` in percent.

    Args:
        values: A sequence of numbers.

    Returns:
        A list of floats with the same length as *values*. When the total is
        zero every share is reported as ``0.0`` rather than dividing by zero.
    """
    total = sum(values)
    if total == 0:
        return [0.0 for _ in values]
    return [value / total * 100 for value in values]  # 3.30, 0.36, ...


def write_sheet(values, xlsx_path):
    """Write *values* to column A and their percent shares to column B.

    Args:
        values: Integers to store, one per row starting at row 1.
        xlsx_path: Destination path of the .xlsx workbook.
    """
    # Imported here so the parsing helpers above stay usable when openpyxl
    # is not installed.
    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    shares = percent_shares(values)
    for row, (value, share) in enumerate(zip(values, shares), start=1):
        ws["A{}".format(row)] = value  # A1, A2, ... = 182, 20, ...
        ws["B{}".format(row)] = share  # B1, B2, ... = 3.30, 0.36, ...
    wb.save(xlsx_path)


def main(xml_path=XMLPATH, xlsx_path=XLSXPATH):
    """Parse *xml_path*, print the extracted values and save *xlsx_path*."""
    root = ET.parse(xml_path).getroot()
    values = read_line_values(root)
    for value in values:
        print(value)
    write_sheet(values, xlsx_path)


if __name__ == "__main__":
    main()