参考 - Iterate over python dictionary to retrieve only required rows
我的HTML由外部应用程序生成,格式如下所示。当我使用以下代码处理此HTML输入时:
from xml.etree import ElementTree as ET
# Sample markup as produced by the external application: the header row is
# wrapped in <thead> and the data rows in <tbody>, and some cells hold several
# <p> children (or an <a> link) instead of plain text.
s = """<table class="darshan" style="width: 290px;">
<thead>
<tr>
<th style="background-color: #efefef; width: 55px;">Release</th>
<th style="background-color: #efefef; width: 63px;">REFDB</th>
<th style="background-color: #efefef; width: 151px;">URL</th>
</tr>
</thead>
<tbody>
<tr>
<td style="width: 55px;">3.7.3</td>
<td style="width: 63px;">
<p>12345</p>
<p>232323</p>
<p>4343454</p>
<p>5454554</p>
</td>
<td style="width: 151px;">
<p><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a>
</p>
<p><a class="jive-link-external-small" href="http://test12213.com" rel="nofollow">http://test12213.com</a>
</p>
</td>
</tr>
<tr>
<td style="width: 55px;">3.7.4</td>
<td style="width: 63px;">
<p>456789</p>
<p>54545</p>
<p>5454545</p>
<p>545454</p>
</td>
<td style="width: 151px;"><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a>
</td>
</tr>
</tbody>
</table>
"""
def find_version(ver):
    """Return the row of the module-level markup ``s`` whose release is *ver*.

    The first child of the root element is treated as the header row and
    every following child as a data row; only the direct ``.text`` of each
    cell is read.  Each data row becomes a dict mapping header text to cell
    text, and the first dict whose ``'Release'`` entry equals *ver* is
    returned.  ``None`` is returned when no row matches.

    NOTE: this only works when the rows are direct children of the root.
    If they are wrapped in ``<thead>``/``<tbody>``, the "headers" are read
    from the wrong level, no ``'Release'`` key exists, and the lookup
    raises ``KeyError``.
    """
    root = ET.XML(s)
    row_iter = iter(root)
    # Consume the first child as the header row; the rest are data rows.
    column_names = [cell.text for cell in next(row_iter)]
    records = (
        dict(zip(column_names, (cell.text for cell in row)))
        for row in row_iter
    )
    return next((rec for rec in records if rec['Release'] == ver), None)
# Look up one release and print each (header, value) pair as "header - value".
res = find_version('3.7.3')
if not res:
    print('Version not found')
else:
    for pair in res.items():
        # join() requires both the header and the value to be strings.
        print(' - '.join(pair))
我得到以下输出:
trs: [<Element 'th' at 0x0431CDE0>, <Element 'th' at 0x0431CE40>, <Element 'th' at 0x0431CEA0>]
ths: []
tds: []
out: OrderedDict()
Traceback (most recent call last):
File "parse_html.py", line 141, in <module>
res = find_version(ver)
File "parse_html.py", line 136, in find_version
if out['Release'] == ver:
KeyError: 'Release'
答案 0 :(得分:1)
不多解释 - 请参阅代码和 print() 的输出结果。
from xml.etree import ElementTree as ET
# Simplified sample markup: the header row and the data rows are all direct
# children of <tbody>, and every cell holds plain text or a single <a> link.
s = '''<table>
<tbody>
<tr>
<th>Release</th>
<th>REFDB</th>
<th>URL</th>
</tr>
<tr>
<td>3.7.3</td>
<td>12345</td>
<td><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a>
</td>
</tr>
<tr>
<td>3.7.4</td>
<td>456789</td>
<td><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a>
</td>
</tr>
</tbody>
</table>'''
# --- functions ---
def find_version(ver):
    """Return the row of the module-level markup ``s`` whose release is *ver*.

    The markup is expected to be ``<table><tbody><tr>...``: the first
    ``<tr>`` of the table's first child supplies the header names and every
    later ``<tr>`` is a data row.  The text of each cell is the
    space-joined, stripped concatenation of all text inside it, so a cell
    containing only an ``<a>`` link yields the link text.

    Intermediate values are printed for debugging (``trs``, ``ths``,
    ``tds``, ``out``).

    Returns a dict mapping header text to cell text for the first matching
    row, or ``None`` when the table has no rows or no row matches.
    """
    table = ET.XML(s)

    # Element.getchildren() was removed in Python 3.9; indexing and
    # iterating the element directly gives the same children.
    trs = list(table[0]) if len(table) else []
    print('trs:', trs)
    if not trs:
        return None

    ths = [th.text for th in trs[0]]
    print('ths:', ths)

    for tr in trs[1:]:
        tds = [" ".join(col.itertext()).strip() for col in tr]
        print('tds:', tds)

        out = dict(zip(ths, tds))
        print('out:', out)

        # A row without a 'Release' entry simply does not match.
        if 'Release' in out and out['Release'] == ver:
            return out
    return None
# --- main ---
# Look up one release and print every column as "header - value".
res = find_version('3.7.3')
if not res:
    print('Version not found')
else:
    for field, value in res.items():
        print(f'{field} - {value}')
结果
trs: [<Element 'tr' at 0x7f26d73005e8>, <Element 'tr' at 0x7f26d7300e08>, <Element 'tr' at 0x7f26d7300868>]
ths: ['Release', 'REFDB', 'URL']
tds: ['3.7.3', '12345', 'http://google.com']
out: {'URL': 'http://google.com', 'REFDB': '12345', 'Release': '3.7.3'}
URL - http://google.com
REFDB - 12345
Release - 3.7.3
编辑:适用于新HTML的版本。
我没有再用 getchildren(),而是直接用索引访问子元素 —— 方式相同,而且更短。
不过下面的代码使用的是 findall()
from xml.etree import ElementTree as ET
# Markup in the layout from the question: the header row is wrapped in <thead>
# and the data rows in <tbody>; the REFDB and URL cells hold several <p>
# children (or a bare <a> link) instead of plain text.
s = '''<table class="darshan" style="width: 290px;">
<thead>
<tr>
<th style="background-color: #efefef; width: 55px;">Release</th>
<th style="background-color: #efefef; width: 63px;">REFDB</th>
<th style="background-color: #efefef; width: 151px;">URL</th>
</tr>
</thead>
<tbody>
<tr>
<td style="width: 55px;">3.7.3</td>
<td style="width: 63px;">
<p>12345</p>
<p>232323</p>
<p>4343454</p>
<p>5454554</p>
</td>
<td style="width: 151px;">
<p><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a>
</p>
<p><a class="jive-link-external-small" href="http://test12213.com" rel="nofollow">http://test12213.com</a>
</p>
</td>
</tr>
<tr>
<td style="width: 55px;">3.7.4</td>
<td style="width: 63px;">
<p>456789</p>
<p>54545</p>
<p>5454545</p>
<p>545454</p>
</td>
<td style="width: 151px;"><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a>
</td>
</tr>
</tbody>
</table>'''
# --- functions ---
def find_version(ver):
    """Look up the row for release *ver* in the module-level markup ``s``.

    Header names are taken from every ``<th>`` in the document and rows
    from every ``<tr>`` directly under a ``<tbody>``.  Each row must have
    at least three cells, which are read as follows:

    * cell 0 -- its own text, as a string;
    * cell 1 -- the text of each child element, as a list;
    * cell 2 -- one list entry per child: the child's own text when it is
      an ``<a>``, otherwise the text of that child's first sub-element.

    Returns a dict pairing the header names with those three values for
    the first row whose ``'Release'`` entry equals *ver*, or ``None`` when
    no row matches.  Raises ``KeyError`` when ``'Release'`` is not among
    the first three headers and ``IndexError`` when a row has fewer than
    three cells.
    """
    root = ET.XML(s)

    # For the layout used here, iterating root[0][0] (and root[1] for the
    # rows) would reach the same elements by index instead of by path.
    header_names = [cell.text for cell in root.findall('.//th')]

    for row in root.findall('.//tbody/tr'):
        release_cell, refdb_cell, url_cell = row[0], row[1], row[2]

        refdb_values = [child.text for child in refdb_cell]
        # Links appear either bare (<a>) or wrapped in another element.
        url_values = [
            child.text if child.tag == 'a' else child[0].text
            for child in url_cell
        ]

        record = dict(
            zip(header_names, (release_cell.text, refdb_values, url_values))
        )
        if record['Release'] == ver:
            return record
    return None
# --- main ---
# Look up one release and print every column as "header - value"; list
# values are shown in their default list representation.
res = find_version('3.7.3')
if not res:
    print('Version not found')
else:
    for field, value in res.items():
        print('{} - {}'.format(field, value))
结果
Release - 3.7.3
REFDB - ['12345', '232323', '4343454', '5454554']
URL - ['http://google.com', 'http://test12213.com']
答案 1 :(得分:0)
我将您的输入数据保存为同一个文件夹下的 "data.xml",您可以尝试下面的代码,它可以在我的电脑上运行(它没有足够的异常检查,但应该足以展示正确的做法)
from xml.etree import ElementTree as ET
# Parse the saved table markup once, at module level; find_version() receives
# the resulting ElementTree as an argument.
filename = "data.xml"
tree = ET.parse(filename)
def _cell_text(cell):
    """Return the first non-blank text fragment inside *cell*, or ''."""
    for fragment in cell.itertext():
        stripped = fragment.strip()
        if stripped:
            return stripped
    return ''


def find_version(ver, tree):
    """Return the table row of *tree* whose 'Release' column equals *ver*.

    *tree* is an ``ElementTree`` whose root is a ``<table>``; its children
    (``<thead>``/``<tbody>``) are scanned in document order.  A ``<tr>``
    containing ``<th>`` cells contributes header names; any other ``<tr>``
    is a data row whose ``<td>`` cells are paired with the headers seen so
    far.

    The value of a cell is its first non-blank piece of text, looking into
    nested elements when the cell itself has none (for example a cell that
    only wraps an ``<a>`` or ``<p>``).  A completely empty cell yields
    ``''``.

    The root element is printed for debugging, as in the original snippet.

    Returns a dict mapping header text to cell text for the first matching
    row, or ``None`` when no row matches.
    """
    table = tree.getroot()
    print(table)  # was a Python 2 print statement

    headers = []
    for section in table:
        for tr in section:
            row_headers = [th.text for th in tr.iter('th')]
            if row_headers:
                # Header row: remember the names and skip value matching.
                headers.extend(row_headers)
                continue

            values = [_cell_text(td) for td in tr.iter('td')]
            out = dict(zip(headers, values))
            # Rows without a 'Release' value (e.g. an empty <tr>) never match.
            if 'Release' in out and out['Release'] == ver:
                return out
    return None
# Look up one release, dump the raw dict, then print each column as
# "header - value".  print() is called as a function so the script also runs
# on Python 3 (the original used the Python 2 print statement).
res = find_version('3.7.3', tree)
if res:
    print(res)
    for x in res.items():
        # join() requires both the header and the value to be strings.
        print(' - '.join(x))
else:
    print('Version not found')
输出:
Release - 3.7.3
URL - http://google.com
REFDB - 12345
〜