如何使用beautifulsoup4遍历HTML中的表格并将某些值存储到数组中

时间:2018-07-11 15:12:22

标签: python html beautifulsoup

您将如何使用for循环存储参数的值,以便像这样打印结果:

焦点阈值结果(Focus Threshold Results):

{'Centre_Threhold:': ('min val', 'max val'), 'Corner_Threshold:': ('min val', 'max val')}

下面是我正在处理的表格的一部分,以及我正在测试的Python脚本。

HTML:

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Threholds:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Focus_Threhold:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Centre_Threhold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Corner_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;MTF_Threshold:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Centre_Threhold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Corner_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Centration_Threhold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;XY_Threshold:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Rotation_Threshold:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Color_Bar_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Threshold:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Color_Chart_Threhold:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Hue_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Luminance_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
    <td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>

Python:

# Collect label/value pairs from the rows that follow the first "Focus_"
# header, stopping after the first "Maximum" row.
# NOTE(review): `all_labels` and bs4's `Tag` are not defined in this snippet;
# presumably they come from the surrounding script - confirm.
focus_threshold_results = {}
for label in all_labels:
    if 'Focus_' in label.text:
        for sib in label.parent.next_siblings:
            # next_siblings also yields the whitespace strings between rows.
            if not isinstance(sib, Tag):
                continue
            label_cell = sib.find('td', class_='label')
            value_cell = sib.find('td', class_='value')
            # Header rows (colspan='2') have no value cell: find() returns
            # None there, and calling .text on it raises AttributeError.
            if label_cell is None or value_cell is None:
                continue
            lab = label_cell.text.strip()
            val = value_cell.text.strip()
            focus_threshold_results[lab] = val
            if 'Maximum' in lab:
                break
        # Only the first "Focus_" header is processed.
        break
print("Focus Threshold Results:    " + str(focus_threshold_results))

2 个答案:

答案 0 :(得分:0)

def set_res(res, last_labs, val):
    """Store ``val`` in the nested dict ``res`` under the key path ``last_labs``.

    Missing intermediate dictionaries are created on the way down, so
    ``set_res(res, ['a', 'b'], '1')`` leaves ``res == {'a': {'b': 1}}``.

    Args:
        res: Dictionary that is updated in place.
        last_labs: Sequence of keys, outermost first. An empty sequence
            leaves ``res`` untouched.
        val: Value to store. A string holding a Python literal (``'0'``,
            ``'1.5'``) is converted to that literal; any other value is
            stored unchanged.
    """
    # Local import keeps the snippet self-contained.
    import ast

    if not last_labs:
        return

    parsed = val
    if isinstance(val, str):
        # literal_eval only accepts literals, so scraped text is never
        # executed as code (unlike exec on a formatted source string).
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError):
            parsed = val  # not a literal: keep the raw text

    node = res
    for key in last_labs[:-1]:
        node = node.setdefault(key, {})
    node[last_labs[-1]] = parsed

# Walk the rows after the first "Focus_" header and build a nested result:
# a header row (no value cell) opens a level, a value row is stored under the
# currently open path via set_res().
all_labels = soup.find_all("td", class_="label")
focus_threshold_results = {}
last_labs = []
for label in all_labels:
    if 'Focus_' not in label.text:
        continue
    for row in label.parent.next_siblings:
        # Skip the text nodes that sit between <tr> elements.
        if not isinstance(row, Tag):
            continue
        name = row.find('td', class_='label').text.strip()
        # The next top-level section marks the end of the Focus block.
        if 'MTF_Threshold' in name:
            break
        value_cell = row.find('td', class_='value')
        last_labs.append(name)
        if not value_cell:
            # Header row: keep its name on the path for the rows below it.
            continue
        set_res(focus_threshold_results, last_labs, value_cell.text.strip())
        # A "Maximum" row closes its group, so its parent header is dropped
        # together with the row itself; other value rows drop only themselves.
        drop = 2 if 'Maximum' in name else 1
        last_labs = last_labs[:-drop]
    # Only the first "Focus_" header is processed.
    break
print("Focus Threshold Results:    " + str(focus_threshold_results))

答案 1 :(得分:0)

首先,在处理之前请先清理HTML!您的Centre_Threhold有拼写错误——应该是Centre_Threshold。为了能看出结果的差异,我修改了HTML中部分最小值和最大值:

s = """
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Threholds:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Focus_Threhold:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Centre_Threhold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>10</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Corner_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;MTF_Threshold:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Centre_Threhold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Corner_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>4</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>5</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Centration_Threhold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;XY_Threshold:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Rotation_Threshold:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Color_Bar_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Threshold:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Color_Chart_Threhold:</td>
</tr>

<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Hue_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' colspan='2' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Luminance_Threshold:</td>
</tr>

<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Minimum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
<tr><td class='label' style="border-color:#000000;background-color:#FFFFFF;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Maximum:</td>
<td class='value' style="border-color:#000000;background-color:#32FFFF;"><span style='white-space:nowrap;'>0</span></td></tr>
"""

from bs4 import BeautifulSoup as bs
from bs4 import Tag
from pprint import pprint
import re

b = bs(s, 'lxml')

# Patterns are compiled once instead of once per table row; their text is
# unchanged. The HTML spells the first label "Centre_Threhold" (sic), so the
# pattern has to match that typo.
_CENTRE_HEADER = re.compile('.*?(Centre_Threhold)')
_CORNER_HEADER = re.compile('.*?(Corner_Threshold)')
_MINIMUM_LABEL = re.compile('.*?(Minimum)')
_MAXIMUM_LABEL = re.compile('.*?(Maximum)')


def _min_max_after(header_cell):
    """Return ``(minimum, maximum)`` read from the cells after ``header_cell``.

    For each of the Minimum and Maximum labels, the next matching label cell
    is located and the ``<td>`` right after it is parsed as an int. Returns
    ``None`` when any of those cells is missing.
    """
    values = []
    for label_pattern in (_MINIMUM_LABEL, _MAXIMUM_LABEL):
        label_cell = header_cell.find_next('td', string=label_pattern)
        value_cell = None if label_cell is None else label_cell.find_next('td')
        if value_cell is None:
            return None
        values.append(int(value_cell.text))
    return tuple(values)


def _threshold_range(soup, header_pattern):
    """Return the smallest minimum and largest maximum over all matching headers.

    Every ``<td>`` whose string matches ``header_pattern`` contributes one
    (minimum, maximum) pair. ``(0, 0)`` is returned when no pair is found.
    """
    minimums = []
    maximums = []
    for header_cell in soup.find_all('td', string=header_pattern):
        pair = _min_max_after(header_cell)
        if pair is None:
            continue
        minimums.append(pair[0])
        maximums.append(pair[1])
    # Aggregate over the collected values rather than seeding with 0: a seed
    # of 0 would hide every positive minimum and every negative maximum.
    return min(minimums, default=0), max(maximums, default=0)


min_center_threshold, max_center_threshold = _threshold_range(b, _CENTRE_HEADER)
min_corner_threshold, max_corner_threshold = _threshold_range(b, _CORNER_HEADER)

# Matching headers are taken from the whole document, not from one section.
focus_threshold_results = {
    'Centre_Threshold': (min_center_threshold, max_center_threshold),
    'Corner_Threshold': (min_corner_threshold, max_corner_threshold),
}

print(f"Focus Threshold: Centre_Threshold: ('{min_center_threshold}', '{max_center_threshold}'), Corner_Threshold: ('{min_corner_threshold}', '{max_corner_threshold}')")

输出:

Focus Threshold: Centre_Threshold: ('0', '10'), Corner_Threshold: ('0', '5')