用ElementTree解析锚标签和tbody

时间:2016-11-19 09:42:28

标签: python python-3.x dictionary xpath elementtree

参考 - Iterate over python dictionary to retrieve only required rows

我的HTML由外部应用程序生成,格式如下所示。当我使用以下代码处理此HTML输入时:
        from xml.etree import ElementTree as ET

        s = """<table class="darshan" style="width: 290px;">
    <thead>
        <tr>
            <th style="background-color: #efefef; width: 55px;">Release</th>
            <th style="background-color: #efefef; width: 63px;">REFDB</th>
            <th style="background-color: #efefef; width: 151px;">URL</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td style="width: 55px;">3.7.3</td>
            <td style="width: 63px;">
                <p>12345</p>
                <p>232323</p>
                <p>4343454</p>
                <p>5454554</p>
            </td>
            <td style="width: 151px;">
                <p><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a>
                </p>
                <p><a class="jive-link-external-small" href="http://test12213.com" rel="nofollow">http://test12213.com</a>
                </p>
            </td>
        </tr>
        <tr>
            <td style="width: 55px;">3.7.4</td>
            <td style="width: 63px;">
                <p>456789</p>
                <p>54545</p>
                <p>5454545</p>
                <p>545454</p>
            </td>
            <td style="width: 151px;"><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a>
            </td>
        </tr>
    </tbody>
</table>
        """
        def find_version(ver):
            """Look up the table row whose 'Release' column equals ``ver``.

            Parses the module-level HTML string ``s``. Intended to return a dict
            mapping header text to cell text for the first matching row, or None
            when no row matches.
            """
            table = ET.XML(s)
            # NOTE(review): iterating an Element yields only its direct children.
            # In ``s`` those are <thead> and <tbody>, not the <tr> rows.
            rows = iter(table)
            # The first child is <thead>, so ``col`` is its <tr> element and
            # ``col.text`` is that element's leading whitespace, not a header name.
            headers = [col.text for col in next(rows)]
            for row in rows:
                # ``row`` is <tbody> here; its children are <tr> elements, so the
                # values are whitespace strings as well.
                values = [col.text for col in row]
                out = dict(zip(headers, values))
                # 'Release' was never collected as a header, which is why this
                # lookup raises KeyError.
                if out['Release'] == ver:
                    return out

            return None

        # Look up one release and print every "header - value" pair of its row.
        res = find_version('3.7.3')
        if not res:
            print('Version not found')
        else:
            for header, cell in res.items():
                print(' - '.join((header, cell)))

我得到以下输出:

trs: [<Element 'th' at 0x0431CDE0>, <Element 'th' at 0x0431CE40>, <Element 'th' at 0x0431CEA0>]
ths: []
tds: []
out: OrderedDict()
Traceback (most recent call last):
  File "parse_html.py", line 141, in <module>
    res = find_version(ver)
  File "parse_html.py", line 136, in find_version
    if out['Release'] == ver:
KeyError: 'Release'

2 个答案:

答案 0 :(得分:1)

没有评论 - 请参阅代码和print()结果。

from xml.etree import ElementTree as ET

s = '''<table>
    <tbody>
        <tr>
            <th>Release</th>
            <th>REFDB</th>
            <th>URL</th>
        </tr>
        <tr>
            <td>3.7.3</td>
            <td>12345</td>
            <td><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a>
            </td>
        </tr>
        <tr>
            <td>3.7.4</td>
            <td>456789</td>
            <td><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a>
            </td>
        </tr>
    </tbody>
</table>'''

# --- functions ---

def find_version(ver):
    """Return the row of the table in ``s`` whose 'Release' cell equals ``ver``.

    The first <tr> inside the table's first child (<tbody>) supplies the
    column names; every following <tr> is a data row. Intermediate results
    are printed so the parsing steps can be followed.

    Args:
        ver: Release string to look for, e.g. '3.7.3'.

    Returns:
        A dict mapping header text to the whitespace-stripped text of the
        matching row's cells, or None when no row matches.
    """
    table = ET.XML(s)

    # Element.getchildren() was removed in Python 3.9. An Element behaves as
    # a sequence of its children, so indexing and list() replace it.
    trs = list(table[0])
    print('trs:', trs)

    ths = [th.text for th in trs[0]]
    print('ths:', ths)

    for tr in trs[1:]:
        # itertext() also collects text of nested elements such as <a>.
        tds = [" ".join(col.itertext()).strip() for col in tr]
        print('tds:', tds)

        out = dict(zip(ths, tds))
        print('out:', out)

        if out['Release'] == ver:
            return out

    return None

# --- main ---

res = find_version('3.7.3')

# Report the matching row, one "header - value" line per column.
if not res:
    print('Version not found')
else:
    for header, cell in res.items():
        print(header, '-', cell)

结果

trs: [<Element 'tr' at 0x7f26d73005e8>, <Element 'tr' at 0x7f26d7300e08>, <Element 'tr' at 0x7f26d7300868>]
ths: ['Release', 'REFDB', 'URL']
tds: ['3.7.3', '12345', 'http://google.com']
out: {'URL': 'http://google.com', 'REFDB': '12345', 'Release': '3.7.3'}
URL - http://google.com
REFDB - 12345
Release - 3.7.3

编辑:新HTML的版本。

我去掉了getchildren(),直接使用索引也能达到同样的效果,而且代码更短。不过使用findall()更具可读性。
from xml.etree import ElementTree as ET

s = '''<table class="darshan" style="width: 290px;">
    <thead>
        <tr>
            <th style="background-color: #efefef; width: 55px;">Release</th>
            <th style="background-color: #efefef; width: 63px;">REFDB</th>
            <th style="background-color: #efefef; width: 151px;">URL</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td style="width: 55px;">3.7.3</td>
            <td style="width: 63px;">
                <p>12345</p>
                <p>232323</p>
                <p>4343454</p>
                <p>5454554</p>
            </td>
            <td style="width: 151px;">
                <p><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a>
                </p>
                <p><a class="jive-link-external-small" href="http://test12213.com" rel="nofollow">http://test12213.com</a>
                </p>
            </td>
        </tr>
        <tr>
            <td style="width: 55px;">3.7.4</td>
            <td style="width: 63px;">
                <p>456789</p>
                <p>54545</p>
                <p>5454545</p>
                <p>545454</p>
            </td>
            <td style="width: 151px;"><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a>
            </td>
        </tr>
    </tbody>
</table>'''

# --- functions ---

def find_version(ver):
    """Return the row of the table in ``s`` whose 'Release' cell equals ``ver``.

    The result maps the three header texts to: the release string, the list
    of texts of the second cell's children, and the list of link texts found
    in the third cell. Returns None when no row matches.
    """
    root = ET.XML(s)

    # Column names come from every <th>, in document order.
    # Index-based equivalent: root[0][0] is the header row and root[1] holds
    # the data rows; findall() is used because it reads better.
    header_names = [cell.text for cell in root.findall('.//th')]

    for row in root.findall('.//tbody/tr'):
        release_text = row[0].text
        refdb_texts = [child.text for child in row[1]]
        # A link is either a direct <a> child of the cell or the first child
        # of a wrapping element such as <p>.
        link_texts = [
            child.text if child.tag == 'a' else child[0].text
            for child in row[2]
        ]

        record = dict(zip(header_names, [release_text, refdb_texts, link_texts]))
        if record['Release'] == ver:
            return record

    return None

# --- main ---

res = find_version('3.7.3')

# Print each column of the matching row, or a notice when nothing matched.
if not res:
    print('Version not found')
else:
    for column, content in res.items():
        print(column, '-', content)

结果

Release - 3.7.3
REFDB - ['12345', '232323', '4343454', '5454554']
URL - ['http://google.com', 'http://test12213.com']

答案 1 :(得分:0)

我将您的输入数据保存为同一文件夹下的"data.xml",您可以尝试下面的代码,它在我的电脑上可以运行(异常检查还不够完善,但足以展示正确的做法)。

from xml.etree import ElementTree as ET                                      

# Parse the saved table markup once; find_version() receives the parsed tree.
filename = "data.xml"
tree = ET.parse(filename)

def _cell_text(cell):
    """Return all text inside *cell*, with runs of whitespace collapsed."""
    return " ".join("".join(cell.itertext()).split())


def find_version(ver, tree):
    """Return the row of the table in *tree* whose 'Release' cell equals *ver*.

    Args:
        ver: Release string to look for, e.g. '3.7.3'.
        tree: Parsed ElementTree whose root is the <table> element.

    Returns:
        A dict mapping each header (<th>) text to the text of the
        corresponding <td> of the first matching row, or None when no row
        matches.

    Raises:
        KeyError: If a data row is reached but no 'Release' header was
            collected before it.
    """
    table = tree.getroot()
    print(table)
    headers = []
    for section in table:  # direct children of the root, e.g. <thead>/<tbody>
        for tr in section:
            values = []
            is_data_row = True
            for elem in tr.iter():
                if elem.tag == 'th':
                    headers.append(_cell_text(elem))
                    is_data_row = False
                elif elem.tag == 'td':
                    # Use the full text content: a cell whose own text is
                    # only whitespace (or missing) may still hold a link.
                    values.append(_cell_text(elem))
            # Rows without any <td> carry nothing to compare against.
            if is_data_row and values:
                out = dict(zip(headers, values))
                if out['Release'] == ver:
                    return out
    return None

# Look up one release in the parsed file and print its columns.
res = find_version('3.7.3', tree)
if res:
    # print is a function on Python 3; the statement form is a SyntaxError.
    print(res)
    for key, value in res.items():
        print(' - '.join((key, value)))
else:
    print('Version not found')

输出:

Release - 3.7.3
URL - http://google.com
REFDB - 12345