如果我有下面的字符串,我该如何提取其中的某个值,例如位于 <td>WSI_05</td> <td> 和 </td> 之间的 0.01?
<html xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:msxsl="urn:schemas-microsoft-com:xslt"> <head> <META http-equiv="Content-Type" content="text/html"> </head> <body style="margin:0px 0px 0px 0px;overflow:auto;background:#FFFFFF;"> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-collapse:collapse;padding:3px 3px 3px 3px"> <tr style="text-align:center;font-weight:bold;background:#9CBCE2"> <td>1</td> </tr> <tr> <td> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-spacing:0px; padding:3px 3px 3px 3px"> <tr> <td>Wweigh_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>FID</td> <td>0</td> </tr> <tr> <td>BAS34S_ID</td> <td>1</td> </tr> <tr bgcolor="#D4E4F3"> <td>ROUT_AREA</td> <td>28</td> </tr> <tr> <td>COUNTRY_NR</td> <td>304</td> </tr> <tr bgcolor="#D4E4F3"> <td>AVL6190_KM</td> <td>0.00002</td> </tr> <tr> <td>TOTWWD95_K</td> <td>0</td> </tr> <tr bgcolor="#D4E4F3"> <td>WTA_95</td> <td>0</td> </tr> <tr> <td>WSI_01</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_02</td> <td>0.01</td> </tr> <tr> <td>WSI_03</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_04</td> <td>0.01</td> </tr> <tr> <td>WSI_05</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_06</td> <td>0.01</td> </tr> <tr> <td>WSI_07</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_08</td> <td>0.01</td> </tr> <tr> <td>WSI_09</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_10</td> <td>0.01</td> </tr> <tr> <td>WSI_11</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_12</td> <td>0.01</td> </tr> <tr> <td>Avg_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>GeoM_mWSI</td> <td>0.01</td> </tr> <tr> <td>orig_WSI</td> <td>0.01</td> </tr> </table> </td> </tr> </table> </body> </html>
答案 0 :(得分:2)
您可以使用bs4.BeautifulSoup:
from bs4 import BeautifulSoup
# Parse the markup with the pure-Python 'html.parser' backend.
soup = BeautifulSoup(my_html, 'html.parser')
# Text of every <td> cell, in document order.
all_td = [cell.get_text() for cell in soup.find_all('td')]
# Pair each cell with the cell that follows it and keep the pairs whose first
# cell is a WSI_xx label; the second element is then that label's value.
[(label, value) for label, value in zip(all_td[:-1], all_td[1:]) if label.startswith('WSI_')]
输出:
[('WSI_01', '0.01'),
('WSI_02', '0.01'),
('WSI_03', '0.01'),
('WSI_04', '0.01'),
('WSI_05', '0.01'),
('WSI_06', '0.01'),
('WSI_07', '0.01'),
('WSI_08', '0.01'),
('WSI_09', '0.01'),
('WSI_10', '0.01'),
('WSI_11', '0.01'),
('WSI_12', '0.01')]
答案 1 :(得分:1)
import re
# Sample document to search: a one-line HTML page whose inner table holds
# label/value pairs as adjacent <td> cells.
data = '<html xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:msxsl="urn:schemas-microsoft-com:xslt"> <head> <META http-equiv="Content-Type" content="text/html"> </head> <body style="margin:0px 0px 0px 0px;overflow:auto;background:#FFFFFF;"> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-collapse:collapse;padding:3px 3px 3px 3px"> <tr style="text-align:center;font-weight:bold;background:#9CBCE2"> <td>1</td> </tr> <tr> <td> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-spacing:0px; padding:3px 3px 3px 3px"> <tr> <td>Wweigh_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>FID</td> <td>0</td> </tr> <tr> <td>BAS34S_ID</td> <td>1</td> </tr> <tr bgcolor="#D4E4F3"> <td>ROUT_AREA</td> <td>28</td> </tr> <tr> <td>COUNTRY_NR</td> <td>304</td> </tr> <tr bgcolor="#D4E4F3"> <td>AVL6190_KM</td> <td>0.00002</td> </tr> <tr> <td>TOTWWD95_K</td> <td>0</td> </tr> <tr bgcolor="#D4E4F3"> <td>WTA_95</td> <td>0</td> </tr> <tr> <td>WSI_01</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_02</td> <td>0.01</td> </tr> <tr> <td>WSI_03</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_04</td> <td>0.01</td> </tr> <tr> <td>WSI_05</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_06</td> <td>0.01</td> </tr> <tr> <td>WSI_07</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_08</td> <td>0.01</td> </tr> <tr> <td>WSI_09</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_10</td> <td>0.01</td> </tr> <tr> <td>WSI_11</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_12</td> <td>0.01</td> </tr> <tr> <td>Avg_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>GeoM_mWSI</td> <td>0.01</td> </tr> <tr> <td>orig_WSI</td> <td>0.01</td> </tr> </table> </td> </tr> </table> </body> </html>'
# Raw string so regex escapes reach `re` unchanged. `\s*` accepts any amount
# of whitespace (none, several spaces, a newline) between the label cell and
# the value cell instead of requiring exactly one space; the non-greedy group
# captures the value cell's content up to its closing tag.
pattern = re.compile(r'<td>WSI_05</td>\s*<td>(.*?)</td>')
results = pattern.findall(data)
print(results)
输出:
['0.01']
答案 2 :(得分:1)
使用BeautifulSoup
例如:
from bs4 import BeautifulSoup
html = """<html xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:msxsl="urn:schemas-microsoft-com:xslt"> <head> <META http-equiv="Content-Type" content="text/html"> </head> <body style="margin:0px 0px 0px 0px;overflow:auto;background:#FFFFFF;"> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-collapse:collapse;padding:3px 3px 3px 3px"> <tr style="text-align:center;font-weight:bold;background:#9CBCE2"> <td>1</td> </tr> <tr> <td> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-spacing:0px; padding:3px 3px 3px 3px"> <tr> <td>Wweigh_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>FID</td> <td>0</td> </tr> <tr> <td>BAS34S_ID</td> <td>1</td> </tr> <tr bgcolor="#D4E4F3"> <td>ROUT_AREA</td> <td>28</td> </tr> <tr> <td>COUNTRY_NR</td> <td>304</td> </tr> <tr bgcolor="#D4E4F3"> <td>AVL6190_KM</td> <td>0.00002</td> </tr> <tr> <td>TOTWWD95_K</td> <td>0</td> </tr> <tr bgcolor="#D4E4F3"> <td>WTA_95</td> <td>0</td> </tr> <tr> <td>WSI_01</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_02</td> <td>0.01</td> </tr> <tr> <td>WSI_03</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_04</td> <td>0.01</td> </tr> <tr> <td>WSI_05</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_06</td> <td>0.01</td> </tr> <tr> <td>WSI_07</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_08</td> <td>0.01</td> </tr> <tr> <td>WSI_09</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_10</td> <td>0.01</td> </tr> <tr> <td>WSI_11</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_12</td> <td>0.01</td> </tr> <tr> <td>Avg_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>GeoM_mWSI</td> <td>0.01</td> </tr> <tr> <td>orig_WSI</td> <td>0.01</td> </tr> </table> </td> </tr> </table> </body> </html>"""
# Parse the markup with the pure-Python 'html.parser' backend.
soup = BeautifulSoup(html, "html.parser")
# Find the <td> whose text is exactly the label, then take the next <td> in
# document order, which holds the value. `string=` and `find_next` are the
# current spellings of the legacy `text=` keyword and `findNext` alias.
# NOTE: find() returns None when no cell matches, so a missing label raises
# AttributeError here.
print(soup.find('td', string='WSI_05').find_next('td').text)
# --> 0.01
答案 3 :(得分:1)
# Collect every (label, value) pair: two consecutive <td> cells whose contents
# contain no nested tags, separated only by text that contains no '<'.
# print() as a function call works on Python 3 (and on Python 2 for a single
# argument); the pattern is a raw string by regex convention.
print(re.findall(r'<td>([^<]*)</td>[^<]*<td>([^<]*)</td>', data))
输出:
[('Wweigh_mWSI', '0.01'), ('FID', '0'), ('BAS34S_ID', '1'), ('ROUT_AREA', '28'), ('COUNTRY_NR', '304'), ('AVL6190_KM', '0.00002'), ('TOTWWD95_K', '0'), ('WTA_95', '0'), ('WSI_01', '0.01'), ('WSI_02', '0.01'), ('WSI_03', '0.01'), ('WSI_04', '0.01'), ('WSI_05', '0.01'), ('WSI_06', '0.01'), ('WSI_07', '0.01'), ('WSI_08', '0.01'), ('WSI_09', '0.01'), ('WSI_10', '0.01'), ('WSI_11', '0.01'), ('WSI_12', '0.01'), ('Avg_mWSI', '0.01'), ('GeoM_mWSI', '0.01'), ('orig_WSI', '0.01')]