How do I make the rows correspond to the table on the Wikipedia page when a row has a rowspan element?
from bs4 import BeautifulSoup
import urllib2
from lxml.html import fromstring
import re
import csv
import pandas as pd
wiki = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki,headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
try:
    table = soup.find_all('table')[6]
except AttributeError as e:
    print 'No tables found, exiting'

try:
    first = table.find_all('tr')[0]
except AttributeError as e:
    print 'No table row found, exiting'

try:
    allRows = table.find_all('tr')[1:-1]
except AttributeError as e:
    print 'No table row found, exiting'
headers = [header.get_text() for header in first.find_all(['th', 'td'])]
results = [[data.get_text() for data in row.find_all(['th', 'td'])] for row in allRows]
df = pd.DataFrame(data=results, columns=headers)
df
I get the table as output, but for tables whose rows contain rowspan I get output like the following -
Answer 0 (score: 3)
None of the parsers found on stackoverflow or around the web worked for me - they all parsed my tables from Wikipedia incorrectly. So here you go, a parser that actually works and is simple. Cheers.

Define the parser functions:
import numpy as np
import pandas as pd

def pre_process_table(table):
    """
    INPUT:
        1. table - a bs4 element that contains the desired table: ie <table> ... </table>
    OUTPUT:
        a tuple of:
            1. rows - a list of table rows ie: list of <tr>...</tr> elements
            2. num_rows - number of rows in the table
            3. num_cols - number of columns in the table
    """
    rows = [x for x in table.find_all('tr')]
    num_rows = len(rows)

    # get an initial column count. Most often, this will be accurate
    num_cols = max([len(x.find_all(['th', 'td'])) for x in rows])

    # sometimes, the tables also contain multi-colspan headers. This accounts for that:
    header_rows_set = [x.find_all(['th', 'td']) for x in rows if len(x.find_all(['th', 'td'])) > num_cols/2]

    num_cols_set = []
    for header_rows in header_rows_set:
        num_cols = 0
        for cell in header_rows:
            row_span, col_span = get_spans(cell)
            num_cols += col_span  # each cell counts for col_span columns

        num_cols_set.append(num_cols)

    num_cols = max(num_cols_set)

    return (rows, num_rows, num_cols)
def get_spans(cell):
    """
    INPUT:
        1. cell - a <td>...</td> or <th>...</th> element that contains a table cell entry
    OUTPUT:
        1. a tuple with the cell's row and col spans
    """
    if cell.has_attr('rowspan'):
        rep_row = int(cell.attrs['rowspan'])
    else:  # ~cell.has_attr('rowspan')
        rep_row = 1
    if cell.has_attr('colspan'):
        rep_col = int(cell.attrs['colspan'])
    else:  # ~cell.has_attr('colspan')
        rep_col = 1

    return (rep_row, rep_col)
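As a quick sanity check of get_spans (a hypothetical snippet, not part of the original answer), you can feed it a standalone cell:

from bs4 import BeautifulSoup

# hypothetical smoke test: a cell spanning 2 rows and 3 columns
cell = BeautifulSoup('<td rowspan="2" colspan="3">x</td>', 'html.parser').td
print(get_spans(cell))  # (2, 3)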
def process_rows(rows, num_rows, num_cols):
    """
    INPUT:
        1. rows - a list of table rows ie <tr>...</tr> elements
    OUTPUT:
        1. data - a Pandas dataframe with the html data in it
    """
    data = pd.DataFrame(np.ones((num_rows, num_cols))*np.nan)
    for i, row in enumerate(rows):
        try:
            col_stat = data.iloc[i,:][data.iloc[i,:].isnull()].index[0]
        except IndexError:
            print(i, row)

        for j, cell in enumerate(row.find_all(['td', 'th'])):
            rep_row, rep_col = get_spans(cell)

            #print("cols {0} to {1} with rep_col={2}".format(col_stat, col_stat+rep_col, rep_col))
            #print("\trows {0} to {1} with rep_row={2}".format(i, i+rep_row, rep_row))

            #find first non-na col and fill that one
            while any(data.iloc[i,col_stat:col_stat+rep_col].notnull()):
                col_stat += 1

            data.iloc[i:i+rep_row,col_stat:col_stat+rep_col] = cell.getText()
            if col_stat < data.shape[1]-1:
                col_stat += rep_col

    return data
def main(table):
    rows, num_rows, num_cols = pre_process_table(table)
    df = process_rows(rows, num_rows, num_cols)
    return(df)
Here is an example of how to use the above code on this Wisconsin data. Suppose it is already in a bs4 soup...
## Find tables on the page and locate the desired one:
tables = soup.findAll("table", class_='wikitable')
## I want table 3 or the one that contains years 2000-2018
table = tables[3]
## run the above functions to extract the data
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)
The parser above will accurately parse tables such as the ones here, while all the others fail to recreate the tables at numerous points.

If you have a nicely formatted table with rowspan attributes, the problem above may have a simpler solution. Pandas has a fairly robust read_html function that can parse the provided html tables and seems to handle rowspan well (it couldn't parse the Wisconsin stuff). fillna(method='ffill') can then populate the unfilled rows. Note that this does not necessarily work across column spans. Also note that cleanup will be necessary afterwards.
Consider this html code:
s = """<table width="100%" border="1">
<tr>
<td rowspan="1">one</td>
<td rowspan="2">two</td>
<td rowspan="3">three</td>
</tr>
<tr><td>"4"</td></tr>
<tr>
<td>"55"</td>
<td>"99"</td>
</tr>
</table>
"""
In order to process it into the requested output, just do:
In [16]: df = pd.read_html(s)[0]
In [29]: df
Out[29]:
      0     1      2
0   one   two  three
1   "4"   NaN    NaN
2  "55"  "99"    NaN
Then fill the NAs,
In [30]: df.fillna(method='ffill')
Out[30]:
      0     1      2
0   one   two  three
1   "4"   two  three
2  "55"  "99"  three
Answer 1 (score: 2)
The problem is caused by the following case in the html content, as you know:
<tr>
<td rowspan="2">2=</td>
<td>West Indies</td>
<td>4</td>
<td>Lord's</td>
<td>2009</td>
</tr>
<tr>
<td style="text-align:left;">India</td>
<td>4</td>
<td>Mumbai</td>
<td>2012</td>
</tr>
So when a td has a rowspan attribute, treat the same td value as repeated in the next tr at the same position; the rowspan value tells you how many tr tags it covers.

1. Grab all the rowspan information and save it in a variable: the sequence number of the tr tag, the sequence number of the td tag within it, the rowspan value (i.e. how many tr tags the cell spans), and the text value of the td.
2. Insert those values into results according to the information above.

Note: only the given test case has been checked; more test cases need to be checked.
Code:
from bs4 import BeautifulSoup
import urllib2
import pandas as pd

wiki = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)

table = soup.find_all('table')[6]
tmp = table.find_all('tr')
first = tmp[0]
allRows = tmp[1:-1]

headers = [header.get_text() for header in first.find_all('th')]
results = [[data.get_text() for data in row.find_all('td')] for row in allRows]

#<td rowspan="2">2=</td>
# list of tuples: (index of tr, index of td, rowspan count, text value)
# e.g. [(1, 0, 2, u'2=')]
# (<tr> index is 1, td sequence in tr is 0, repeated 2 times, value is u'2=')
rowspan = []
for no, tr in enumerate(allRows):
    for td_no, data in enumerate(tr.find_all('td')):
        if data.has_key("rowspan"):
            rowspan.append((no, td_no, int(data["rowspan"]), data.get_text()))

if rowspan:
    for i in rowspan:
        # the rowspan count says how many of the following tr rows to patch
        for j in xrange(1, i[2]):
            # add the cell's value to the next tr at the same td position
            results[i[0]+j].insert(i[1], i[3])

df = pd.DataFrame(data=results, columns=headers)
print df
Output:
   Rank      Opponent  No. wins  Most recent venue  Season
0     1  South Africa         6             Lord's    1951
1    2=   West Indies         4             Lord's    2009
2    2=         India         4             Mumbai    2012
3     4     Australia         3             Sydney    1932
4     5      Pakistan         2       Trent Bridge    1967
5     6     Sri Lanka         1       Old Trafford    2002
It also works on table 10:
   Rank  Hundreds            Player  Matches  Innings  Average
0     1        25     Alastair Cook      107      191    45.61
1     2        23   Kevin Pietersen      104      181    47.28
2     3        22     Colin Cowdrey      114      188    44.07
3     3        22     Wally Hammond       85      140    58.46
4     3        22  Geoffrey Boycott      108      193    47.72
5     6        21    Andrew Strauss      100      178    40.91
6     6        21          Ian Bell      103      178    45.30
7    8=        20    Ken Barrington       82      131    58.67
8    8=        20      Graham Gooch      118      215    42.58
9    10        19        Len Hutton       79      138    56.67
Answer 2 (score: 2)
pandas >= 0.24.0 understands the colspan and rowspan attributes, as documented in the release notes. To extract the wikipage table that was giving you problems before, the following works.
import pandas as pd
# Extract all tables from the wikipage
dfs = pd.read_html("http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records")
# The table referenced above is the 7th on the wikipage
df = dfs[6]
# The last row is just the date of the last update
df = df.iloc[:-1]
Out:
   Rank  Victories    Opposition                                 Most recent venue              Date
0     1          6  South Africa                           Lord's, London, England      21 June 1951
1    =2          4         India                   Wankhede Stadium, Mumbai, India  23 November 2012
2    =2          4   West Indies                           Lord's, London, England        6 May 2009
3     4          3     Australia          Sydney Cricket Ground, Sydney, Australia   2 December 1932
4     5          2      Pakistan                 Trent Bridge, Nottingham, England    10 August 1967
5     6          1     Sri Lanka  Old Trafford Cricket Ground, Manchester, England      13 June 2002
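From there the usual cleanup applies; as a hypothetical follow-up (not part of the original answer), the Date column can be parsed into real timestamps:

# Hypothetical cleanup step: parse date strings such as "21 June 1951"
df['Date'] = pd.to_datetime(df['Date'])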
Answer 3 (score: 1)
Input:
<html>
<body>
<table width="100%" border="1">
<tr>
<td rowspan="2">one</td>
<td>two</td>
<td>three</td>
</tr>
<tr>
<td colspan="2">February</td>
</tr>
</table>
</body>
</html>
Output:
one two three
one February February
Python code:
# !/bin/python3
# coding: utf-8
from bs4 import BeautifulSoup


class Element(object):
    def __init__(self, row, col, text, rowspan=1, colspan=1):
        self.row = row
        self.col = col
        self.text = text
        self.rowspan = rowspan
        self.colspan = colspan

    def __repr__(self):
        return f'''{{"row": {self.row}, "col": {self.col}, "text": {self.text}, "rowspan": {self.rowspan}, "colspan": {self.colspan}}}'''

    def isRowspan(self):
        return self.rowspan > 1

    def isColspan(self):
        return self.colspan > 1


def parse(h) -> [[]]:
    doc = BeautifulSoup(h, 'html.parser')
    trs = doc.select('tr')

    m = []
    for row, tr in enumerate(trs):  # collect Node, rowspan node, colspan node
        it = []
        ts = tr.find_all(['th', 'td'])
        for col, tx in enumerate(ts):
            element = Element(row, col, tx.text.strip())
            if tx.has_attr('rowspan'):
                element.rowspan = int(tx['rowspan'])
            if tx.has_attr('colspan'):
                element.colspan = int(tx['colspan'])
            it.append(element)
        m.append(it)

    def solveColspan(ele):
        row, col, text, rowspan, colspan = ele.row, ele.col, ele.text, ele.rowspan, ele.colspan
        m[row].insert(col + 1, Element(row, col, text, rowspan, colspan - 1))
        for column in range(col + 1, len(m[row])):
            m[row][column].col += 1

    def solveRowspan(ele):
        row, col, text, rowspan, colspan = ele.row, ele.col, ele.text, ele.rowspan, ele.colspan
        offset = row + 1
        m[offset].insert(col, Element(offset, col, text, rowspan - 1, 1))
        for column in range(col + 1, len(m[offset])):
            m[offset][column].col += 1

    for row in m:
        for ele in row:
            if ele.isColspan():
                solveColspan(ele)
            if ele.isRowspan():
                solveRowspan(ele)
    return m


def prettyPrint(m):
    for i in m:
        it = [f'{len(i)}']
        for index, j in enumerate(i):
            if j.text != '':
                it.append(f'{index:2} {j.text[:4]:4}')
        print(' --- '.join(it))


with open('./index.html', 'rb') as f:
    index = f.read()
    html = index.decode('utf-8')
    matrix = parse(html)
    prettyPrint(matrix)
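To get from the Element matrix back to a DataFrame like the other answers produce, one possible finishing step (an assumption on my part, not in the original answer) is:

import pandas as pd

# parse() has already expanded the spans, so for well-formed tables every
# row of the matrix has the same logical width.
df = pd.DataFrame([[ele.text for ele in row] for row in matrix])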