
时间:2015-02-27 11:21:08

标签: python html pandas beautifulsoup


from bs4 import BeautifulSoup
import urllib2
from lxml.html import fromstring 
import re
import csv
import pandas as pd

wiki = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki,headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)

# find_all() returns a list, so a missing table/row surfaces as IndexError
# when the list is indexed. AttributeError is still caught because the
# original guards named it. Each failure really exits, as the message says.
try:
    table = soup.find_all('table')[6]
except (AttributeError, IndexError):
    print('No tables found, exiting')
    raise SystemExit(1)

try:
    first = table.find_all('tr')[0]
except (AttributeError, IndexError):
    print('No table row found, exiting')
    raise SystemExit(1)

try:
    # Skip the header row and the trailing row of the table.
    allRows = table.find_all('tr')[1:-1]
except (AttributeError, IndexError):
    print('No table row found, exiting')
    raise SystemExit(1)

# The loop variable is named `cell` so that it cannot overwrite the
# request-header dict `header` (Python 2 list comprehensions leak their
# loop variable into the enclosing scope).
headers = [cell.get_text() for cell in first.find_all(['th', 'td'])]
results = [[data.get_text() for data in row.find_all(['th', 'td'])] for row in allRows]

# NOTE: cells carrying rowspan/colspan are not expanded here, so rows that
# sit under a spanning cell come out shorter than the header row.
df = pd.DataFrame(data=results, columns=headers)

我得到了表格作为输出,但对于某些行包含 rowspan 的表,我得到的表格如下(原文此处为一张截图):

4 个答案:

答案 0 :(得分:3)

在 Stack Overflow 或网上找到的解析器对我都不起作用——它们都错误地解析了我要从维基百科抓取的表格。所以,这里给出一个确实可用且简单的解析器。祝好。


def pre_process_table(table):
    """Collect the rows of an HTML table and work out its dimensions.

    INPUT:
        1. table - a bs4 element that contains the desired table:
           ie <table> ... </table>
    OUTPUT:
        a tuple of:
            1. rows - a list of table rows ie: list of <tr>...</tr> elements
            2. num_rows - number of rows in the table
            3. num_cols - number of columns in the table

    The column count is the widest row once every cell's colspan has been
    added up. A table without any <tr> yields ([], 0, 0).
    """
    rows = list(table.find_all('tr'))
    num_rows = len(rows)

    # Look the cells up once per row; they are needed for both passes below.
    cells_per_row = [row.find_all(['th', 'td']) for row in rows]
    if not cells_per_row:
        return (rows, 0, 0)

    # get an initial column count. Most often, this will be accurate
    num_cols = max(len(cells) for cells in cells_per_row)

    # sometimes, the tables also contain multi-colspan headers. This accounts
    # for that: only rows with more than half of the initial count are
    # re-measured with their colspans.
    header_rows_set = [cells for cells in cells_per_row if len(cells) > num_cols / 2]

    num_cols_set = []
    for header_rows in header_rows_set:
        width = 0
        for cell in header_rows:
            row_span, col_span = get_spans(cell)
            width += col_span
        num_cols_set.append(width)

    # Fall back to the plain cell count if no row qualified (e.g. rows
    # exist but none of them holds a cell).
    num_cols = max(num_cols_set, default=num_cols)

    return (rows, num_rows, num_cols)

def get_spans(cell):
    """Return the row and column span of a table cell.

    INPUT:
        1. cell - a <td>...</td> or <th>...</th> element that contains a
           table cell entry; its ``attrs`` mapping is consulted
    OUTPUT:
        1. a tuple (rep_row, rep_col) with the cell's row and col spans;
           a missing ``rowspan``/``colspan`` attribute counts as 1
    """
    rep_row = int(cell.attrs.get('rowspan', 1))
    rep_col = int(cell.attrs.get('colspan', 1))
    return (rep_row, rep_col)

def process_rows(rows, num_rows, num_cols):
    """Lay the cells of ``rows`` out on a num_rows x num_cols grid.

    INPUT:
        1. rows - a list of table rows ie <tr>...</tr> elements
        2. num_rows - number of rows of the grid
        3. num_cols - number of columns of the grid
    OUTPUT:
        1. data - a Pandas dataframe with the html data in it; a cell with
           a rowspan/colspan has its text repeated over every grid
           position it covers, positions never written stay NaN
    """
    # Object dtype: the grid is filled with strings, so starting from a
    # float NaN frame would force a dtype change on the first assignment.
    data = pd.DataFrame(index=range(num_rows), columns=range(num_cols), dtype=object)

    for i, row in enumerate(rows):
        free_cols = data.columns[data.iloc[i, :].isnull()]
        if len(free_cols) == 0:
            # Row already filled completely by rowspans from above: report
            # it (as the original did) and skip it, there is no free slot.
            print(i, row)
            continue
        col_stat = free_cols[0]

        for cell in row.find_all(['td', 'th']):
            rep_row, rep_col = get_spans(cell)

            # find first non-na col and fill that one; the loop ends at the
            # latest when the slice runs off the right edge and is empty
            while data.iloc[i, col_stat:col_stat + rep_col].notnull().any():
                col_stat += 1

            data.iloc[i:i + rep_row, col_stat:col_stat + rep_col] = cell.getText()
            if col_stat < data.shape[1] - 1:
                col_stat += rep_col

    return data

def main(table):
    """Convert a bs4 <table> element into a DataFrame and return it.

    Runs pre_process_table() to get the rows and grid size, then
    process_rows() to fill the grid.
    """
    rows, num_rows, num_cols = pre_process_table(table)
    df = process_rows(rows, num_rows, num_cols)
    return df


## Collect every table on the page that carries the 'wikitable' class
tables = soup.find_all("table", class_="wikitable")

## Pick the wanted one by position: index 3 (the one that contains years 2000-2018)
table = tables[3]

## Measure the table first, then lay its cells out in a DataFrame
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)


如果是简单案例 - 更简单的解决方案

如果这是一个格式良好且带有rowspan属性的表格,上述问题可能有一个更简单的解决方案。Pandas具有相当强大的read_html函数,可以解析提供的html表,并且似乎能很好地处理rowspan(无法解析威斯康星州的东西)。然后fillna(method='ffill')可以填充未填充的行。请注意,这不一定适用于colspan。另请注意,之后需要进行清理。


    s = """<table width="100%" border="1">
      <tr>
        <td rowspan="1">one</td>
        <td rowspan="2">two</td>
        <td rowspan="3">three</td>
      </tr>
      <tr><td>"4"</td></tr>
      <tr>
        <td>"55"</td>
        <td>"99"</td>
      </tr>
    </table>
    """


In [16]: df = pd.read_html(s)[0]

In [29]: df
      0     1      2
0   one   two  three
1   "4"   NaN    NaN
2  "55"  "99"    NaN


In [30]: df.fillna(method='ffill')
      0     1      2
0   one   two  three
1   "4"   two  three
2  "55"  "99"  three

答案 1 :(得分:2)



     <td rowspan="2">2=</td>
     <td>West Indies</td>
     <td style="text-align:left;">India</td>


  1. 获取所有此类tr信息并保存在变量中。保存tr标记的序号、td标记的序号、rowspan的值(即该td跨越多少个tr)以及td的文本值。
  2. 根据上述方法更新所有tr的结果。
  3. 注意::仅检查给定的测试用例。需要检查更多测试用例。


    from bs4 import BeautifulSoup
    import urllib2
    from lxml.html import fromstring 
    import re
    import csv
    import pandas as pd
    wiki = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
    header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
    req = urllib2.Request(wiki,headers=header)
    page = urllib2.urlopen(req)
    soup = BeautifulSoup(page)
    table = soup.find_all('table')[6]
    tmp = table.find_all('tr')
    first = tmp[0]
    # Data rows: everything between the header row and the trailing row.
    allRows = tmp[1:-1]
    # `cell` rather than `header` as loop variable, so the request-header
    # dict above is not overwritten (Python 2 comprehensions leak the name).
    headers = [cell.get_text() for cell in first.find_all('th')]
    results = [[data.get_text() for data in row.find_all('td')] for row in allRows]
    #<td rowspan="2">2=</td>
    # list of tuple (Level of tr, Level of td, total Count, Text Value)
    #[(1, 0, 2, u'2=')]
    # (<tr> is 1 , td sequence in tr is 0, repeated 2 times , value is 2=)
    rowspan = []
    for no, tr in enumerate(allRows):
        for td_no, data in enumerate(tr.find_all('td')):
            # has_attr() is the supported spelling; has_key() is deprecated
            # in BeautifulSoup 4.
            if data.has_attr("rowspan"):
                rowspan.append((no, td_no, int(data["rowspan"]), data.get_text()))
    for tr_no, td_no, span, text in rowspan:
        # The spanning cell is already present in its own row (results[tr_no]);
        # copy its text into each following row that it covers.
        for j in range(1, span):
            # A span may reach into the trailing row that was sliced off.
            if tr_no + j < len(results):
                results[tr_no + j].insert(td_no, text)
    df = pd.DataFrame(data=results, columns=headers)
    print(df)


      Rank       Opponent No. wins Most recent venue Season
    0    1   South Africa        6            Lord's   1951
    1   2=    West Indies        4            Lord's   2009
    2   2=          India        4            Mumbai   2012
    3    4      Australia        3            Sydney   1932
    4    5       Pakistan        2      Trent Bridge   1967
    5    6      Sri Lanka        1      Old Trafford   2002


      Rank Hundreds            Player Matches Innings Average
    0    1       25     Alastair Cook     107     191   45.61
    1    2       23   Kevin Pietersen     104     181   47.28
    2    3       22     Colin Cowdrey     114     188   44.07
    3    3       22     Wally Hammond      85     140   58.46
    4    3       22  Geoffrey Boycott     108     193   47.72
    5    6       21    Andrew Strauss     100     178   40.91
    6    6       21          Ian Bell     103     178   45.30
    7   8=       20    Ken Barrington      82     131   58.67
    8   8=       20      Graham Gooch     118     215   42.58
    9   10       19        Len Hutton      79     138   56.67

答案 2 :(得分:2)

pandas >= 0.24.0 可以理解 colspan 和 rowspan 属性(如其发行说明(release notes)中所述)。要提取之前给您带来问题的维基页面表格,请执行以下操作。

import pandas as pd

# Read every table on the wiki page into a list of DataFrames
dfs = pd.read_html("http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records")
# Index 6 is the 7th table on the page, the one referenced above; its last
# row is just the date of the last update, so it is dropped right away
df = dfs[6].iloc[:-1]


   Rank  Victories    Opposition                                 Most recent venue              Date
0     1          6  South Africa                           Lord's, London, England      21 June 1951
1    =2          4         India                   Wankhede Stadium, Mumbai, India  23 November 2012
2    =2          4   West Indies                           Lord's, London, England        6 May 2009
3     4          3     Australia          Sydney Cricket Ground, Sydney, Australia   2 December 1932
4     5          2      Pakistan                 Trent Bridge, Nottingham, England    10 August 1967
5     6          1     Sri Lanka  Old Trafford Cricket Ground, Manchester, England      13 June 2002

答案 3 :(得分:1)



<table width="100%" border="1">
        <td rowspan="2">one</td>
        <td colspan="2">February</td>



one  two         three
one  February    February


# !/bin/python3
# coding: utf-8
from bs4 import BeautifulSoup

class Element(object):
    """A single table cell: its grid position, its text and its spans."""

    def __init__(self, row, col, text, rowspan=1, colspan=1):
        # Position of the cell in the grid.
        self.row, self.col = row, col
        # Cell content.
        self.text = text
        # Number of rows / columns the cell covers; 1 means "no span".
        self.rowspan, self.colspan = rowspan, colspan

    def __repr__(self):
        # JSON-like dump of all five attributes (values are not quoted).
        return f'''{{"row": {self.row}, "col":  {self.col}, "text": {self.text}, "rowspan": {self.rowspan}, "colspan": {self.colspan}}}'''

    def isRowspan(self):
        """Tell whether the cell covers more than one row."""
        spans_rows = self.rowspan > 1
        return spans_rows

    def isColspan(self):
        """Tell whether the cell covers more than one column."""
        spans_cols = self.colspan > 1
        return spans_cols

def parse(h) -> [[]]:
    """Parse HTML and return its table cells as a list of rows of Element.

    Every <tr> found in ``h`` becomes one row. Cells with a colspan or
    rowspan are then expanded: a copy of the cell is inserted for each
    extra column / row it covers, so the text is repeated at every grid
    position and the following cells of the affected row are shifted right.
    """
    doc = BeautifulSoup(h, 'html.parser')

    trs = doc.select('tr')

    m = []

    for row, tr in enumerate(trs):  # collect Node, rowspan node, colspan node
        it = []
        ts = tr.find_all(['th', 'td'])
        for col, tx in enumerate(ts):
            element = Element(row, col, tx.text.strip())
            if tx.has_attr('rowspan'):
                element.rowspan = int(tx['rowspan'])
            if tx.has_attr('colspan'):
                element.colspan = int(tx['colspan'])
            it.append(element)
        m.append(it)

    def solveColspan(ele):
        # Insert one copy directly to the right, carrying the remaining
        # colspan; the copy is expanded in turn when the loop reaches it.
        row, col, text, rowspan, colspan = ele.row, ele.col, ele.text, ele.rowspan, ele.colspan
        m[row].insert(col + 1, Element(row, col, text, rowspan, colspan - 1))
        # Shift the copy and everything after it one column to the right.
        for column in range(col + 1, len(m[row])):
            m[row][column].col += 1

    def solveRowspan(ele):
        # Insert one copy into the next row, carrying the remaining rowspan;
        # the copy is expanded in turn when the loop reaches that row.
        row, col, text, rowspan, colspan = ele.row, ele.col, ele.text, ele.rowspan, ele.colspan
        offset = row + 1
        if offset >= len(m):
            # The rowspan points past the last <tr>; nothing to fill.
            return
        m[offset].insert(col, Element(offset, col, text, rowspan - 1, 1))
        for column in range(col + 1, len(m[offset])):
            m[offset][column].col += 1

    # The rows grow while they are being iterated; that is intentional, the
    # freshly inserted copies must be visited to continue the expansion.
    for row in m:
        for ele in row:
            if ele.isColspan():
                solveColspan(ele)
            if ele.isRowspan():
                solveRowspan(ele)
    return m

def prettyPrint(m):
    """Print one summary line per row of the matrix ``m``.

    Each line starts with the number of cells in the row, followed by
    "index text" for every cell whose text is not empty; the text is cut
    to four characters and padded to a width of four. The parts are joined
    with ' --- '.
    """
    for cells in m:
        labelled = [
            f'{position:2} {cell.text[:4]:4}'
            for position, cell in enumerate(cells)
            if cell.text != ''
        ]
        print(' --- '.join([f'{len(cells)}'] + labelled))

# Read the saved page as raw bytes, decode it explicitly as UTF-8 and hand
# the markup to parse().
with open('./index.html', mode='rb') as f:
    index = f.read()
html = str(index, 'utf-8')
matrix = parse(html)