如何使用BeautifulSoup排除表格中的某些行?

时间:2019-06-13 03:16:30

标签: python web-scraping beautifulsoup

我已经从表格中获得了我想要的数据,但是我不希望各个球员统计信息(Rk,Pos,Name等)之间出现重复的列名缩写。如何在保留所需数据的同时排除那些行?包含缩写的行带有“thead”类,但我无法弄清楚如何利用该信息来跳过它们。我知道每个球员的数据都被压缩在了一起,但是现在我只专注于先打印出我想要的数据,之后再把它们分开。

import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

def main():
    """Fetch the Brewers' 2019 team-batting page and print the text of every
    table row — including the repeated in-body header rows, which is the
    behavior this question asks how to avoid."""
    # Url to scrape
    my_url = "https://www.baseball-reference.com/teams/MIL/2019.shtml"
    # Opening url
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    # Parsing page
    page_soup = soup(page_html, "html.parser")
    # The batting stats table lives inside this wrapper div.
    container = page_soup.find("div", id="div_team_batting")
    # find_all("tr") returns every row, so the "thead"-classed header rows
    # that repeat inside the body get printed along with the player rows.
    for player in container.find_all("tr"):
        print(player.text)

输出:

Rk
Pos
Name
Age
G
PA
AB
R
H
2B
3B
HR
RBI
SB
CS
BB
SO
BA
OBP
SLG
OPS
OPS+
TB
GDP
HBP
SH
SF
IBB

1CYasmani Grandal#30602442063258821436303450.282.381.544.92513811231031
21BJesus Aguilar2958183153132950423002343.190.294.301.595574681030
32BMike Moustakas*306026023644651402043302048.275.342.589.93113713984000
4SSOrlando Arcia24662502262958101828332350.257.324.416.740929470013
53BTravis Shaw*2945174150152650511002255.173.287.307.594564652002
6LFRyan Braun35612262092757901036411451.273.324.459.7841029652000
7CFLorenzo Cain33632882614166190427632145.253.313.372.684799773030
8RFChristian Yelich*2760263217527511225541413945.346.449.7601.209208165540312

Rk
Pos
Name
Age
G
PA
AB
R
H
2B
3B
HR
RBI
SB
CS
BB
SO
BA
OBP
SLG
OPS
OPS+
TB
GDP
HBP
SH
SF
IBB

91BEric Thames*3260172141273560826112760.248.378.461.8391196503011
10LFBen Gamel*2759169149233990416111649.262.343.403.746956003012
112BHernan Perez284812711918326051041835.269.315.445.760965340000
122BKeston Hiura (40-man)22176964818105910323.281.333.531.8651213402000
13CManny Pina3229625548202400516.145.242.291.533391612000
14UTJacob Nottingham (40-man)246551200140002.400.4001.0001.400247500000
15LFTyler Saladino (40-man)292220000000001.000.000.000.000-100000000

Rk
Pos
Name
Age
G
PA
AB
R
H
2B
3B
HR
RBI
SB
CS
BB
SO
BA
OBP
SLG
OPS
OPS+
TB
GDP
HBP
SH
SF
IBB

16PBrandon Woodruff*26163331110300400113.323.344.419.763991300100
17PZach Davies2614282713100100114.111.143.148.291-23400000
18PJhoulys Chacin (10-day IL)311116152300110016.200.250.400.65066600000
19PChase Anderson3111141400000000011.000.000.000.000-100000000
20PFreddy Peralta231013121200010005.167.167.167.333-12200100
21PGio Gonzalez (10-day IL)335970000000016.000.125.000.125-62000100
22PCorbin Burnes2417870200000002.286.286.286.57151200100
23PAdrian Houser2611220000000002.000.000.000.000-100000000
24PJunior Guerra3424220000000002.000.000.000.000-100000000
25PJosh Hader*2524110000000000.000.000.000.000-100000000
26PAaron Wilkerson (40-man)3031111001200001.0001.0004.0005.0001109400000
27PTaylor Williams# (40-man)277110000000001.000.000.000.000-100000000
28PBurch Smith (40-man)293110000000001.000.000.000.000-100000000
29PDonnie Hart* (40-man)284110000000000.000.000.000.000-100000000
30PJeremy Jeffress3119000000000000000000
31PAlex Claudio*2732000000000000000000
32PJay Jackson313000000000000000000
33PJimmy Nelson301000000000000000000
34PAlex Wilson3211000000000000000000
35PJake Petricka315000000000000000000
36PMatt Albers*3624000000000000000000
37PJacob Barnes2916000000000000000000
Team
Totals29.3672624231534058910951173364011259636.254.334.457.7921051059532741521
Rank in 15 NL teams26610121153146333288
Non-Pitcher     Totals29.5672494219333456810551153274011255573.259.341.469.8101091028532701521
Pitcher Totals27.067130122621402900463.172.198.254.453173100400

Rk
Pos
Name
Age
G
PA
AB
R
H
2B
3B
HR
RBI
SB
CS
BB
SO
BA
OBP
SLG
OPS
OPS+
TB
GDP
HBP
SH
SF
IBB

2 个答案:

答案 0 :(得分:1)

与代码保持一致,将选择器更改为使用 :has(需要 bs4 4.7.1 及以上版本)。检查行中是否包含 td 元素,因为这样将仅排除 thead(表头)行。我还改为通过 ID 直接选择表格。

import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

def main():
    """Print one space-joined line per player row of the team-batting table.

    Rows are selected with ``tr:has(td)`` (bs4 >= 4.7.1), which keeps only
    rows containing at least one <td> cell and thereby skips the repeated
    in-body header rows, whose cells are all <th>.
    """
    url = "https://www.baseball-reference.com/teams/MIL/2019.shtml"
    # Fetch the raw HTML.
    client = uReq(url)
    markup = client.read()
    client.close()
    # Parse it.
    doc = soup(markup, "html.parser")
    # For a single header line first, uncomment these two lines:
    # labels = [th.text for th in doc.select("#team_batting tr:nth-of-type(1) th")][:-2]
    # print(labels)
    for row in doc.select("#team_batting tr:has(td)"):
        # Joining gives tidy line-based output; loop over the cells
        # individually instead if you want one value per line.
        cells = [cell.text for cell in row.select('th, td')]
        print(' '.join(cells))


main()

附带说明:您可以使用 pandas 抓取页面上的表格,然后按索引从返回的列表中取出关注的那张表,得到格式良好的结果:

import pandas as pd

# read_html parses every <table> on the page and returns them as a list of
# DataFrames; index into the list to pick the table of interest.
tables  = pd.read_html('https://www.baseball-reference.com/teams/MIL/2019.shtml')
# e.g. — presumably index 10 is the batting table; verify against the page.
print(tables[10])

如果您想构建自己的数据框:

import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd

def main():
    """Scrape the team-batting table into a pandas DataFrame and print it.

    Column names come from the first header row (last two cells dropped);
    data rows are those containing at least one <td>, selected via the
    ``:has`` pseudo-class (bs4 >= 4.7.1).
    """
    url = "https://www.baseball-reference.com/teams/MIL/2019.shtml"
    # Download and parse the page.
    client = uReq(url)
    html = client.read()
    client.close()
    doc = soup(html, "html.parser")
    # Header labels, trimming the trailing two non-stat cells.
    header_cells = doc.select("#team_batting tr:nth-of-type(1) th")
    columns = [cell.text for cell in header_cells][:-2]
    # One list of cell texts per player row.
    rows = [
        [cell.text for cell in tr.select('th, td')]
        for tr in doc.select("#team_batting tr:has(td)")
    ]
    print(pd.DataFrame(rows, columns=columns))


main()

答案 1 :(得分:1)

Python 字符串的 strip() 方法用于删除字符串开头和结尾的所有空白字符。

import requests
from bs4 import BeautifulSoup

def main():
    """Scrape each player's batting stats into a list of dicts and print it.

    Rows whose class list contains "thead" are the repeated in-body header
    rows and are skipped. Each data cell is keyed by its ``data-stat``
    attribute (inspect the table's <td> elements to see the available keys).
    """
    my_url = "https://www.baseball-reference.com/teams/MIL/2019.shtml"
    response = requests.get(my_url)
    page_soup = BeautifulSoup(response.text, "html.parser")
    container = page_soup.find("table", id="team_batting")
    tbody = container.find("tbody")
    data = []

    for tr in tbody.find_all("tr"):
        # Skip the table-body header rows. A membership test is safer than
        # the original `"thead" == _class[0]`, which breaks whenever "thead"
        # is not the FIRST class on the row; `or []` covers rows with no
        # class attribute at all.
        if "thead" in (tr.get("class") or []):
            continue

        # Scrape player details: map data-stat -> whitespace-stripped text.
        player = {td['data-stat']: td.text.strip() for td in tr.find_all("td")}

        # To keep only specific stats, filter on the data-stat key, e.g.:
        #   if td['data-stat'] == "age": player['age'] = td.text.strip()

        # Rows without any <td> cells would yield an empty dict; the original
        # appended those too — drop them so `data` holds only real players.
        if player:
            data.append(player)

    print(data)


main()