I'm new to web scraping with Python, and I want to import all of the tables on the baseball-reference.com team pages using open-source code from GitHub. The problem I'm running into is that the scraper picks up the Team Batting and Team Pitching tables, but none of the other player tables. I looked at the page's HTML and the fielding table appears to be hard-coded there, so the table does seem to exist in the source. I modified the brscraper.py code slightly to add some error checking, and the only tables in the resulting dictionary are team batting and team pitching. I also tried removing all of the filtering conditions and importing the tables without any if statements, but nothing changed. I'm curious whether anyone knows why Beautiful Soup doesn't find all of the tables right away.
Here is the code I use for the web scraping; it finds the tables and headers and then zips them together.
from bs4 import BeautifulSoup
import urllib2


def removeExtraCharacters(s):
    # strip the trailing '*' / '#' markers baseball-reference appends to names
    if s.endswith('*'):
        s = s[0:-1]
    if s.endswith('#'):
        s = s[0:-1]
    return s


class BRScraper:

    def __init__(self, server_url="http://www.baseball-reference.com/"):
        self.server_url = server_url

    def parse_tables(self, resource, table_ids=None, verbose=False):
        """
        Given a resource on the baseball-reference server (should consist of
        the url after the hostname and slash), returns a dictionary keyed on
        table id containing arrays of data dictionaries keyed on the header
        columns. table_ids is a string or array of strings that can optionally
        be used to filter out which stats tables to return.
        """

        def is_parseable_table(tag):
            # currently unused; see the note above about removing the filtering conditions
            if not tag.has_attr("class"):
                return False
            return tag.name == "table" and "stats_table" in tag["class"] and "sortable" in tag["class"]

        def is_parseable_row(tag):
            if not tag.name == "tr":
                return False
            if not tag.has_attr("class"):
                return True  # permissive
            return "league_average_table" not in tag["class"] and "stat_total" not in tag["class"]

        if isinstance(table_ids, str):
            table_ids = [table_ids]

        soup = BeautifulSoup(urllib2.urlopen(self.server_url + resource), "lxml")
        tables = soup.find_all("table")
        data = {}

        # Read through each table, using the headers as dictionary keys
        for table in tables:
            if table_ids is not None and table["id"] not in table_ids:
                continue
            if verbose:
                print "Processing table " + table["id"]
            data[table["id"]] = []
            headers = table.find("thead").find_all("th")
            header_names = []
            for header in headers:
                if header.string is None:
                    base_header_name = ""
                else:
                    base_header_name = header.string.strip()
                # de-duplicate repeated header names by appending _1, _2, ...
                if base_header_name in header_names:
                    i = 1
                    header_name = base_header_name + "_" + str(i)
                    while header_name in header_names:
                        i += 1
                        header_name = base_header_name + "_" + str(i)
                    if verbose:
                        if base_header_name == "":
                            print "Empty header relabeled as %s" % header_name
                        else:
                            print "Header %s relabeled as %s" % (base_header_name, header_name)
                else:
                    header_name = base_header_name
                header_names.append(header_name)
            # drop the leading rank ("Rk") header; row cells are read from <td> tags only
            header_names.pop(0)
            rows = table.find("tbody").find_all(is_parseable_row)
            for row in rows:
                entries = row.find_all("td")
                entry_data = []
                for entry in entries:
                    entry_data.append(removeExtraCharacters(entry.text.strip()))
                if len(entry_data) > 0:
                    data[table["id"]].append(dict(zip(header_names, entry_data)))
        return data
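A quick check along these lines (a sketch, not the exact error-checking code I ran) shows the problem: only the two team tables ever come back, and nothing for fielding.

    scraper = BRScraper()
    # prints only ['team_batting', 'team_pitching']; no fielding table appears
    print scraper.parse_tables("teams/ARI/2016.shtml", verbose=True).keys()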
Here is my implementation:
import brscraper


def encodeAscii(data_table):
    # convert every unicode cell in a rectangular 2D table to a plain ascii str
    for i in range(len(data_table)):
        for j in range(len(data_table[0])):
            if data_table[i][j] is not None:
                data_table[i][j] = data_table[i][j].encode("ascii")


if __name__ == "__main__":
    scraper = brscraper.BRScraper()
    resources = [["teams/ARI/2016.shtml", "teams/ATL/2016.shtml",
                  "teams/BAL/2016.shtml", "teams/BOS/2016.shtml", "teams/CHC/2016.shtml",
                  "teams/CHW/2016.shtml", "teams/CIN/2016.shtml", "teams/CLE/2016.shtml",
                  "teams/COL/2016.shtml", "teams/DET/2016.shtml", "teams/HOU/2016.shtml",
                  "teams/KCR/2016.shtml", "teams/LAA/2016.shtml", "teams/LAD/2016.shtml",
                  "teams/MIA/2016.shtml", "teams/MIL/2016.shtml", "teams/MIN/2016.shtml",
                  "teams/NYM/2016.shtml", "teams/NYY/2016.shtml", "teams/OAK/2016.shtml",
                  "teams/PHI/2016.shtml", "teams/PIT/2016.shtml", "teams/SDP/2016.shtml",
                  "teams/SFG/2016.shtml", "teams/SEA/2016.shtml", "teams/STL/2016.shtml",
                  "teams/TBR/2016.shtml", "teams/TEX/2016.shtml", "teams/TOR/2016.shtml",
                  "teams/WSN/2016.shtml"]]
    teams = {'ARI': 'Arizona Diamondbacks', 'ATL': 'Atlanta Braves',
             'BOS': 'Boston Red Sox', 'CHC': 'Chicago Cubs',
             'CHW': 'Chicago White Sox', 'CIN': 'Cincinnati Reds',
             'CLE': 'Cleveland Indians', 'COL': 'Colorado Rockies',
             'DET': 'Detroit Tigers', 'HOU': 'Houston Astros',
             'KCR': 'Kansas City Royals', 'LAA': 'Los Angeles Angels of Anaheim',
             'LAD': 'Los Angeles Dodgers', 'MIA': 'Miami Marlins',
             'MIL': 'Milwaukee Brewers', 'MIN': 'Minnesota Twins',
             'NYM': 'New York Mets', 'NYY': 'New York Yankees',
             'OAK': 'Oakland Athletics', 'PHI': 'Philadelphia Phillies',
             'PIT': 'Pittsburgh Pirates', 'SDP': 'San Diego Padres',
             'SFG': 'San Francisco Giants', 'SEA': 'Seattle Mariners',
             'STL': 'St. Louis Cardinals', 'TBR': 'Tampa Bay Rays',
             'TEX': 'Texas Rangers', 'TOR': 'Toronto Blue Jays',
             'WSN': 'Washington Nationals', 'BAL': 'Baltimore Orioles'}

    # all_orders contains information about the order of the data
    # all_orders is what the headers of our tables should be
    # all_data is the data itself
    # all_data and all_orders will both have the same keys
    # So all_data['batting'] = the 2D array of batting data
    # all_orders['batting'] = the headers for each column of batting data
    all_data = {}
    all_orders = {}

    team_batting_list = ["name", "team_name", "at_bats", "hits", "homeruns",
                         "runs_batted_in", "stolen_bases"]
    team_pitching_list = ["name", "team_name", "wins", "losses", "saves",
                          "innings_pitched", "earned_run_average", "strikeouts"]
    team_fielding_list = ["name", "team_name", "position", "games_played",
                          "put_outs", "assists", "errors", "fielding_percentage"]

    team_batting = []
    team_pitching = []
    team_fielding = []

    for team_name in resources[0]:
        print team_name
        data = scraper.parse_tables(team_name)
        print data.keys()

        ###########
        # BATTING #
        ###########
        if "team_batting" in data.keys():
            for row in data["team_batting"]:
                team_batting.append([])
                team_batting[-1].append(row["Name"])
                # team_name[6:9] is the three-letter team code in the resource path
                team_batting[-1].append(teams[team_name[6:9]])
                team_batting[-1].append(row["AB"])
                team_batting[-1].append(row["H"])
                team_batting[-1].append(row["HR"])
                team_batting[-1].append(row["RBI"])
                team_batting[-1].append(row["SB"])

        ############
        # PITCHING #
        ############
        if "team_pitching" in data.keys():
            for row in data["team_pitching"]:
                team_pitching.append([])
                team_pitching[-1].append(row["Name"])
                team_pitching[-1].append(teams[team_name[6:9]])
                team_pitching[-1].append(row["W"])
                team_pitching[-1].append(row["L"])
                team_pitching[-1].append(row["SV"])
                team_pitching[-1].append(row["IP"])  # innings pitched; was missing, but team_pitching_list expects it
                team_pitching[-1].append(row["ERA"])
                team_pitching[-1].append(row["SO"])

        ############
        # FIELDING #
        ############
        if "team_fielding" in data.keys():
            for row in data["team_fielding"]:
                team_fielding.append([])
                team_fielding[-1].append(row["Name"])
                team_fielding[-1].append(teams[team_name[6:9]])
                team_fielding[-1].append(row["Pos"])
                team_fielding[-1].append(row["G"])
                team_fielding[-1].append(row["PO"])
                team_fielding[-1].append(row["A"])
                team_fielding[-1].append(row["E"])
                team_fielding[-1].append(row["Fld%"])

    encodeAscii(team_batting)
    encodeAscii(team_pitching)
    encodeAscii(team_fielding)

    all_data['pitching'] = team_pitching
    all_orders['pitching'] = team_pitching_list
    all_data['batting'] = team_batting
    all_orders['batting'] = team_batting_list
    all_data['fielding'] = team_fielding
    all_orders['fielding'] = team_fielding_list

    print all_data['fielding']
Answer 0 (score: 0)
This will output the tables as lists of lists that you can format however you need; headers are not included in this output:
from bs4 import BeautifulSoup
import urllib2

r = urllib2.urlopen('http://www.baseball-reference.com/teams/ARI/2016.shtml')
# one list per <tr> row, one string per <td> cell
raw_data = [[cell.text for cell in row('td')] for row in BeautifulSoup(r, 'html.parser')('tr')]
>>> [[],
[u'C',
u'Welington Castillo',
u'29',
u'113',
u'457',
u'416',
u'41',
u'110',
u'24',
u'0',
u'14',
u'68',
u'2',
u'0',
u'33',
u'121',
u'.264',
u'.322',
u'.423',
u'.745',
u'93',
u'176',
u'5',
u'4',
u'0',
u'4',
u'3'],
...]
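If you also want the column headers, a similar comprehension over the <th> cells works. A minimal sketch, assuming every stats table in the live HTML wraps its header row in a <thead> (as the baseball-reference tables do):

    soup = BeautifulSoup(urllib2.urlopen('http://www.baseball-reference.com/teams/ARI/2016.shtml'), 'html.parser')
    # one list of header names per table; tables without a <thead> are skipped
    header_rows = [[cell.text for cell in table.thead('th')] for table in soup('table') if table.thead]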
Answer 1 (score: 0)
So the problem, which it turns out has already been addressed in other Stack Overflow posts, is that baseball-reference.com comments out significant portions of its HTML source. As someone who hasn't studied much HTML, I'm still a bit puzzled about how these commented-out tables end up on the rendered page (presumably the site's JavaScript re-inserts them in the browser, which would explain why Beautiful Soup never sees them). Either way, that is the crux of it.
Here is an example of the comment block around the all_standard_fielding table, taken from the source of the baseball-reference page linked above:
</div>
<div class="placeholder"></div>
<!--
<div class="table_outer_container">
<div class="overthrow table_container" id="div_standard_fielding">
<table class="sortable stats_table" id="standard_fielding" data-cols-to-freeze=1><caption>Team Fielding--Totals</caption>
<colgroup><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col></colgroup>
<thead>
<tr>
... and so on.
The entire table has been commented out, so these comment markers need to be stripped (for example with a regex) before you try to import the data from the HTML source.
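A minimal sketch of that workaround, stripping the comment markers before parsing so the hidden tables become visible to Beautiful Soup (the standard_fielding id comes from the snippet above; the rest is illustrative):

    import re
    import urllib2
    from bs4 import BeautifulSoup

    html = urllib2.urlopen('http://www.baseball-reference.com/teams/ARI/2016.shtml').read()
    # remove the <!-- and --> markers so the commented-out tables
    # become part of the parseable document
    soup = BeautifulSoup(re.sub(r'<!--|-->', '', html), 'lxml')
    print [t.get('id') for t in soup.find_all('table')]  # should now include 'standard_fielding'

An alternative that avoids rewriting the markup is to pull the comments out with bs4's Comment type and re-parse each one individually.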