我正在努力学习报废,
我在代码中使用较低的异常来传递错误,因为它们不会影响将数据写入csv
我一直得到一个" socket.gaierror"但是在处理中有一个" urllib.error.URLError"在处理我得到" NameError:name' socket'没有定义"这似乎很迂回
我有点明白,使用这些异常可能不是运行代码的最佳方式,但我似乎无法克服这些错误,我不知道解决方法或如何修复错误。
如果您在修复错误异常之外有任何建议,那么也会非常感激。
import csv
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
base_url = 'http://www.fangraphs.com/' # used in line 27 for concatenation
years = ['2017','2016','2015'] # for enough data to run tests
#Getting Links for letters
player_urls = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
player_urls.append(base_url + link['href'])
#Getting Alphabet Links
test_for_playerlinks = 'players.aspx?letter='
player_alpha_links = []
for i in player_urls:
if test_for_playerlinks in i:
player_alpha_links.append(i)
# Getting Player Links
ind_player_urls = []
for l in player_alpha_links:
data = urlopen(l)
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
ind_player_urls.append(link['href'])
#Player Links
jan = 'statss.aspx?playerid'
players = []
for j in ind_player_urls:
if jan in j:
players.append(j)
# Building Pitcher List
pitcher = 'position=P'
pitchers = []
pos_players = []
for i in players:
if pitcher in i:
pitchers.append(i)
else:
pos_players.append(i)
# Individual Links to Different Tables Sorted by Base URL differences
splits = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs = 'http://www.fangraphs.com/statsd.aspx?'
split_pp = []
gamel = []
years = ['2017','2016','2015']
for i in pos_players:
for year in years:
split_pp.append(splits + i[12:]+'&season='+ year)
gamel.append(game_logs+ i[12:] + '&type=&gds=&gde=&season=' + year)
split_pitcher = []
gl_pitcher = []
for i in pitchers:
for year in years:
split_pitcher.append(splits + i[12:]+'&season=' + year)
gl_pitcher.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)
# Splits for Pitcher Data
row_sp = []
rows_sp = []
try:
for i in split_pitcher:
sauce = urlopen(i)
soup = BeautifulSoup(sauce, "html.parser")
table1 = soup.find_all('strong', {"style":"font-size:15pt;"})
row_sp = []
for name in table1:
nam = name.get_text()
row_sp.append(nam)
table = soup.find_all('table', {"class":"rgMasterTable"})
for h in table:
he = h.find_all('tr')
for i in he:
td = i.find_all('td')
for j in td:
row_sp.append(j.get_text())
rows_sp.append(row_sp)
except(RuntimeError, TypeError, NameError, URLError, socket.gaierror):
pass
try:
with open('SplitsPitchingData2.csv', 'w') as fp:
writer = csv.writer(fp)
writer.writerows(rows_sp)
except(RuntimeError, TypeError, NameError):
pass
答案 0 :(得分:1)
我猜你的主要问题是你 - 没有任何睡眠 - 查询网站上有大量无效网址(你在2015-2017年为22880个投手创造了3个网址,但大多数这些不属于该范围,因此您有成千上万的返回错误的查询。)
我很惊讶您的IP未被网站管理员禁止。那说:做一些过滤会更好,所以你要避免所有这些错误查询...
我申请的过滤器并不完美。它检查列表中的年份是出现在网站上的开头还是结束年份(例如'2004 - 2015')。这也会创建错误链接,但没有办法接近原始脚本的数量。
在代码中它看起来像这样:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
import csv
base_url = 'http://www.fangraphs.com/'
years = ['2017','2016','2015']
# Getting Links for letters
letter_links = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
try:
link = base_url + link['href']
if 'players.aspx?letter=' in link:
letter_links.append(link)
except:
pass
print("[*] Retrieved {} links. Now fetching content for each...".format(len(letter_links)))
# the data resides in two different base_urls:
splits_url = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs_url = 'http://www.fangraphs.com/statsd.aspx?'
# we need (for some reason) players in two lists - pitchers_split and pitchers_game_log - and the rest of the players in two different, pos_players_split and pis_players_game_log
pos_players_split = []
pos_players_game_log = []
pitchers_split = []
pitchers_game_log = []
# and if we wanted to do something with the data from the letter_queries, lets put that in a list for safe keeping:
ind_player_urls = []
current_letter_count = 0
for link in letter_links:
current_letter_count +=1
data = urlopen(link)
soup = BeautifulSoup(data, "html.parser")
trs = soup.find('div', class_='search').find_all('tr')
for player in trs:
player_data = [tr.text for tr in player.find_all('td')]
# To prevent tons of queries to fangraph with invalid years - check if elements from years list exist with the player stat:
if any(year in player_data[1] for year in years if player_data[1].startswith(year) or player_data[1].endswith(year)):
href = player.a['href']
player_data.append(base_url + href)
# player_data now looks like this:
# ['David Aardsma', '2004 - 2015', 'P', 'http://www.fangraphs.com/statss.aspx?playerid=1902&position=P']
ind_player_urls.append(player_data)
# build the links for game_log and split
for year in years:
split = '{}{}&season={}'.format(splits_url,href[12:],year)
game_log = '{}{}&type=&gds=&gde=&season={}'.format(game_logs_url, href[12:], year)
# checking if the player is pitcher or not. We're append both link and name (player_data[0]), so we don't need to extract name later on
if 'P' in player_data[2]:
pitchers_split.append([player_data[0],split])
pitchers_game_log.append([player_data[0],game_log])
else:
pos_players_split.append([player_data[0],split])
pos_players_game_log.append([player_data[0],game_log])
print("[*] Done extracting data for players for letter {} out of {}".format(current_letter_count, len(letter_links)))
sleep(2)
# CONSIDER INSERTING CSV-PART HERE....
# Extracting and writing pitcher data to file
with open('SplitsPitchingData2.csv', 'a') as fp:
writer = csv.writer(fp)
for i in pitchers_split:
try:
row_sp = []
rows_sp = []
# all elements in the pitchers_split are lists. Player name is i[1]
data = urlopen(i[1])
soup = BeautifulSoup(data, "html.parser")
# append name to row_sp from pitchers_split
row_sp.append(i[0])
# the page has 3 tables with the class rgMasterTable, the first i Standard, the second Advanced, the 3rd Batted Ball
# we're only grabbing standard
table_standard = soup.find_all('table', {"class":"rgMasterTable"})[0]
trs = table_standard.find_all('tr')
for tr in trs:
td = tr.find_all('td')
for content in td:
row_sp.append(content.get_text())
rows_sp.append(row_sp)
writer.writerows(rows_sp)
sleep(2)
except Exception as e:
print(e)
pass
由于我不确定您希望如何在输出上格式化数据,因此您需要做一些工作。
如果你想在检索实际的投手统计数据之前等待提取所有的letter_links(并微调你的输出),你可以将csv编写器部分向上移动,因此它作为字母循环的一部分运行。如果你这样做,不要忘记在抓住另一个letter_link之前清空pitchers_split列表......