import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2
# Crawl baseball-reference.com: start at the players index page, follow the
# player links found there, collect their batting game-log URLs for the
# seasons in `years`, and print the cell text of each game-log table.
#
# NOTE(review): the original paste had lost its indentation; the nesting
# below (one printed list per game-log page) is the reconstruction that
# matches the statement order -- confirm against the original script.

base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
# Seasons to keep, matched as plain substrings of the game-log URL.
# Each season appears once so that no page is fetched and printed twice.
years = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
         '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']

# Step 1: absolute URL of every link on the players index page.
data = requests.get(player_url)
soup = BeautifulSoup(data.content)
url = []
for link in soup.find_all('a'):
    if link.has_attr('href'):
        # urljoin resolves root-relative, page-relative and already-absolute
        # hrefs correctly; plain string concatenation only handles the first.
        url.append(urljoin(player_url, link['href']))

# Step 2: keep links below the players section.
# The length cutoff of 48 is carried over unchanged; presumably it drops the
# short index links (e.g. per-letter listings) -- TODO confirm.
sink = [l for l in url if l.startswith(player_url) and len(l) > 48]

# Step 3: gather every link found on each of those pages.
urlz = []
for page_url in sink:
    page_soup = BeautifulSoup(requests.get(page_url).content)
    for link in page_soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(urljoin(page_url, link['href']))

# Step 4: keep only game-log URLs, ordered by season and then by discovery.
game_log_urls = [u for u in urlz if game_logs in u]
urlll = []
for year in years:
    for u in game_log_urls:
        if year in u:
            urlll.append(u)

# Step 5: print the rows of the batting game-log table of each page.
for j in urlll:
    response = requests.get(j)
    soup = BeautifulSoup(response.content)
    table = soup.find('table', attrs={'id': 'batting_gamelogs'})
    if table is None:
        # find() returns None when the page has no such table; calling
        # findAll on it raised the AttributeError. Skip the page and go on.
        continue
    list_of_rows = []
    for row in table.findAll('tr'):
        list_of_cells = [cell.text.replace(' ', '').encode("utf-8")
                         for cell in row.findAll('td')]
        list_of_rows.append(list_of_cells)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(list_of_rows)
当我遍历这些网址抓取表格时,有些页面中并不存在该表格。我收到如下错误:
Traceback (most recent call last):
File "py5.py", line 55, in <module>
list_of_cells.append(text)
AttributeError: 'NoneType' object has no attribute 'findAll'
即使页面中没有表格,有没有办法让循环继续执行?
答案 0 :(得分:1)
使用 try 和 except 捕获并处理该错误:
# Build list_of_rows from the game-log table, tolerating a missing table.
# `table` is None when the lookup found no matching element, and the
# AttributeError comes from table.findAll -- so that is the call the try
# block has to guard, not the list append.
try:
    rows = table.findAll('tr')
except AttributeError:
    # No table on this page: iterate over nothing so that the enclosing
    # per-URL loop can carry on with the next page.
    rows = []
for row in rows:
    list_of_cells = []
    for cell in row.findAll('td'):
        text = cell.text.replace(' ', '').encode("utf-8")
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)