我编写了一个 web scraper:它先访问 cfbstats.com/2014/player/index.html,抓取所有大学橄榄球队的链接;然后进入每个球队的链接,抓取球队名册和各球员的链接;最后进入每个球员的链接,获取该球员的统计数据。
我目前在抓取球员统计数据这一步遇到了问题。打印每个表格的标题(caption)时,输出是 [Tackle];打印表格第一行时,输出是 [G]。我想去掉这些标签括号。我之前的几个函数的输出里并没有这种标签。任何帮助都将不胜感激。
import csv
import sys
import json
import urllib
import requests
from bs4 import BeautifulSoup
import xlrd
import xlwt
def getCollegeandURL():
    """Scrape the cfbstats.com index page, write every team name and team
    URL to colleges.csv, and hand each team off to scrapeRosters().

    Side effects: creates colleges.csv and rosters.csv in the working
    directory and drives the whole scraping pipeline.
    """
    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    # originalurl[:23] == "http://www.cfbstats.com" -- the site root, used
    # to turn the page's relative hrefs into absolute URLs
    siteroot = originalurl[:23]
    f = open('colleges.csv', 'w')
    # g collects the roster rows for every team
    g = open('rosters.csv', 'w')
    # h is a workbook for per-player stats, threaded through the other
    # scrapers (NOTE(review): it is never saved to disk anywhere yet)
    h = xlwt.Workbook()
    try:
        f.write("Teams" + "," + "," + "URL" + '\n')
        g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' + 'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + ' Weight' + ',' + 'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')
        soup = BeautifulSoup(requests.get(originalurl).text)
        # each <div class="conference"> holds the anchor tags for its teams
        for div in soup.find_all('div', {'class': 'conference'}):
            for anchor in div.findAll('a'):
                college = anchor.text
                teamurl = siteroot + anchor.attrs['href']
                f.write(college + ',' + ',' + teamurl + '\n')
                scrapeRosters(college, teamurl, g, h)
    finally:
        # BUG FIX: the original never closed these handles, so a failed
        # request mid-run could lose buffered rows; close unconditionally
        f.close()
        g.close()
############################################################################
def scrapeRosters(college, teamurl, g, h):
    """Scrape one team's roster page, append one CSV line per player to the
    open file object g, and call playerStats() for every player that has
    his own page.

    college -- team name written into each row
    teamurl -- absolute URL of the team's roster page
    g       -- open file object for rosters.csv
    h       -- xlwt workbook passed through to playerStats()
    """
    roster = BeautifulSoup(requests.get(teamurl).text)
    for table in roster.find_all('table', {'class': 'team-roster'}):
        # first <tr> is the column-header row; skip it
        for row in table.find_all('tr')[1:]:
            cells = row('td')
            data = [str(c.getText()) for c in cells]
            # the name cell (second column) links to the player's page
            # when cfbstats has one for him
            link = cells[1]('a')
            if len(link) > 0:
                data = [str(link[0]['href'])] + data
            else:
                # BUG FIX: rows without a player link used to leave data
                # with only 8 fields, crashing the 9-way unpack below
                # with a ValueError; pad with an empty URL instead
                data = [''] + data
            (playerurl, playernumber, playerName, playerPosition,
             YearinCollege, playerHeight, playerWeight, playerHometown,
             lastSchool) = data
            # teamurl[:23] == "http://www.cfbstats.com"; hrefs are relative
            playerurl = teamurl[:23] + playerurl
            data = (college, playernumber, playerName, playerPosition,
                    YearinCollege, playerHeight, playerWeight,
                    playerHometown, lastSchool)
            g.write(college + ',' + playernumber + ',' + playerName + ',' + playerPosition + ','+ YearinCollege + ',' + playerHeight + ',' + playerWeight + ',' + playerHometown + ',' + lastSchool+ ',' + ',' + playerurl + ',' + '\n')
            if len(link) > 0:
                # only follow real player pages; padded rows have no URL
                playerStats(data, playerurl, h)
############################################################################
def playerStats(data,playerurl, h):
playerurl = requests.get(playerurl)
playerurl = playerurl.text
playerurl = BeautifulSoup(playerurl)
tablestats = playerurl.find_all('table', {'class' : 'player-home'})
(college, playernumber, playerName, playerPosition,YearinCollege, playerHeight, playerWeight, playerHometown, lastSchool) = data
#print college, playernumber, playerName
print college, playerName, playernumber
for x in tablestats:
caption = x.find_all('caption')
rows = x.find_all('tr')
## caption = caption.strip
for row in rows:
headers = x.find_all('th')
headers = [str(i.getText()) for i in row('tr')]
stats = [str(x.getText()) for x in row('td')]
print caption, headers, stats
############################################################################
def main():
    """Entry point: scrape every college, its roster, and each player's stats."""
    getCollegeandURL()

# guard so importing this module does not kick off a full site scrape
if __name__ == '__main__':
    main()