import csv
import sys
import json
import urllib
import requests
from bs4 import BeautifulSoup
def getCollegeandURL():
    """Scrape the 2014 cfbstats player index and write two CSV files.

    ``colleges.csv`` gets a header line and then one line per college link
    found on the index page (college name, empty column, absolute URL).
    ``rosters.csv`` gets its header here; the per-player lines are appended
    by ``scrapeRosters``, which is called once for every college link.

    Returns:
        None. All results are written to the two files in the current
        working directory.
    """
    # Function-scope import so the block runs on both Python 3 and Python 2
    # without touching the file-level import list.
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2
        from urlparse import urljoin

    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(requests.get(originalurl).text, 'html.parser')
    # College links live inside the <div class="conference"> blocks.
    mydivs = soup.find_all('div', {'class': 'conference'})

    # Context managers guarantee both files are flushed and closed, even if
    # a request or a roster scrape raises part-way through.
    with open('colleges.csv', 'w') as f, open('rosters.csv', 'w') as g:
        f.write("Teams,,URL\n")
        g.write(
            "College Rosters\n\n"
            "College,,Playernumber,Player Last Name,Player First Name,"
            "Position,Year,Height, Weight,Hometown,State,Last School,\n"
        )
        for div in mydivs:
            # href=True skips anchors that carry no link target at all.
            for link in div.find_all('a', href=True):
                college = link.text
                # urljoin resolves site-relative, page-relative and absolute
                # hrefs alike, instead of slicing the host off by position.
                teamurl = urljoin(originalurl, link['href'])
                f.write(college + ',' + ',' + teamurl + '\n')
                scrapeRosters(college, teamurl, g)
def scrapeRosters(college, teamurl, g):
    """Fetch one team's roster page and append its rows to ``g``.

    Every ``<tr>`` after the first in each ``<table class="team-roster">``
    produces exactly one output line: the college name, the row's text
    (split on newlines and re-joined with commas), an empty column and, when
    the row contains one, the first site-relative link (an href starting with
    ``/``), presumably the player's page.

    Args:
        college: College name written as the first column of every line.
        teamurl: URL of the team's roster page.
        g: Writable text file object (the roster CSV) to append lines to.

    Returns:
        None.
    """
    # Explicit parser keeps the parse independent of installed extras.
    page = BeautifulSoup(requests.get(teamurl).text, 'html.parser')
    for table in page.find_all('table', {'class': 'team-roster'}):
        # The first row is skipped (presumably the column-header row).
        for row in table.find_all('tr')[1:]:
            # Join the pieces with commas rather than writing the Python
            # list repr (brackets and quotes) into the CSV.
            fields = ','.join(row.text.split('\n'))
            # Take the first site-relative link. The row is handled once, so
            # a row with several anchors no longer fails on a rebound `row`.
            playerurl = next(
                (a['href'] for a in row.find_all('a', href=True)
                 if a['href'].startswith('/')),
                None,
            )
            if playerurl is None:
                # Rows without a link are still recorded, without a URL.
                g.write(college + ',' + fields + ',' + ',' + '\n')
            else:
                g.write(college + ',' + fields + ',' + ',' + playerurl + ',' + '\n')
def main():
# 答案 0 (得分: 0) -- Answer 0 (score: 0)
# Standalone example: print every row of one team's roster page as a list of
# cell texts, with the player's link (when the row has one) prepended.
import bs4
import requests

response = requests.get('http://www.cfbstats.com/2014/team/140/roster.html')
# Explicit parser so the result does not depend on optional parser libraries.
soup = bs4.BeautifulSoup(response.text, 'html.parser')
for row in soup('tr')[1:]:  # skip the first <tr>
    cells = row('td')
    # get_text() already returns text; wrapping it in str() could raise
    # UnicodeEncodeError on Python 2 for non-ASCII cell text.
    data = [i.getText() for i in cells]
    # The link is looked up in the second cell; rows with fewer than two
    # cells simply have no link instead of raising IndexError.
    link = cells[1]('a') if len(cells) > 1 else []
    if link:
        data = [str(link[0]['href'])] + data
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(data)
    print('\n')