好的我之前不太清楚。所以我要做的是从http://www.cfbstats.com/2014/player/index.html获取大学团队和他们的网址列表并导出到csv。我成功地做到了。从那里我进入每个团队并抓住每个球员和他们的链接。如果玩家没有链接,那么它只会将他们的数据放入csv。我目前只有拥有网址但不拥有网址的玩家。最终我会想进入每个玩家并获取他们的每个统计数据并写入csv。
对于原帖中的所有混淆感到抱歉。
import csv
import sys
import json
import urllib
import requests
from bs4 import BeautifulSoup
def getCollegeandURL():
f = open('colleges.csv', 'w')
f.write("Teams" + "," + "," + "URL" + '\n')
originalurl = "http://www.cfbstats.com/2014/player/index.html"
base = requests.get("http://www.cfbstats.com/2014/player/index.html")
base = base.text
soup = BeautifulSoup(base)
# this is to find all the colleges in the div conference
mydivs = soup.find_all('div',{'class': 'conference'})
##g
g = open('rosters.csv', 'w')
g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' +'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + ' Weight' + ',' +'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')
# this for loop finds writes each college to a line
for div in mydivs:
urls= div.findAll('a')
# this is to pull all the college names and each of their links
for url in urls:
college = url.text
url = url.attrs['href']
teamurl = originalurl[:23]+url
f.write(college[:]+ ',' + ',' + teamurl[:]+'\n')
scrapeRosters(college, teamurl, g)
def scrapeRosters(college, teamurl, g):
# g is the excel document to read into
# college is the college name
# teamurl is the url link to that team's roster
roster = requests.get(teamurl)
roster = roster.text
roster = BeautifulSoup(roster)
teamname = roster.find_all('h1' , {'id': 'pageTitle'})
teamAndPlayers = {}
table = roster.find_all('table', {'class' : 'team-roster'})
for i in table:
rows = i.find_all('tr')
for row in rows[1:]:
# this retrieves the player url
for item in row.findAll('a'):
if item not in row.findAll('a'):
row = row.text
row = row.split('\n')
row = str(row)
g.write(college + ',' + row + ',' + ',' + '\n')
elif (item['href'].startswith('/')):
playerurl = item.attrs['href']
row = row.text
row = row.split('\n')
row = str(row)
g.write(college + ',' + row + ',' + ',' + playerurl + ',' + '\n')
def main():
getCollegeandURL()
main()
我认为错误在于if和elif语句。
答案 0 :(得分:0)
import urllib, bs4
data = urllib.urlopen('http://www.cfbstats.com/2014/team/140/roster.html')
soup = bs4.BeautifulSoup(data.read()) # creates a BS4 HTML parsing object
for row in soup('tr')[1:]:
data = [str(i.getText()) for i in row('td')]
link = row('td')[1]('a') # the linked player
if len(link) > 0:
link = str(link[0]['href'])
data = [str(link)] + data
print data
print '\n'