使用Beautiful Soup检索数据和链接;如果链接不存在,则只检索数据

时间:2015-04-23 21:09:40

标签: python python-2.7 web-scraping beautifulsoup

好的,我之前表述得不太清楚。我要做的是从http://www.cfbstats.com/2014/player/index.html获取大学球队及其网址的列表并导出到csv,这一步我已经成功做到了。接着我进入每个球队页面,抓取每个球员及其链接;如果球员没有链接,则只把他们的数据写入csv。目前的问题是:我只能获取到拥有网址的球员,而无法获取没有网址的球员。最终我还想进入每个球员的页面,获取他们的每项统计数据并写入csv。

对于原帖中的所有混淆感到抱歉。

import csv
import sys
import json
import urllib
import requests
from bs4 import BeautifulSoup




def getCollegeandURL():
    """Scrape every college team and its roster URL from cfbstats.com.

    Writes the team list to ``colleges.csv`` and, via ``scrapeRosters``,
    appends each team's players to ``rosters.csv``.
    """
    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    base = requests.get(originalurl)
    soup = BeautifulSoup(base.text)

    # ``with`` guarantees both CSV files are closed even on error
    # (the original left them open).
    with open('colleges.csv', 'w') as f, open('rosters.csv', 'w') as g:
        f.write("Teams" + "," + "," + "URL" + '\n')
        g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' +'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + ' Weight' + ',' +'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')

        # Process the anchors of EVERY conference div.  The original code
        # only bound ``urls`` inside the loop and processed it afterwards,
        # so just the last conference's teams were ever written.
        for div in soup.find_all('div', {'class': 'conference'}):
            for anchor in div.findAll('a'):
                college = anchor.text
                href = anchor.attrs['href']
                # originalurl[:23] == "http://www.cfbstats.com" (scheme+host)
                teamurl = originalurl[:23] + href

                f.write(college + ',' + ',' + teamurl + '\n')
                scrapeRosters(college, teamurl, g)








def scrapeRosters(college, teamurl, g):
    """Scrape one team's roster page and write each player to ``g``.

    Parameters
    ----------
    college : str
        The college name, written as the first CSV column.
    teamurl : str
        URL of the team's roster page.
    g : file object
        The already-open ``rosters.csv`` handle to append rows to.

    Note: the original body was not indented under the ``def`` (an
    IndentationError), and its ``if item not in row.findAll('a')`` test
    was always False, so link-less players were silently skipped.
    """
    roster = BeautifulSoup(requests.get(teamurl).text)

    for table in roster.find_all('table', {'class': 'team-roster'}):
        # Skip the header row.
        for row in table.find_all('tr')[1:]:
            links = row.findAll('a')
            row_text = str(row.text.split('\n'))

            if links and links[0]['href'].startswith('/'):
                # Player has their own page: include its URL.
                playerurl = links[0].attrs['href']
                g.write(college + ',' + row_text + ',' + ',' + playerurl + ',' + '\n')
            else:
                # No player link — still record the player's data.
                g.write(college + ',' + row_text + ',' + ',' + '\n')

def main():
    """Entry point: scrape the college list and every team roster."""
    return getCollegeandURL()



# Only run the scrape when executed as a script, not on import.
if __name__ == "__main__":
    main()

我认为错误在于if和elif语句。

1 个答案:

答案 0 :(得分:0)

import urllib, bs4

# Fetch one team's roster page.  (Python 2 code: ``urllib.urlopen`` and
# the ``print`` statement; in Python 3 use ``urllib.request.urlopen``.)
data = urllib.urlopen('http://www.cfbstats.com/2014/team/140/roster.html')
soup = bs4.BeautifulSoup(data.read()) # creates a BS4 HTML parsing object

# Skip the header row, then walk every roster row.
for row in soup('tr')[1:]:
    # Text of every cell in the row — the player's stats/fields.
    data = [str(i.getText()) for i in row('td')]
    link = row('td')[1]('a') # the linked player

    # Only players with their own page have an anchor; when present,
    # prepend its href so link-less players are still printed.
    if len(link) > 0:
        link = str(link[0]['href'])
        data = [str(link)] + data

    print data
    print '\n'