访问隐藏在JavaScript背后的数据

时间:2015-07-23 22:28:22

标签: python beautifulsoup

我试图从以下页面中每场比赛的下拉列表里提取进球时间:http://www.bbc.co.uk/sport/football/league-one/results

我在搜索时找不到这些数据——有人知道为什么吗?

import requests
from bs4 import BeautifulSoup

# Load Page Data
# The parser is named explicitly: without it bs4 picks whichever parser is
# installed, which can produce different trees on different machines.
r = requests.get("http://www.bbc.co.uk/sport/football/league-one/results")
soup = BeautifulSoup(r.content, "html.parser")
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(soup.prettify())

# Save Teams: print the text of every anchor element on the page.
for link in soup.find_all("a"):
    print(link.text)

# Save Results: print the text of every <abbr> element on the page.
for link in soup.find_all("abbr"):
    print(link.text)

FF

1 个答案:

答案 0 :(得分:0)

这种做法的数据量非常大(更不用说因为要逐个加载所有单独的页面而很慢),而且对方可能最终会因为请求过多而屏蔽你,但这是我能想到的唯一办法。我的做法是遍历页面,获取与“Results”按钮相关联的href,加载该页面并解析它,从中取得比分信息。

import requests
from bs4 import BeautifulSoup

def _team_summary(details, heading):
    """Return ``(name, score, scorers)`` read from one team's details element.

    ``details`` is the ``div`` with class ``team-match-details`` for the team
    and ``heading`` is the first line of the scorer listing. ``scorers`` is
    the heading followed by the text of each ``span`` in the next ``p``
    element, joined with newlines.
    """
    lead_span = details.find_next('span')
    name = lead_span.find_next('a').text
    score = lead_span.find_next('span').text

    scorers = [heading]
    # NOTE(review): find_next searches forward through the whole document, not
    # only inside ``details`` -- confirm the first following <p> always
    # belongs to this team (e.g. when the team did not score).
    scorer_block = details.find_next('p')
    if scorer_block is not None:
        # U+2032 (prime, presumably the minute mark) becomes a plain apostrophe.
        scorers.extend(
            span.text.replace(u'\u2032', "'")
            for span in scorer_block.find_all('span')
        )
    return name, score, "\n".join(scorers)


def parse_page(data):
    """Print the score line and goal scorers found in a match page.

    Args:
        data: HTML text of a single match page.

    Prints a separator line, then ``<home> <score> - <away> <score>``, then
    the home and away scorer listings.

    Raises:
        ValueError: if the ``match-overview`` container or either team's
            ``team-match-details`` element is not present in ``data``.
    """
    # Name the parser so the tree does not depend on which parsers happen to
    # be installed.
    subsoup = BeautifulSoup(data, 'html.parser')
    overview = subsoup.find('div', attrs={'id': 'match-overview'})
    if overview is None:
        raise ValueError("page has no div with id 'match-overview'")

    details_attrs = {'class': 'team-match-details'}
    # The home side is the first team-match-details div inside the overview;
    # the away side is looked up inside the div with id 'away-team'.
    home_details = overview.find('div', attrs=details_attrs)
    away_block = overview.find('div', attrs={'id': 'away-team'})
    away_details = None
    if away_block is not None:
        away_details = away_block.find('div', attrs=details_attrs)
    if home_details is None or away_details is None:
        raise ValueError("page is missing a 'team-match-details' div")

    home_team, home_score, home_goals = _team_summary(
        home_details, "Home Goal Scorers:")
    away_team, away_score, away_goals = _team_summary(
        away_details, "Away Goal Scorers:")

    print('--------------')
    # A unicode template keeps non-ASCII team names from raising
    # UnicodeEncodeError inside format() on Python 2.
    print(u'{0} {1} - {2} {3}'.format(
        home_team, home_score, away_team, away_score))
    print(home_goals)
    print(away_goals)

def all_league_results():
    """Fetch the League One results listing and print every linked match.

    Requests the results page, collects each anchor with class ``report``
    that carries an ``href``, downloads the linked page and passes its text
    to ``parse_page``. One HTTP request is made per link.

    Raises:
        requests.HTTPError: if the results listing returns an error status.
    """
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    r = requests.get("http://www.bbc.co.uk/sport/football/league-one/results")
    r.raise_for_status()
    # Name the parser so the tree does not depend on which parsers happen to
    # be installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    # Requiring href skips anchors that would otherwise raise KeyError below.
    for link in soup.find_all("a", attrs={'class': 'report', 'href': True}):
        # urljoin gives the same URL as plain concatenation for hrefs such as
        # "/sport/...", and also copes with absolute or slash-less hrefs.
        full_link = urljoin('http://www.bbc.com', link['href'])
        subr = requests.get(full_link)
        parse_page(subr.text)

def specific_game_results(url):
    """Download the page at ``url`` and hand its text to ``parse_page``."""
    page_html = requests.get(url).text
    parse_page(page_html)

# Print the summary for one specific match page first...
single_match_url = 'http://www.bbc.co.uk/sport/0/football/32460049'
specific_game_results(single_match_url)
# ...then print the summary for every match linked from the league results.
all_league_results()