I am trying to parse an ESPN webpage with BeautifulSoup to get the date, time, and teams playing in each NFL game for a given week. I am able to get most of the information, but I am having trouble with the time.
For some reason, the text inside the <a> tag is not being returned.
The html for one of the a tags is:
<a data-dateformat="time1" name="&lpos=nfl:schedule:time" href="/nfl/game?gameId=400874572">12:00 PM</a>
I am looking to get the "12:00 PM" in between the a tags, but instead I get:
<a data-dateformat="time1" href="/nfl/game?gameId=400874572" name="&lpos=nfl:schedule:time"></a>
which doesn't have any text in between the tags.
Here is what I have used to parse the webpage.
import urllib2
from bs4 import BeautifulSoup
def parse_nfl_schedule_espn():
    """Print each game's time-cell links from ESPN's NFL week-10 schedule.

    The page is downloaded with urllib2 and parsed with the lxml parser.
    Every <h2> element is treated as a game-day heading, and the element
    immediately following it is searched for <tr> game rows.  For each row
    holding more than one <td>, the list of <a> tags found in the third
    cell is printed.
    """
    page = urllib2.urlopen("http://www.espn.com/nfl/schedule/_/week/10").read()
    schedule = BeautifulSoup(page, "lxml")
    for heading in schedule.find_all('h2'):
        # Game rows live in the element that directly follows the heading.
        rows = heading.nextSibling.find_all('tr')
        # Text between the heading's opening and closing tags.  It is only
        # needed for debugging output and is not printed.
        day = str(heading).split(">")[1].split("<")[0]
        for row in rows:
            cells = row.find_all('td')
            # Rows with at most one <td> (e.g. the <thead> row) carry no game.
            if len(cells) <= 1:
                continue
            # Team abbreviations: the text inside the first <abbr> of the
            # away (first) and home (second) cells.  Extracted but not
            # printed, mirroring the disabled "AWAY at HOME" debug line.
            away = str(cells[0].find('abbr')).split(">")[1].split("<")[0]
            home = str(cells[1].find('abbr')).split(">")[1].split("<")[0]
            kickoff_links = cells[2].find_all('a')
            print(kickoff_links)


if __name__ == "__main__":
    parse_nfl_schedule_espn()
Any help/suggestions would be much appreciated.
答案 0 :(得分:1)
您需要使用 Selenium 之类的工具来获取 HTML。时间文本很可能是由页面上的 JavaScript 填充的,而 urllib2 只能拿到脚本执行之前的原始 HTML,所以 <a> 标签是空的。Selenium 会驱动真实的浏览器,从而运行页面上的 JavaScript。这可以按如下方式完成:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
def parse_nfl_schedule_espn():
    """Print the kickoff time of every game on ESPN's NFL week-10 schedule.

    The page is loaded in a Firefox instance driven by Selenium, and the
    browser's page source is then parsed with BeautifulSoup (lxml parser).
    The text of every <a> tag whose ``data-dateformat`` attribute equals
    ``"time1"`` is printed, one per line.
    """
    browser = webdriver.Firefox(firefox_binary=FirefoxBinary())
    try:
        browser.get("http://www.espn.com/nfl/schedule/_/week/10")
        # Grab the markup while the session is still alive; it is a plain
        # string, so it stays usable after the browser is closed.
        page_html = browser.page_source
    finally:
        # Always end the WebDriver session, even if loading the page fails,
        # so no Firefox/driver process is left running.
        browser.quit()

    schedule = BeautifulSoup(page_html, "lxml")
    for time_link in schedule.find_all('a', attrs={'data-dateformat': "time1"}):
        print(time_link.text)


if __name__ == "__main__":
    parse_nfl_schedule_espn()
将显示以下内容:
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
6:00 PM
9:05 PM
9:25 PM
9:25 PM
1:30 AM
1:30 AM
您还可以调查PhantomJS等“无头”解决方案,以避免显示浏览器窗口。