我想使用Python中的beautifulsoup4,从下面这个网页中提取"星期六,5月24日"(Saturday, May 24)对应的表数据。
http://www.afl.com.au/fixture?roundId=CD_R201401410#tround
屏幕截图:
从上面的截图中可以看出,我需要提取"5月24日星期六"这一天下面的所有预览(Preview)超链接。
请帮帮我。
我试过这段代码
# Scrape the AFL fixture page and collect every "Preview" hyperlink
# found inside the fixture table. (Python 2 script: uses urllib2.)
from bs4 import BeautifulSoup
import urllib2
import urlparse
import sys

lPreviewLinkList = []
lLink = "http://www.afl.com.au/fixture?roundId=CD_R201401410#tround"
# Spoof a browser User-Agent: some sites reject the default urllib2 agent.
header = {'User-Agent': 'Mozilla/5.0'}
req_for_players = urllib2.Request(lLink, headers=header)
page_for_players = urllib2.urlopen(req_for_players)
soup_for_players = BeautifulSoup(page_for_players)
table_for_players = soup_for_players.find("table", {"class": "fancy-zebra fixture"})
for row in table_for_players.find_all("tbody"):
    for cell in row.find_all("th"):
        lCellValue = cell.get_text()
        #print lCellValue
    for cell in row.find_all("li"):
        # strip() guards against leading/trailing whitespace in the rendered
        # text, which would make a plain equality test silently fail.
        lCellValue = cell.get_text().strip()
        if lCellValue == "Preview":
            for link in cell.find_all('a'):
                # urljoin avoids the double slash that naive concatenation
                # produces when href already starts with "/".
                lPreviewLinkList.append(
                    urlparse.urljoin("http://www.afl.com.au/", link.get('href')))
答案 0(得分:0)
也许这会有所帮助:
# Walk the fixture table row by row; a <th colspan="4"> row is a date
# header that switches capture on/off, and while capture is on we collect
# every preview link from the "info" cells. (Python 2 script: uses urllib2.)
from bs4 import BeautifulSoup  # bs4, consistent with the question's code; it also provides findAll
import urllib2
import urlparse
import sys

lPreviewLinkList = []
bCapture = False  # True while we are inside the "Saturday, May 24" section
lLink = "http://www.afl.com.au/fixture?roundId=CD_R201401410#tround"
# Spoof a browser User-Agent: some sites reject the default urllib2 agent.
header = {'User-Agent': 'Mozilla/5.0'}
req_for_players = urllib2.Request(lLink, headers=header)
page_for_players = urllib2.urlopen(req_for_players)
soup_for_players = BeautifulSoup(page_for_players)
table_for_players = soup_for_players.find("table", {"class": "fancy-zebra fixture"})
for row in table_for_players.findAll("tr"):
    # A date-header row toggles capture: on for the target date, off otherwise.
    for dateRow in row.findAll("th", {"colspan": "4"}):
        # strip() tolerates stray whitespace around the rendered date text.
        bCapture = dateRow.text.strip() == "Saturday, May 24"
    if bCapture:
        for infoRow in row.findAll("td", {"class": "info"}):
            for link in infoRow.findAll("a", {"class": "preview"}):
                # urljoin avoids the double slash that naive concatenation
                # produces when href already starts with "/".
                lPreviewLinkList.append(
                    urlparse.urljoin("http://www.afl.com.au/", link.get('href')))