我正在使用python和美丽的汤来解析这个网页。 https://rpi.sodexomyway.com/dining-choices/res/sage.html在“菜单上”部分,我想获取第一个链接的网址。
以下是我正在使用的代码:
monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)
现在它正在获得第二个标签,我不知道为什么,我认为它至少会得到两个,但它只能得到第二个。
我想改变代码,以便它获得第一个标签,或者我可以搜索标签所说的内容并获得它。
对于第二部分,我只是在讨论,例如,如果a标签是
<a href="...">new tag</a>
我想搜索“新标签”
编辑:
完整代码如下。我需要当前一周的链接,所以要么从菜单部分获取第一个链接,要么按该链接中的日期搜索对应的链接……
# Created by Spencer Fontein on 5/28/14.
# Copyright (c) 2014 Spencer Fontein. All rights reserved.
# coding: utf-8
import pprint
from lxml import etree
import cgi
from bs4 import BeautifulSoup
import datetime
import urllib2
import cookielib
import re
#where to send the file at the end
output_path = ""#"/home/spencerf/public_html/rpi/"
def Get_website_text(url):
    """Fetch *url* and return the raw response body as a string.

    Cookies are persisted to a local file across runs, and a
    browser-like User-Agent header is sent so the site serves the
    normal page instead of blocking the script.
    """
    # file-backed cookie jar so cookies survive between invocations
    jar = cookielib.MozillaCookieJar('mfp.cookies')
    # opener that follows redirects and round-trips cookies
    browser = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(jar),
    )
    # masquerade as Chrome on OS X
    browser.addheaders = [('User-agent',
                           ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
                            'AppleWebKit/535.1 (KHTML, like Gecko) '
                            'Chrome/13.0.782.13 Safari/535.1'))]
    # fetch the page; closing the response before returning matches
    # the original behavior
    page = browser.open(url)
    try:
        return page.read()
    finally:
        page.close()
#get union menus
def getUnionMenuUrls(soup):
    """Collect ``[name, url]`` pairs for every weekly-menu link inside
    the ``accordion_23473`` div.

    The visible link text is mangled into a filename-safe identifier by
    applying a fixed sequence of substring replacements, in order (the
    order matters: digits are removed before the newline is turned into
    the base name, etc.).
    """
    accordion = soup.findAll('div', {'id': 'accordion_23473'})[0]
    links = accordion('a', href=True)
    base = "https://rpi.sodexomyway.com"
    # (old, new) pairs applied to the link text in exactly this order
    substitutions = (
        ('Click ', ''), ('For ', ''), ('Menu ', ''), ('of ', ''),
        ('Week ', ''), ('Here ', ''), ('Of ', ''),
        ('January ', ''), ('February ', ''), ('March ', ''),
        ('April ', ''), ('May ', ''),
        ('June ', ''), ('July ', ''), ('August ', ''), ('September ', ''),
        ('October ', ''), ('November ', ''), ('December ', ''),
        ('1', ''), ('2', ''), ('3', ''), ('4', ''),
        ('5', ''), ('6', ''), ('7', ''), ('8', ''),
        ('9', ''), ('0', ''), ('-', ''),
        ('\n', 'rpi_russell_sage_menu'), ('/', ''),
        ('!', ''), (', ', ''), (' ', ''), ('College', ''),
    )
    menu_urls = []
    for link in links:
        # only the actual weekly-menu pages, not the '#' accordion anchors
        if ".htm" not in link['href']:
            continue
        label = str(link.text)
        for old, new in substitutions:
            label = label.replace(old, new)
        menu_urls.append([label, base + link['href']])
    return menu_urls
def get_xml(url):
    """Download one weekly-menu HTML page and convert it into a nested
    XML string: <menu> > <day> > <meal> > <counter> > <dish><name>.

    Uses the open_tag/safe_open_tag/close_tags helpers to keep a stack
    of currently-open tags so that changing the day/meal/counter closes
    everything below it before opening the new element.
    """
    tag_stack = []      # currently-open XML tags, innermost last
    output_lines = []   # emitted XML, one tag per line
    # NOTE(review): this removes every ' ' character from the HTML; the
    # original was most likely stripping non-breaking spaces (U+00A0 /
    # &nbsp;) and got garbled to a plain space — confirm before changing.
    html = urllib2.urlopen(url).read().replace(' ',"")
    xml = etree.HTML(html)
    # root element with no name attribute
    open_tag(tag_stack, output_lines, "menu", "")
    days = xml.xpath('//td[@class="dayouter"]')
    # make the xml for each day
    for day in days:
        # the day's anchor name (e.g. its date) labels the <day> element
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)
        dayinner_trs = day.xpath('.//table[@class="dayinner"]//tr')
        for dayinner_tr in dayinner_trs:
            # change meal: a row with a mealname cell starts a new <meal>
            if (dayinner_tr.xpath('./td[@class="mealname"]')):
                meal_name = dayinner_tr.xpath('./td[@class="mealname"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_name)
            # change counter: a station cell starts a new <counter>
            if (dayinner_tr.xpath('./td[@class="station"]/text()')):
                counter_name = dayinner_tr.xpath('./td[@class="station"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_name)
            # change dish: each menuitem cell becomes a <dish> with an
            # escaped <name> child
            if (dayinner_tr.xpath('./td[@class="menuitem"]')):
                item_name = "".join(dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                # cgi.escape protects &, <, > in dish names
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))
    # unwind every tag still open ("" matches nothing, so the whole
    # stack is closed)
    close_tags(tag_stack, output_lines, "")
    # Python 2: encode each unicode line before joining into a byte string
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])
    return output_string
# close the tags up to the parent of last tag in tag_stack
def close_tags(tag_stack, output_lines, parent_tag):
    """Pop tags off *tag_stack*, appending an indented closing tag to
    *output_lines* for each, until *parent_tag* is on top of the stack
    (or the stack is empty)."""
    while tag_stack:
        if tag_stack[-1] == parent_tag:
            break
        closed = tag_stack.pop()
        # indent by the depth remaining after the pop
        output_lines.append(' ' * len(tag_stack) + '</%s>' % closed)
# open the new_tag using the suitable style based on name_property
def open_tag(tag_stack, output_lines, new_tag, name_property):
    """Append an indented opening tag for *new_tag* to *output_lines*
    (with a name="..." attribute when *name_property* is non-empty) and
    push *new_tag* onto *tag_stack*."""
    indent = ' ' * len(tag_stack)
    if name_property:
        line = indent + '<%s name="%s">' % (new_tag, name_property)
    else:
        line = indent + '<%s>' % new_tag
    output_lines.append(line)
    tag_stack.append(new_tag)
# check if the new_tag parent is in the stack, if not it'll add the parent
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    """Open *new_tag* as a child of *parent_tag*.

    If the parent is already somewhere on the stack, everything above it
    is closed first; otherwise the parent is opened bare (no name
    attribute) so the hierarchy stays valid.  Then *new_tag* is opened.
    """
    if parent_tag in tag_stack:
        # unwind back down to the parent before opening the new child
        close_tags(tag_stack, output_lines, parent_tag)
    else:
        # parent missing: open it directly at the current depth
        output_lines.append(' ' * len(tag_stack) + '<%s>' % parent_tag)
        tag_stack.append(parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)
# sample use of get_xml function
# In[17]:
if __name__ == "__main__":
base_url_u = "https://rpi.sodexomyway.com/dining-choices/res/sage.html"
htmltext_u = Get_website_text(base_url_u)
soup_u = BeautifulSoup(htmltext_u)
menu_url_list = getUnionMenuUrls(soup_u)
for menu in menu_url_list:
if '.htm' in menu[1]:
ofname = str(menu[0].replace(" ","A")) + ".xml"
output_file = output_path + ofname
open(output_file, "w").write(get_xml(menu[1]))
else:
print menu[0],":",menu[1], "is not valid html."
编辑2:
date function
def getCurrentWeekMenu(date1, date2, now=None):
    """Pick which of two weekly menus covers the current date.

    *date1* and *date2* are (month_name, day_string) pairs, e.g.
    ``('September', '22nd')``.  The last two characters of the day
    string are assumed to be a suffix (``nd``/``th``/...) and are
    dropped before the day number is parsed.
    TODO(review): confirm that suffix assumption against the actual
    scraped link text feeding this function.

    :param now: datetime to compare against; defaults to
        ``datetime.datetime.now()``.  Added (backward-compatible) so the
        function is deterministic and testable.
    :returns: 0 when the first menu is current, 1 for the second.
    """
    if now is None:
        now = datetime.datetime.now()
    months = ("January,February,March,April,May,June,July,August,"
              "September,October,November,December").split(',')
    month_number = dict(zip(months, range(1, 13)))
    menu_1_month = month_number[str(date1[0])]
    menu_2_month = month_number[str(date2[0])]
    # BUG FIX: these were kept as strings, so every comparison against
    # the integer now.day was meaningless (always False on Python 2,
    # TypeError on Python 3) and menu_1_day > menu_2_day compared
    # lexically ('9' > '22').  Parse them as integers.
    menu_1_day = int(date1[1][:-2])
    menu_2_day = int(date2[1][:-2])
    if menu_1_day > menu_2_day:
        # menu 2 starts earlier in (the next) month: menu 1 wraps past
        # the month boundary
        if now.day >= menu_1_day:
            menu = 1
        else:
            menu = 2
    else:
        if now.day >= menu_2_day:
            menu = 2
        elif now.month > menu_1_month:
            # we have rolled into the month after menu 1 started
            menu = 2
        else:
            menu = 1
    # convert 1-based menu choice to a 0-based list index
    return menu - 1
答案 0（得分：1）
我在运行代码时没有问题
from BeautifulSoup import BeautifulSoup
import requests
response = requests.get('https://rpi.sodexomyway.com/dining-choices/res/sage.html')
soup = BeautifulSoup(response.text)
#output of your code
print soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)
>>> [<a href="#">On the Menu</a>,
<a href="/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm" target="_blank">
9/22/2014 - 9/28/2014</a>,
<a href="/images/WeeklyMenuRSDH%209-29-14_tcm1068-29441.htm" target="_blank">
9/29/2014 - 10/5/2014</a>,
<a href="#">Hours of Operation</a>]
# now get the href
url = dict(soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1].attrs)['href']
# output
u'/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm'
回答问题的第二部分
import re
soup.find(text=re.compile('new tag'))
更新 - 添加当前周过滤器
def getUnionMenuUrls(soup):
    """Return the absolute URL of the weekly menu whose date range
    contains today, or None when no listed menu covers the current date.

    The first accordion link is "On the Menu" and the last is "Hours of
    Operation"; the [1:3] slice keeps only the two weekly-menu links.
    """
    monthly_urls = soup.findAll('div', {'id': 'accordion_23473'})[0]('a', href=True)[1:3]  # cut extra links
    today = datetime.datetime.today()  # get todays date
    url = "https://rpi.sodexomyway.com"
    for tag in monthly_urls:
        if ".htm" in tag['href']:
            name = str(tag.text)
            # link text looks like "9/22/2014 - 9/28/2014", possibly
            # surrounded by whitespace/newlines from the HTML
            datestrings = name.split(' - ')  # split string and get the list of dates
            # BUG FIX: strip each piece -- strptime raises ValueError on
            # leading whitespace, and the scraped link text starts with
            # a newline
            date_range = [datetime.datetime.strptime(d.strip(), '%m/%d/%Y')
                          for d in datestrings]
            if date_range[0] <= today <= date_range[1]:  # check if today in that range
                return url + tag['href']