Python - 首先获得解析HTML的标签

时间:2014-09-25 02:53:53

标签: python html parsing beautifulsoup

我正在使用 Python 和 Beautiful Soup 来解析这个网页:https://rpi.sodexomyway.com/dining-choices/res/sage.html 。在“菜单上(On the Menu)”部分,我想获取第一个链接的网址。

以下是我正在使用的代码:

monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)

现在它只取到了第二个标签,我不知道为什么;我原以为它至少会返回两个标签,但实际上只得到了第二个。

我想改变代码,以便它获得第一个标签,或者我可以搜索标签所说的内容并获得它。

对于第二部分,我只是在讨论,例如,如果a标签是

<a new tag </a>

我想搜索“新标签”

编辑:

完整代码如下。我需要的是“当前一周”的菜单链接,所以要么从菜单部分取第一个链接,要么按链接文本中的日期范围来搜索……

    #  Created by Spencer Fontein on 5/28/14.
#  Copyright (c) 2014 Spencer Fontein. All rights reserved.

# coding: utf-8

import pprint
from lxml import etree
import cgi
from bs4 import BeautifulSoup
import datetime
import urllib2
import cookielib
import re

# Directory the generated .xml menu files are written to at the end of the
# run; empty string means the current working directory (the commented-out
# value is the deployment path).
output_path = ""#"/home/spencerf/public_html/rpi/"

def Get_website_text(url):
    """Fetch *url* and return the raw response body as a string.

    Cookies are persisted in a local MozillaCookieJar file and the
    request carries a desktop-browser User-Agent header so the site
    serves the normal HTML page instead of rejecting a script client.
    """
    # Cookie jar backed by a file so cookies can be saved between runs.
    jar = cookielib.MozillaCookieJar('mfp.cookies')

    # Opener that follows redirects and round-trips cookies.
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(jar),
    )

    # Masquerade as Chrome on OS X rather than a Python script.
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
                  'AppleWebKit/535.1 (KHTML, like Gecko) '
                  'Chrome/13.0.782.13 Safari/535.1')
    opener.addheaders = [('User-agent', user_agent)]

    # Open the front page of the website, read it, and make sure the
    # connection is released before returning.
    response = opener.open(url)
    try:
        return response.read()
    finally:
        response.close()

#get union menus
#get union menus
def getUnionMenuUrls(soup):
    """Collect [name, absolute_url] pairs for the weekly menu links.

    Only anchors whose href contains ".htm" inside the
    div#accordion_23473 panel are kept.  Each link's text is munged
    into a file-name-safe identifier by stripping filler words, month
    names, digits and punctuation in a fixed order.
    """
    base = "https://rpi.sodexomyway.com"
    anchors = soup.findAll('div', {'id': 'accordion_23473'})[0]('a', href=True)

    # (old, new) substitution pairs, applied strictly in this order;
    # '\n' is swapped for the site prefix so every generated name starts
    # with "rpi_russell_sage_menu".
    substitutions = (
        ("Click ", ''), ('For ', ''), ('Menu ', ''), ('of ', ''),
        ('Week ', ''), ('Here ', ''), ('Of ', ''),
        ('January ', ''), ('February ', ''), ('March ', ''),
        ('April ', ''), ('May ', ''),
        ('June ', ''), ('July ', ''), ('August ', ''), ('September ', ''),
        ('October ', ''), ('November ', ''), ('December ', ''),
        ('1', ''), ('2', ''), ('3', ''), ('4', ''),
        ('5', ''), ('6', ''), ('7', ''), ('8', ''),
        ('9', ''), ('0', ''), ('-', ''),
        ('\n', 'rpi_russell_sage_menu'), ('/', ''),
        ('!', ''), (', ', ''), (' ', ''), ('College', ''),
    )

    menu_urls = []
    for tag in anchors:
        if ".htm" not in tag['href']:
            continue
        name = str(tag.text)
        for old, new in substitutions:
            name = name.replace(old, new)
        menu_urls.append([name, base + tag['href']])
    return menu_urls


def get_xml(url):
    """Download the weekly-menu page at *url* and convert it into an XML
    string of nested <menu>/<day>/<meal>/<counter>/<dish> elements.

    Relies on the module-level helpers open_tag/safe_open_tag/close_tags
    to keep element nesting balanced while walking the HTML tables.
    """
    tag_stack = []      # currently-open XML elements, innermost last
    output_lines = []   # accumulated XML output, one tag per entry

    # Drop &nbsp; entities up front so they don't pollute extracted text.
    html = urllib2.urlopen(url).read().replace('&nbsp;',"")
    xml = etree.HTML(html)

    open_tag(tag_stack, output_lines, "menu", "")
    days = xml.xpath('//td[@class="dayouter"]')
    # make the xml for each day
    for day in days:
        # The day's anchor name labels the <day> element.
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)

        dayinner_trs = day.xpath('.//table[@class="dayinner"]//tr') 
        for dayinner_tr in dayinner_trs:
            # change meal
            if (dayinner_tr.xpath('./td[@class="mealname"]')):
                meal_name = dayinner_tr.xpath('./td[@class="mealname"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_name)

            # change counter
            if (dayinner_tr.xpath('./td[@class="station"]/text()')):                
                counter_name = dayinner_tr.xpath('./td[@class="station"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_name)

            # change dish
            if (dayinner_tr.xpath('./td[@class="menuitem"]')):
                item_name = "".join(dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                # cgi.escape protects &, < and > inside dish names.
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))

    # Close every element still open, down to and including <menu>.
    close_tags(tag_stack, output_lines, "")
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])

    return output_string

# close the tags up to the parent of last tag in tag_stack
def close_tags(tag_stack, output_lines, parent_tag):
    while tag_stack and tag_stack[-1] != parent_tag:
        top = tag_stack.pop()
        output_lines.append(' ' * len(tag_stack) + '</%s>' % top)

# open the new_tag using the suitable style based on name_property
def open_tag(tag_stack, output_lines, new_tag, name_property):
    if name_property:
        output_lines.append(' ' * len(tag_stack) + '<%s name="%s">' % (new_tag, name_property))
    else:
        output_lines.append(' ' * len(tag_stack) + '<%s>' % new_tag)
    tag_stack.append(new_tag)

# check if the new_tag parent is in the stack, if not it'll add the parent
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    if parent_tag not in tag_stack:
        output_lines.append(' ' * len(tag_stack) + '<%s>' % parent_tag)
        tag_stack.append(parent_tag)
    else:   
        close_tags(tag_stack, output_lines, parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)

# sample use of get_xml function


# In[17]:

if __name__ == "__main__":
    # Landing page that lists the weekly menu links.
    base_url_u = "https://rpi.sodexomyway.com/dining-choices/res/sage.html"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    # menu_url_list holds [name, url] pairs produced by getUnionMenuUrls.
    menu_url_list = getUnionMenuUrls(soup_u)
    for menu in menu_url_list:
        if '.htm' in menu[1]:
            # Spaces in the name would break the output path, so swap
            # each for "A" before building the .xml file name.
            ofname = str(menu[0].replace(" ","A")) + ".xml"
            output_file = output_path + ofname
            open(output_file, "w").write(get_xml(menu[1]))
        else:
            print menu[0],":",menu[1], "is not valid html."

编辑2:

date function

def getCurrentWeekMenu(date1,date2):
    """Pick which of two weekly menus covers today's date.

    Each of *date1*/*date2* is a (month_name, day_string) pair such as
    ("September", "22nd"); the day string carries a two-character
    ordinal suffix that is stripped before parsing.

    Returns 0 to select the first menu, 1 to select the second.
    """
    now = datetime.datetime.now()
    monthstr = "January,February,March,April,May,June,July,August,September,October,November,December"
    months = monthstr.split(',')
    d = dict(zip(months, range(1, 13)))
    menu_1_month = d[str(date1[0])]
    menu_2_month = d[str(date2[0])]
    # Bug fix: convert the day strings to ints.  The original kept them
    # as strings, so "menu_1_day > menu_2_day" compared lexicographically
    # ("9" > "22" is True) and "now.day >= menu_1_day" compared an int
    # to a str, which never behaves as a numeric comparison.
    menu_1_day = int(date1[1][:-2])
    menu_2_day = int(date2[1][:-2])
    if menu_1_day > menu_2_day:
        # Menu 1 starts later in the month than menu 2.
        menu = 1 if now.day >= menu_1_day else 2
    else:
        if now.day >= menu_2_day:
            menu = 2
        elif now.month > menu_1_month:
            # Past menu 1's month entirely, so menu 2 must be current.
            menu = 2
        else:
            menu = 1
    return menu - 1

1 个答案:

答案 0(得分:1)

我在运行代码时没有问题

# Reproduce the question's query (BeautifulSoup 3 import style + requests).
from BeautifulSoup import BeautifulSoup
import requests
response = requests.get('https://rpi.sodexomyway.com/dining-choices/res/sage.html')
soup = BeautifulSoup(response.text)
#output of your code
print soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)

>>> [<a href="#">On the Menu</a>,
     <a href="/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm" target="_blank">
                     9/22/2014 - 9/28/2014</a>,
     <a href="/images/WeeklyMenuRSDH%209-29-14_tcm1068-29441.htm" target="_blank">
                     9/29/2014 - 10/5/2014</a>,
     <a href="#">Hours of Operation</a>]

# now get the href
url = dict(soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1].attrs)['href']
# output
u'/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm'

回答问题的第二部分

import re
soup.find(text=re.compile('new tag'))

更新 - 添加当前周过滤器

def getUnionMenuUrls(soup):
    """Return the absolute URL of the weekly menu covering today's date.

    Scans the menu links inside div#accordion_23473; each link's text is
    an "m/d/Y - m/d/Y" date range.  Returns None when no range matches.
    """
    monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1:3] # cut extra links
    today = datetime.datetime.today() # get todays date
    url = "https://rpi.sodexomyway.com"
    for tag in monthly_urls:
        if ".htm" in tag['href']:
            name = str(tag.text)
            # Bug fix: the anchor text carries surrounding newlines and
            # spaces (visible in the printed tag list earlier on this
            # page), and strptime('%m/%d/%Y') raises ValueError on
            # un-stripped input — strip each piece before parsing.
            datestrings = [part.strip() for part in name.split(' - ')]
            date_range = [datetime.datetime.strptime(d, '%m/%d/%Y') for d in datestrings]
            if date_range[0] <= today <= date_range[1]: # check if today in that range
                return url + tag['href']
    # No link's date range covers today.
    return None