Question

@martineau我已经更新了我的代码，这是你的意思吗？我如何处理KeyError而不是NameError？

# Download the NEA "PSI readings over the last 24 hours" page and parse it.
# `soup` is presumably BeautifulSoup imported under that name -- TODO confirm.
url = "http://app2.nea.gov.sg/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours"
web_soup = soup(urllib2.urlopen(url))

# Locate the readings table: first <div class="c1">, then the fifth <div>
# found inside it, then the first <table> inside that.
table = web_soup.find(name="div", attrs={'class': 'c1'}).find_all(name="div")[4].find_all('table')[0]

# Maps a datetime (one per hourly cell, starting at 12AM) to the cell's text.
data = {}
cur_time = datetime.datetime.strptime("12AM", "%I%p")
for tr_index, tr in enumerate(table.find_all('tr')):
    # Rows whose text contains 'Time' are skipped (presumably header rows).
    if 'Time' in tr.text:
        continue
    for td_index, td in enumerate(tr.find_all('td')):
        # Skip the first cell of each row (presumably the row label).
        if not td_index:
            continue
        data[cur_time] = td.text.strip()

        # A cell containing a <strong> tag is remembered as the "bold" time,
        # and its stored value is overwritten with the literal '20'.
        if td.find('strong'):
            bold_time = cur_time
            data[bold_time] = '20'
        # Each data cell advances the clock by one hour.
        cur_time += datetime.timedelta(hours=1)

        # Re-assigned on every cell; only bound once at least one cell ran.
        default_value = '20' # whatever you want it to be

    # NOTE(review): this try/except sits inside the row loop, so it runs once
    # per data row rather than once after the whole table has been read.
    try:
        # Raises NameError while no <strong> cell has been seen yet, because
        # bold_time is not assigned anywhere before the loop.
        bold = data[bold_time]
    except NameError:

        # NOTE(review): bold_time is rebound to the string default_value
        # here. On a later row, data[bold_time] would then look up that
        # string, which is not a key of data (keys are datetimes), so an
        # uncaught KeyError would propagate unless a <strong> cell was found
        # in between. `bold` itself is left unassigned on this path.
        bold_time = beforebold = beforebeforebold = default_value
    # might want to set "bold" to something, too, if needed
    else:   
        # Readings one and two hours before the bold time; data.get returns
        # None when there is no entry for that hour.
        beforebold = data.get(bold_time - datetime.timedelta(hours=1)) 
        beforebeforebold =  data.get(bold_time - datetime.timedelta(hours=2))

这是我打印数据进行计算的地方。

# Print the three values computed by the scraping script (Python 2 print
# statements).
# NOTE(review): `bold` is assigned only when the lookup in the script's `try`
# succeeds; if it never succeeded, this first line raises NameError.
print bold
print beforebold
print beforebeforebold

Answer 1

您需要添加一些内容来设置data[bold_time]：

    # When the cell contains a <strong> tag, remember its time and make sure
    # data has an entry under that key.
    if td.find('strong'):
        bold_time = cur_time
        # `?????` is a placeholder, not valid Python: substitute the value
        # you want stored for the bold cell.
        data[bold_time] = ????? # whatever it should be
    # Advance the clock by one hour.
    cur_time += datetime.timedelta(hours=1)

只要找到了 strong 标签，这样就能同时避免 NameError 和 KeyError 异常。不过您可能仍然希望进行防御性编码，优雅地处理其中一个或两个异常。这正是异常的用途：处理那些本不应该发生的例外情况……

Answer 2

我先读了您之前的帖子，然后才读到这一篇。我觉得用 BeautifulSoup 来实现您的目标很可惜：从我看到的代码来看，它用起来很复杂，而且事实上正则表达式的运行速度大约是 BeautifulSoup 的 10 倍。

这是仅包含re的代码，它提供您感兴趣的数据。
我知道，有人会说正则表达式无法解析HTML文本。我知道，我知道……但我并不是在解析文本，而是直接定位感兴趣的文本块。这个网站的网页源代码显然结构良好，因此风险似乎很小。此外，还可以添加测试和校验，以便持续监视源代码，并在网站管理员改动网页时立即得知。

import re
from httplib import HTTPConnection

# Fetch the NEA "PSI readings over the last 24 hours" page over plain HTTP
# and extract, per region, the list of hourly readings using regexes only.
hypr = HTTPConnection(host='app2.nea.gov.sg',
                      timeout=300)
rekete = ('/anti-pollution-radiation-protection/'
          'air-pollution/psi/'
          'psi-readings-over-the-last-24-hours')

# Always release the socket, even if the request or the read fails.
try:
    hypr.request('GET', rekete)
    page = hypr.getresponse().read()
finally:
    hypr.close()


# Pattern locating the two halves of the PSI table. Raw strings keep the
# regex escapes (\d, \r, \n) intact instead of relying on Python leaving
# unknown string escapes alone.
#   group 1: the AM block (AM header cells followed by the AM data rows)
#   group 2: the PM block (PM header cells followed by the PM data rows)
patime = (r'PSI Readings.+?'
          r'width="\d+%" align="center">\r\n'
          r' *<strong>Time</strong>\r\n'
          r' *</td>\r\n'
          r'((?: *<td width="\d+%" align="center">'
          r'<strong>\d+AM</strong>\r\n'
          r' *</td>\r\n)+.+?)'

          r'width="\d+%" align="center">\r\n'
          r' *<strong>Time</strong>\r\n'
          r' *</td>\r\n'
          r'((?: *<td width="\d+%" align="center">'
          r'<strong>\d+PM</strong>\r\n'
          r' *</td>\r\n)+.+?)'
          # The dot is escaped so that only the literal "PM2.5" ends the block.
          r'PM2\.5 Concentration')
rgxtime = re.compile(patime, re.DOTALL)


# Pattern matching one region row inside an AM or PM block.
patline = (r'<td align="center">\r\n'
           r' *<strong>'             # next line = group 1: region name
           r'(North|South|East|West|Central|Overall Singapore)'
           r'</strong>\r\n'
           r' *</td>\r\n'
           r'((?: *<td align="center">\r\n'  # group 2 start: plain value cells
           r' *[.\d-]+\r\n'                  #
           r' *</td>\r\n)*)'                 # group 2 end

           r' *<td align="center">\r\n'
           r' *<strong style[^>]+>'
           r'([.\d-]+)'  # group 3: value in the styled <strong> cell
           r'</strong>\r\n'
           r' *</td>\r\n')
rgxline = re.compile(patline)

# Pattern pulling each individual value out of a run of plain value cells.
rgxnb = re.compile(r'<td align="center">\r\n'
                   r' *([.\d-]+)\r\n'
                   r' *</td>\r\n')


m = rgxtime.search(page)
if m is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError('PSI readings table not found in the page; '
                       'the page layout may have changed')

a, b = m.span(1)  # m.group(1) contains the data AM
# Region name -> list of AM values (plain cells, then the styled one).
d = dict((mat.group(1),
          rgxnb.findall(mat.group(2)) + [mat.group(3)])
         for mat in rgxline.finditer(page[a:b]))

a, b = m.span(2)  # m.group(2) contains the data PM
for mat in rgxline.finditer(page[a:b]):
    # setdefault tolerates a region that appears only in the PM block.
    d.setdefault(mat.group(1), []).extend(
        rgxnb.findall(mat.group(2)) + [mat.group(3)])


# Single-argument parenthesised print gives the same output under Python 2.
print('last 3 values')
for k, v in d.iteritems():
    print('%s  :  %s' % (k, v[-3:]))

Python如何添加异常？

2 个答案: