超出范围BeautifulSoup

时间:2017-01-18 12:16:28

标签: python beautifulsoup

我的列表索引超出范围错误,不知道为什么。我的代码是一个webscraper,用于从网站收集温度数据。直到最近,所有工作都好几个月。

我有很多功能如下所示。重要的是getDailyAve(),这是我引发异常的地方。

感谢任何想法或建议。

import sys
import urllib
from bs4 import BeautifulSoup
from urllib2 import urlopen, URLError
import webbrowser
import time
from collections import Counter
import numpy as np
import re
import csv
import datetime
from datetime import timedelta

DATE_FORMAT = '%Y/%m/%d'

def daterange(start, end):
      def convert(date):
            try:
                  date = datetime.datetime.strptime(date, DATE_FORMAT)
                  return date.date()
            except TypeError:
                  return date

      def get_date(n):
            return datetime.datetime.strftime(convert(start) + timedelta(days=n), DATE_FORMAT)

      days = (convert(end) - convert(start)).days
      if days <= 0:
            raise ValueError('The start date must be before the end date.')
      for n in range(0, days):
            yield get_date(n)

class SiteLocation:
  """class defining mine location parameters to lookup on weather search"""
  def __init__(self, city, state, zip, code):
    self.city = city
    self.state = state
    self.zip = zip
    self.code = code

def getDailyAve(url):
  url = urllib.urlopen(url)
  soup = BeautifulSoup(url.read(), 'lxml')
  form = soup.find("form",{"id": "archivedate"})
  table = form.find_next_sibling("table")
  rows = table.select("tr")[1:]

  time=[]
  temp=[]
  minutes=[]

  # handle no data case
  if soup.find(text="Archive data not available for this date."):
    print("Data not available, URL: '%s'" % url)
    return None

  # capture time and temps
  for row in rows:
    data = [td.text for td in row.find_all("td")]

    match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
    if match:
      temp.append(match.group())
      time.append(data[0])
      minutes.append(data[0][-4:-2])

  common = Counter(minutes).most_common()[0][0]

  finalTimes = []
  finalTemps = []
  for i in range(0,len(time)):
    if minutes[i] == common:
      finalTimes.append(time[i])
      finalTemps.append(int(temp[i]))
  dailyAve = sum(finalTemps) / float(len(finalTimes))
  return dailyAve

def writeToCsv(list1, list2, list3, list4, list5, list6, list7, list8):
  with open('results.csv', 'wb') as csvfile:
    results = csv.writer(csvfile, delimiter=',')
    results.writerow(['T-SJ', 'T- RB', 'T-DS', 'T-JW', 'T-GB', 'D', 'M', 'Y'])
    for idx in range(0,len(list1)):
      results.writerow([str(list1[idx]), str(list2[idx]), str(list3[idx]), str(list4[idx]), str(list5[idx]), str(list6[idx]), str(list7[idx]), str(list8[idx])])

def buildURL(location, day, month, year):
  if day < 10:
    strDay = '0'+str(day)
  else:
    strDay = str(day)

  baseURL  = "http://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=" + location.zip + "&pands=" + location.city + "%2" + "C" + location.state + "&place=" + location.city + "&state=" + location.state + "&icao=" + location.code + "&country=us&month=" + str(month) + "&day=" + strDay + "&year=" + str(year) + "&dosubmit=Go"
  return baseURL

def main():
  loc1 = SiteLocation('Farmington','NM','87401','KFMN')
  loc2 = SiteLocation('Whitesville','WV','25209','KBKW')
  loc3 = SiteLocation('Rangely','CO','81648','KVEL')
  loc4 = SiteLocation('Brookwood','AL','35444','KTCL')
  loc5 = SiteLocation('Princeton','IN','47670','KAJG')

  start = '2016/08/31'
  end = datetime.date.today()

  dateRange = list(daterange(start, end))

  listDailyAve1 = []
  listDailyAve2 = []
  listDailyAve3 = []
  listDailyAve4 = []
  listDailyAve5 = []

  listDays = []
  listMonths = []
  listYears = []

  for idx in range(0,len(dateRange)):
    strDate = str(dateRange[idx]).split("/")
    year = strDate[0]
    month = strDate[1] 
    day = strDate[2]

    url1 = buildURL(loc1, day, month, year)
    url2 = buildURL(loc2, day, month, year)
    url3 = buildURL(loc3, day, month, year)
    url4 = buildURL(loc4, day, month, year)
    url5 = buildURL(loc5, day, month, year)

    dailyAve1 = getDailyAve(url1)
    dailyAve2 = getDailyAve(url2)
    dailyAve3 = getDailyAve(url3)
    dailyAve4 = getDailyAve(url4)
    dailyAve5 = getDailyAve(url5)

    listDailyAve1.append(dailyAve1)
    listDailyAve2.append(dailyAve2)
    listDailyAve3.append(dailyAve3)
    listDailyAve4.append(dailyAve4)
    listDailyAve5.append(dailyAve5)

    listDays.append(day)
    listMonths.append(month)
    listYears.append(year)

  writeToCsv(listDailyAve1, listDailyAve2, listDailyAve3, listDailyAve4,listDailyAve5, listDays, listMonths, listYears)

if __name__ == '__main__':
  status = main()
  sys.exit(status)

以下是抛出的异常:

Traceback (most recent call last):
  File ".\weatherScrape2.py", line 147, in <module>
    status = main()
  File ".\weatherScrape2.py", line 128, in main
    dailyAve1 = getDailyAve(url1)
  File ".\weatherScrape2.py", line 61, in getDailyAve
    match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
IndexError: list index out of range

1 个答案:

答案 0 :(得分:0)

首先,您需要处理没有可用数据时的情况。这是一种方式:

# handle "no data" case
if soup.find(text="Archive data not available for this date."):
    print("Data not available, URL: '%s'." % url)
    return None

另外,我认为获取rows的逻辑存在问题。我这样做:

form = soup.find("form", {"id": "archivedate"})

table = form.find_next_sibling("table")
rows = table.select("tr")[1:]

以下是我执行的完整代码段(针对单个网址):

import requests
from bs4 import BeautifulSoup
from collections import Counter
import re


def getDailyAve(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    form = soup.find("form", {"id": "archivedate"})
    table = form.find_next_sibling("table")
    rows = table.select("tr")[1:]

    time = []
    temp = []
    minutes = []

    # handle no data case
    if soup.find(text="Archive data not available for this date."):
        print("Data not available, URL: '%s'" % url)
        return None

    # capture time and temps
    for row in rows:
        data = [td.text for td in row.find_all("td")]

        match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])", data[2])
        if match:
            temp.append(match.group())
            time.append(data[0])
            minutes.append(data[0][-4:-2])

    common = Counter(minutes).most_common()[0][0]

    finalTimes = []
    finalTemps = []
    for i in range(0, len(time)):
        if minutes[i] == common:
            finalTimes.append(time[i])
            finalTemps.append(int(temp[i]))
    dailyAve = sum(finalTemps) / float(len(finalTimes))
    return dailyAve


print(getDailyAve("https://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=87401&pands=Farmington%2CNM&place=Farmington&state=NM&icao=KFMN&country=us&month=09&day=03&year=2016&dosubmit=Go"))