我的列表索引超出范围错误,不知道为什么。我的代码是一个webscraper,用于从网站收集温度数据。直到最近,所有工作都好几个月。
我有很多功能如下所示。重要的是getDailyAve()
,这是我引发异常的地方。
感谢任何想法或建议。
import sys
import urllib
from bs4 import BeautifulSoup
from urllib2 import urlopen, URLError
import webbrowser
import time
from collections import Counter
import numpy as np
import re
import csv
import datetime
from datetime import timedelta
DATE_FORMAT = '%Y/%m/%d'
def daterange(start, end):
def convert(date):
try:
date = datetime.datetime.strptime(date, DATE_FORMAT)
return date.date()
except TypeError:
return date
def get_date(n):
return datetime.datetime.strftime(convert(start) + timedelta(days=n), DATE_FORMAT)
days = (convert(end) - convert(start)).days
if days <= 0:
raise ValueError('The start date must be before the end date.')
for n in range(0, days):
yield get_date(n)
class SiteLocation:
"""class defining mine location parameters to lookup on weather search"""
def __init__(self, city, state, zip, code):
self.city = city
self.state = state
self.zip = zip
self.code = code
def getDailyAve(url):
url = urllib.urlopen(url)
soup = BeautifulSoup(url.read(), 'lxml')
form = soup.find("form",{"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
time=[]
temp=[]
minutes=[]
# handle no data case
if soup.find(text="Archive data not available for this date."):
print("Data not available, URL: '%s'" % url)
return None
# capture time and temps
for row in rows:
data = [td.text for td in row.find_all("td")]
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
if match:
temp.append(match.group())
time.append(data[0])
minutes.append(data[0][-4:-2])
common = Counter(minutes).most_common()[0][0]
finalTimes = []
finalTemps = []
for i in range(0,len(time)):
if minutes[i] == common:
finalTimes.append(time[i])
finalTemps.append(int(temp[i]))
dailyAve = sum(finalTemps) / float(len(finalTimes))
return dailyAve
def writeToCsv(list1, list2, list3, list4, list5, list6, list7, list8):
with open('results.csv', 'wb') as csvfile:
results = csv.writer(csvfile, delimiter=',')
results.writerow(['T-SJ', 'T- RB', 'T-DS', 'T-JW', 'T-GB', 'D', 'M', 'Y'])
for idx in range(0,len(list1)):
results.writerow([str(list1[idx]), str(list2[idx]), str(list3[idx]), str(list4[idx]), str(list5[idx]), str(list6[idx]), str(list7[idx]), str(list8[idx])])
def buildURL(location, day, month, year):
if day < 10:
strDay = '0'+str(day)
else:
strDay = str(day)
baseURL = "http://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=" + location.zip + "&pands=" + location.city + "%2" + "C" + location.state + "&place=" + location.city + "&state=" + location.state + "&icao=" + location.code + "&country=us&month=" + str(month) + "&day=" + strDay + "&year=" + str(year) + "&dosubmit=Go"
return baseURL
def main():
loc1 = SiteLocation('Farmington','NM','87401','KFMN')
loc2 = SiteLocation('Whitesville','WV','25209','KBKW')
loc3 = SiteLocation('Rangely','CO','81648','KVEL')
loc4 = SiteLocation('Brookwood','AL','35444','KTCL')
loc5 = SiteLocation('Princeton','IN','47670','KAJG')
start = '2016/08/31'
end = datetime.date.today()
dateRange = list(daterange(start, end))
listDailyAve1 = []
listDailyAve2 = []
listDailyAve3 = []
listDailyAve4 = []
listDailyAve5 = []
listDays = []
listMonths = []
listYears = []
for idx in range(0,len(dateRange)):
strDate = str(dateRange[idx]).split("/")
year = strDate[0]
month = strDate[1]
day = strDate[2]
url1 = buildURL(loc1, day, month, year)
url2 = buildURL(loc2, day, month, year)
url3 = buildURL(loc3, day, month, year)
url4 = buildURL(loc4, day, month, year)
url5 = buildURL(loc5, day, month, year)
dailyAve1 = getDailyAve(url1)
dailyAve2 = getDailyAve(url2)
dailyAve3 = getDailyAve(url3)
dailyAve4 = getDailyAve(url4)
dailyAve5 = getDailyAve(url5)
listDailyAve1.append(dailyAve1)
listDailyAve2.append(dailyAve2)
listDailyAve3.append(dailyAve3)
listDailyAve4.append(dailyAve4)
listDailyAve5.append(dailyAve5)
listDays.append(day)
listMonths.append(month)
listYears.append(year)
writeToCsv(listDailyAve1, listDailyAve2, listDailyAve3, listDailyAve4,listDailyAve5, listDays, listMonths, listYears)
if __name__ == '__main__':
status = main()
sys.exit(status)
以下是抛出的异常:
Traceback (most recent call last):
File ".\weatherScrape2.py", line 147, in <module>
status = main()
File ".\weatherScrape2.py", line 128, in main
dailyAve1 = getDailyAve(url1)
File ".\weatherScrape2.py", line 61, in getDailyAve
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
IndexError: list index out of range
答案 0 :(得分:0)
首先,您需要处理没有可用数据时的情况。这是一种方式:
# handle "no data" case
if soup.find(text="Archive data not available for this date."):
print("Data not available, URL: '%s'." % url)
return None
另外,我认为获取rows
的逻辑存在问题。我这样做:
form = soup.find("form", {"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
以下是我执行的完整代码段(针对单个网址):
import requests
from bs4 import BeautifulSoup
from collections import Counter
import re
def getDailyAve(url):
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')
form = soup.find("form", {"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
time = []
temp = []
minutes = []
# handle no data case
if soup.find(text="Archive data not available for this date."):
print("Data not available, URL: '%s'" % url)
return None
# capture time and temps
for row in rows:
data = [td.text for td in row.find_all("td")]
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])", data[2])
if match:
temp.append(match.group())
time.append(data[0])
minutes.append(data[0][-4:-2])
common = Counter(minutes).most_common()[0][0]
finalTimes = []
finalTemps = []
for i in range(0, len(time)):
if minutes[i] == common:
finalTimes.append(time[i])
finalTemps.append(int(temp[i]))
dailyAve = sum(finalTemps) / float(len(finalTimes))
return dailyAve
print(getDailyAve("https://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=87401&pands=Farmington%2CNM&place=Farmington&state=NM&icao=KFMN&country=us&month=09&day=03&year=2016&dosubmit=Go"))