我正在尝试检索当地天气站点的日温度。 我使用BeautifulSoup构建了这个循环。 不幸的是,循环在第一轮之后就中断了。
这是我的代码和结果: 代码:
#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# Output file zamg-data.txt: one comma-separated "date,value" line per day.
f = open('zamg-data.txt','w')

# Start Chrome through the locally installed chromedriver binary.
driver = webdriver.Chrome("/usr/local/bin/chromedriver")

# Walk through every month of 2019 and every day that month has.
for month in range(1, 13):
    # February is visited up to the 28th, the 30-day months up to the 30th,
    # and every other month up to the 31st.
    if month == 2:
        last_day = 28
    elif month in (4, 6, 9, 11):
        last_day = 30
    else:
        last_day = 31

    for day in range(1, last_day + 1):
        # The date placed in the URL is not zero padded (e.g. 2019-1-2).
        url_date = '-'.join(['2019', str(month), str(day)])
        print("call page of "+url_date)
        driver.get("https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-"+url_date)

        # Snapshot the DOM currently held by the browser and parse it.
        html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, "html.parser")

        # Extract the temperature: the second element carrying the class.
        # NOTE: nothing waits for the page to finish rendering, so this
        # raises IndexError when the snapshot holds fewer than two matches.
        data = soup.find_all(class_='u-txt--big')[1].string
        print(len(data))
        print(data + '...okay')

        # Zero-padded YYYYMMDD stamp used in the output file.
        file_stamp = '2019' + str(month).zfill(2) + str(day).zfill(2)

        # Write date and value as one line.
        f.write(file_stamp + ',' + data + '\n')

# All days processed - close the output file.
f.close()
我的结果:
➜ weather-app python get-data-02.py
call page of 2019-1-1
5
+3,9 ...okay
call page of 2019-1-2
Traceback (most recent call last):
File "get-data-02.py", line 37, in <module>
data = soup.find_all(class_='u-txt--big')[1].string
IndexError: list index out of range
➜ weather-app
我不明白这里出了什么问题。第二个页面已经在浏览器中加载出来了,但脚本随后就中断了。有什么想法吗?
答案 0 :(得分:0)
#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
import time
# Every calendar day of 2019 (365 days, 2019 is not a leap year).
base = datetime.date(2019, 1, 1)
date_list = [base + datetime.timedelta(days=x) for x in range(365)]

# CSS class of the elements holding the displayed values. The temperature is
# read from the SECOND such element, so at least two must be present before
# the page is parsed.
value_class = "u-txt--big"
required_elements = 2

# start webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
base_url = "https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-"

try:
    with open('zamg-data.txt', 'w') as out_file:
        for dt in date_list:
            timestamp = dt.strftime("%Y-%m-%d")
            print("call page of " + timestamp)
            driver.get(f"{base_url}{timestamp}")

            # presence_of_all_elements_located() is satisfied as soon as ONE
            # matching element exists, which still allows the IndexError on
            # [1] below. Wait explicitly until enough elements are there;
            # WebDriverWait raises TimeoutException after 40 seconds.
            WebDriverWait(driver, timeout=40).until(
                lambda drv: len(drv.find_elements(By.CLASS_NAME, value_class))
                >= required_elements
            )

            # extract temperature from a snapshot of the rendered DOM
            html = driver.execute_script("return document.documentElement.outerHTML")
            soup = BeautifulSoup(html, "html.parser")
            # get_text() equals .string for a plain text element, but does not
            # return None when the element contains nested tags.
            data = soup.find_all(class_=value_class)[1].get_text()
            print(len(data))
            print(data + '...okay')

            # write time and value
            # NOTE(review): the value uses a decimal comma (e.g. "+3,9"), so
            # each line holds two commas - keep that in mind when parsing.
            out_file.write(dt.strftime("%Y%m%d") + ',' + data + '\n')

            # Pause between page loads so the site is not flooded with
            # requests.
            time.sleep(3)
finally:
    # Always shut the browser down, even when a page fails or times out.
    driver.quit()

print("Done!!!")
正如评论中所提到的,您需要让浏览器等待,直到页面中检测到带有该类的元素之后再进行解析。另外,我在每次加载页面之后都加入了显式的延时,以免大量请求压垮网站——请求过于频繁也可能导致您的 IP 被封禁。最后,打开文件时最好始终使用上下文管理器。