我正试图从wunderground.com获取BeautifulSoup的4年小时数据。但是,当我使用此代码时,我遇到了一些问题。问题是每小时只显示一行。它分离3行(如附件)。当我尝试过滤数据时,很难研究它。我希望将每个数据放在一行中。你能帮帮我吗?
我的代码:
import urllib.request as urllib2, re, time
# importing parser
from bs4 import BeautifulSoup
f = open('weather.txt', 'w')
# Start and end year of simulation
for y in range(2009, 2013):
# Type the months that you want to extract
# For example for January and February use range(1,3)
for m in range(1, 13):
#checking for leap years
for d in range(1,32):
if y%400 == 0:
leap = True
elif y%100 == 0:
leap = False
elif y%4 == 0:
leap = True
else:
leap = False
if (m == 2 and leap and d > 29):
continue
elif (m == 2 and d > 28):
continue
elif (m in [4, 6, 9, 10] and d > 30):
continue
url = "http://www.wunderground.com/history/airport/LTBA/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html"
page = urllib2.urlopen(url)
#opening the website with Beautiful Soup
soup = BeautifulSoup(page)
# finding section with observation details
paragList = soup.findAll(id="observations_details")
counter = 0
counter_max = 0 # maximum number of columns
# adding a zero to one digit numbers
string = ''
if len(str(m)) < 2:
mStamp = '0' + str(m)
else:
mStamp = str(m)
if len(str(d)) < 2:
dStamp = '0' + str(d)
else:
dStamp = str(d)
# time stamp is four digit year, two digit month, and two digit day
timestamp = str(y) + mStamp + dStamp
print(timestamp)
for i in paragList:
# writing in text file the header with the name of each column
headList = i.findAll('th')
f.write('DATE,')
for k in headList:
h_element = k.text
s = str(h_element)
f.write(s)
f.write(',')
counter_max = counter_max + 1
f.write(' \n')
# writing in text file each row with data
tableList = i.findAll('tbody')
for l in tableList:
bodyList = l.findAll('td')
for j in bodyList:
if counter == 0:
f.write(timestamp + ',')
if j.string:
element = j.text
# print (element)
s = str(element)
f.write(s)
f.write(',')
else:
elementList = j.findAll(j.b) + j.findAll('b')
for k in elementList:
if k.string:
element = k.text
# print(element)
s = str(element)
f.write(s)
f.write(',')
counter = counter + 1
if counter == counter_max:
# print
# print("************************")
# print("**** NEXT RECORD *******")
# print("**** *******************")
f.write('\n')
counter = 0
# print ("\n")
# print("**** NEXT YEAR *******")
f.close()
示例输出: