I'm struggling to convert an HTML table into a dataframe. I want to write the table to a CSV file.
from requests import session
import sys
import csv
from bs4 import BeautifulSoup
c = session()
outfile = open("Weather2017.csv", 'wb')
response = c.get('http://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA')
soup = BeautifulSoup(response.text, "html.parser")
soup = soup.find(id="obsTable").text.replace('\n','',1)
outfile.write(soup.replace('\n',',London\n'))
The TypeError is as follows:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-1e149d844e15> in <module>()
11 soup = BeautifulSoup(response.text, "html.parser")
12 soup = soup.find(id="obsTable").text.replace('\n','',1)
---> 13 outfile.write(soup.replace('\n',',London\n'))
14
15
TypeError: a bytes-like object is required, not 'str'
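The TypeError occurs because the file was opened in binary mode ('wb'), so outfile.write() expects bytes, while replace() returns a str. A minimal sketch of one possible fix is to open the file in text mode instead, keeping the rest of the script unchanged:

outfile = open("Weather2017.csv", 'w', encoding='utf-8')  # text mode: write() accepts str
outfile.write(soup.replace('\n', ',London\n'))
outfile.close()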
This is the table I want to convert to a CSV file.
Can anyone help me?
Thanks in advance!
Answer 0 (score: 0)
How about this:
from requests import session
import csv
from bs4 import BeautifulSoup

c = session()
response = c.get('http://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA')
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find(id="obsTable")

# Header cells first, then the data cells row by row
headers = [header.text.strip() for header in table.find_all('th')]
rows = []
for row in table.find_all('tr'):
    rows.append([val.text.strip() for val in row.find_all('td')])
del rows[0]  # Remove header row. Added as empty.

# Open in text mode with newline='' for Python 3; binary mode ('wb')
# is what raised the TypeError in your code
with open('Weather2017.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(row for row in rows if row)
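To sanity-check the output, the file can be read straight back; a quick check, assuming pandas is available:

import pandas as pd

check = pd.read_csv('Weather2017.csv')
print(check.head())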
Answer 1 (score: 0)
What causes the problem in your code when BeautifulSoup() is applied are these tags: &lt;tbody&gt;, &lt;/tbody&gt;, &lt;thead&gt;, &lt;/thead&gt;. If you get rid of them, everything will work fine!
Here is a solution using pandas, regex, and a few other libraries :)
# needed imports
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

# get the page's HTML code
url = 'https://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA'
req = requests.get(url)
html = req.text
soup = BeautifulSoup(html, 'html.parser')

# remove the tags that cause problems, using the re library
patterns = ['<tbody>', '</tbody>', '<thead>', '</thead>']
cleaned_html = soup.prettify()
for pat in patterns:
    cleaned_html = re.sub(pat, '', cleaned_html)

df = pd.read_html(cleaned_html, attrs={'id': 'obsTable'})[0]
df.head()
# build hierarchical (two-level) columns for the dataframe
df.columns = [['2017',
               'Temp. (°C)', 'Temp. (°C)', 'Temp. (°C)',
               'Dew Point (°C)', 'Dew Point (°C)', 'Dew Point (°C)',
               'Humidity (%)', 'Humidity (%)', 'Humidity (%)',
               'Sea Level Press. (hPa)', 'Sea Level Press. (hPa)', 'Sea Level Press. (hPa)',
               'Visibility (km)', 'Visibility (km)', 'Visibility (km)',
               'Wind (km/h)', 'Wind (km/h)', 'Wind (km/h)',
               'Precip. (mm)', 'Events'],
              ['Jan',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'high',
               'sum',
               'nan']]
df.head()
# remove the first two unneeded rows
df = df.drop([0, 1], axis=0)
df.reset_index(inplace=True, drop=True)
df.head()

# save the result to a CSV file
df.to_csv('weather.csv')
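Note that to_csv() writes the two column levels as two header lines plus the index column, so reading the file back needs to account for both; a short sketch, assuming the weather.csv produced above:

import pandas as pd

# rebuild the two-level header and the saved index column on the way back in
df = pd.read_csv('weather.csv', header=[0, 1], index_col=0)
print(df.head())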