I have the following URL:
http://water.weather.gov/ahps2/crests.php?wfo=lch&gage=bsll1&crest_type=historic
and would like to convert it to CSV or any other tabular format.
Answer 0 (score: 2)
The process of extracting data from a website is called web scraping. This code can help you:
from bs4 import BeautifulSoup
import urllib2
url = 'http://water.weather.gov/ahps2/crests.php?wfo=lch&gage=bsll1&crest_type=historic'
#read html page using urlopen() method
r = urllib2.urlopen(url).read()
#create soup to navigate through tags
soup = BeautifulSoup(r, 'lxml')
#find the data inside the div mark, under the water_information class tag
results = soup.find('div', {'class':'water_information'})
#get only text from the results soup
water_data = results.text
#write this info to an output file
with open('outputfile.txt', 'w') as f:
    f.write(water_data)
Here is a sample of the content of my outputfile.txt:
Historic Crests
(1) 34.39 ft on 05/20/1953
(2) 31.74 ft on 02/07/1955
(3) 31.08 ft on 08/11/1940
(4) 30.65 ft on 11/01/1985
(5) 29.59 ft on 04/14/1995
(6) 26.99 ft on 07/04/1989
(7) 26.46 ft on 09/23/1979
(8) 26.22 ft on 12/30/1982
(9) 26.10 ft on 10/31/2002
(10) 26.06 ft on 01/13/2013
Now you can easily use regex and split() on the water_data string to create your own CSV file. You didn't expect me to write everything for you, did you?
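For example, here is a minimal sketch of that last step (the regex pattern and the crests.csv filename are my own assumptions, based on the sample output above):

import csv
import re

# assumed record format: "(1) 34.39 ft on 05/20/1953"
pattern = re.compile(r'\((\d+)\)\s+([\d.]+)\s+ft\s+on\s+(\d{2}/\d{2}/\d{4})')

with open('outputfile.txt') as f, open('crests.csv', 'w') as out:
    writer = csv.writer(out)
    writer.writerow(['rank', 'crest_ft', 'date'])
    for line in f:
        match = pattern.search(line)
        if match:
            writer.writerow(match.groups())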
Answer 1 (score: 0)
Using requests and lxml:
import requests
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import string
# download response
response = requests.get('http://water.weather.gov/ahps2/crests.php?wfo=lch&gage=bsll1&crest_type=historic')
html = response.text
You now have the raw HTML text. Next you need to strip the tags. Here we use lxml, a Python library for processing HTML/XML text. The fromstring() function parses a string into an Element.
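As a quick illustration of what fromstring() returns (the markup here is made up):

from lxml.html import fromstring

# parse a snippet into an Element and inspect it
element = fromstring('<div><p>hello</p></div>')
print(element.tag)             # div
print(element.text_content())  # hello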
#clean up
doc = fromstring(html)
tags = ['h1','h2','h3','h4','h5','h6',
'div', 'span',
'img', 'area', 'map']
args = {'meta':False, 'safe_attrs_only':False, 'page_structure':False,
'scripts':True, 'style':True, 'links':True, 'remove_tags':tags}
These are the tags we want removed. The Cleaner class cleans an HTML document of the offending tags, so we create a cleaner object, passing in the class variables to set (along with the tags to blacklist). See the lxml Cleaner class documentation for the default setting of each attribute. Note that remove_tags only strips the tags themselves; it does not remove their content.
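A tiny sketch of that remove_tags behavior (the markup is made up for illustration):

from lxml.html.clean import Cleaner

# the <span> tag is stripped but its text survives
demo = Cleaner(remove_tags=['span'])
print(demo.clean_html('<div>a <span>b</span> c</div>'))
# prints something like: <div>a b c</div>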
cleaner = Cleaner(**args)
path = '/html/body'
body = doc.xpath(path)[0] #only interested in the body of the response
clean_response = cleaner.clean_html(body).text_content() #clean!
# split into lines.
table = clean_response.splitlines()
#parse whichever way you wish to
#your code here
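For what it's worth, one hedged way to finish that step, reusing the same kind of pattern as in the first answer (the crests.csv filename is an assumption):

import csv
import re

# assumed record format: "(1) 34.39 ft on 05/20/1953"
crest = re.compile(r'\((\d+)\)\s+([\d.]+)\s+ft\s+on\s+(\d{2}/\d{2}/\d{4})')

with open('crests.csv', 'w') as out:
    writer = csv.writer(out)
    writer.writerow(['rank', 'crest_ft', 'date'])
    for line in table:
        match = crest.search(line)
        if match:
            writer.writerow(match.groups())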