如何在 Python 中将 .php 网页的 HTML 转换为 CSV

时间:2016-11-21 20:58:48

标签: python python-3.x

我有以下网址

http://water.weather.gov/ahps2/crests.php?wfo=lch&gage=bsll1&crest_type=historic

并希望将其转换为csv或任何表格形式

2 个答案:

答案 0 :(得分:2)

从网站提取数据的过程称为webscraping

这段代码可以帮助您:

from graphics import *


def isBetween(x, end1, end2):
    """Return True if x lies in the closed interval spanned by end1 and end2 (in either order)."""
    low, high = min(end1, end2), max(end1, end2)
    return low <= x <= high

def isInside(point, startImage):
    """Return True if *point* falls within the bounding box of *startImage*.

    The box is centered on the image's anchor point and spans the image's
    full width and height.
    """
    cx = startImage.getAnchor().getX()
    cy = startImage.getAnchor().getY()
    half_w = startImage.getWidth() / 2
    half_h = startImage.getHeight() / 2

    in_x = isBetween(point.getX(), cx - half_w, cx + half_w)
    in_y = isBetween(point.getY(), cy - half_h, cy + half_h)
    return in_x and in_y


def getChoice(event):
    """Mouse-click handler: start the stopwatch on the start button, stop it on the stop button."""
    global hour, minute, sec
    global running

    click = Point(round(event.x), round(event.y))

    if isInside(click, startImage):
        # Reset the counters and (re)start the tick loop.
        hour = minute = sec = 0
        running = True
        update_time()

    if isInside(click, stopImage):
        running = False


def update_time():
    """Advance the clock by one second, refresh the display, and reschedule itself while running."""
    global hour, minute, sec

    sec += 1
    if sec == 60:
        # Carry seconds into minutes, and minutes into hours.
        minute, sec = minute + 1, 0
        if minute == 60:
            hour, minute = hour + 1, 0

    timeText.setText('{}:{}:{}'.format(hour, minute, sec))

    if not running:
        # Stopped: blank the display instead of scheduling another tick.
        timeText.setText('')
    else:
        # Tick again in one second.
        win.after(1000, update_time)


def layout():
    """Build the stopwatch window: background, buttons, time display, and mouse hook."""
    global win
    global stopWatchImage
    global startImage
    global stopImage
    global lapImage
    global timeText

    win = GraphWin('Stopwatch', 600, 600)

    # Create the images, then draw them all in one pass.
    stopWatchImage = Image(Point(300, 300), "stopwatch.png")
    startImage = Image(Point(210, 170), "startbutton.png")
    stopImage = Image(Point(390, 170), "stopbutton.png")
    lapImage = Image(Point(300, 110), "lapbutton.png")
    for img in (stopWatchImage, startImage, stopImage, lapImage):
        img.draw(win)

    # Empty text object that update_time() fills with H:M:S.
    timeText = Text(Point(300, 260), '')
    timeText.setSize(30)
    timeText.draw(win)

    win.setMouseHandler(getChoice)

    # Block until a key is pressed so the window stays open.
    win.getKey()

# --- module state shared by the handlers above ---

# Window and widgets are created in layout(); None until then.
win = stopWatchImage = startImage = stopImage = lapImage = timeText = None

# True while the stopwatch is ticking.
running = False

# --- entry point ---

layout()

from bs4 import BeautifulSoup
import urllib2

url = 'http://water.weather.gov/ahps2/crests.php?wfo=lch&gage=bsll1&crest_type=historic'
#read html page using urlopen() method
r = urllib2.urlopen(url).read()
#create soup to navigate through tags
soup = BeautifulSoup(r, 'lxml')
#find the data inside the div mark, under the water_information class tag
results = soup.find('div', {'class':'water_information'})
#get only text from the results soup
water_data = results.text
#write this info to an output file
with open('outputfile.txt', 'w') as f:
    f.write(water_data)

这是我的 outputfile.txt 的内容示例：

outputfile.txt

Historic Crests
(1) 34.39 ft on 05/20/1953
(2) 31.74 ft on 02/07/1955
(3) 31.08 ft on 08/11/1940
(4) 30.65 ft on 11/01/1985
(5) 29.59 ft on 04/14/1995
(6) 26.99 ft on 07/04/1989
(7) 26.46 ft on 09/23/1979
(8) 26.22 ft on 12/30/1982
(9) 26.10 ft on 10/31/2002
(10) 26.06 ft on 01/13/2013

现在，您可以对 water_data 字符串使用 regex 或 split()，轻松创建自己的 CSV 文件。

你没想到我会把所有代码都替你写完，对吗？（提示：可以从 split() 开始。）

答案 1 :(得分:0)

使用requests和lxml:

import requests
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import string


# Fetch the crest page; requests decodes the body to text for us.
response = requests.get(
    'http://water.weather.gov/ahps2/crests.php'
    '?wfo=lch&gage=bsll1&crest_type=historic'
)
html = response.text

您现在拥有原始 HTML 文本，接下来需要删除其中的标签。这里我们使用 lxml —— 一个处理 HTML/XML 文本的 Python 库。fromstring() 函数用于将字符串解析为元素树。

# Parse the raw HTML text into an lxml element tree.
doc = fromstring(html)

# Tags to strip (tag only -- their text content is kept by remove_tags).
tags = [
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'div', 'span',
    'img', 'area', 'map',
]

# Keyword arguments for lxml's Cleaner; True means "remove that category".
args = {
    'meta': False,
    'safe_attrs_only': False,
    'page_structure': False,
    'scripts': True,
    'style': True,
    'links': True,
    'remove_tags': tags,
}

确定要删除的标记。Cleaner 类用于清除 HTML 文档中的违规标签——因此我们创建一个 Cleaner 对象，传入要列入黑名单的类别参数（以及要删除的标记列表）。各属性的默认设置请参阅 lxml 的 Cleaner 类文档。请注意，remove_tags 仅剥离标记本身，而不删除其内容。

# Build the cleaner from the blacklist configured above.
cleaner = Cleaner(**args)

# Only the <body> of the response is interesting.
path = '/html/body'
body = doc.xpath(path)[0]

# Strip markup and keep just the visible text.
clean_response = cleaner.clean_html(body).text_content()

# One list entry per line of visible text.
table = clean_response.splitlines()

#parse whichever way you wish to
#your code here