使用python Selenium提取数据

时间:2017-09-01 05:13:16

标签: python-3.x selenium

大家好,我已经编写了一个自动化脚本,用于从下面代码中的网站提取内容。

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from datetime import datetime
import dateutil.parser
import urllib2
import requests
import sys
import re, clr
import csv
import pandas as pd
from selenium import webdriver
from ast import literal_eval
import shorten

# Drive a Chrome session through the lookup form on the site, then re-request
# the resulting URL with `requests` and parse that response with BeautifulSoup.
chrome_path = r"/usr/bin/chromedriver"
driver = webdriver.Chrome(chrome_path)
driver.get("http://snowload.atcouncil.org/")  # load the site's landing page
driver.find_element_by_xpath(
    """//*[@id="adminForm"]/fieldset/div/div[2]/div[2]/label""").click()  # click a label inside #adminForm (the radio button, per the original author)
driver.find_element_by_xpath("""//*[@id="coordinate_address"]""").click()  # focus the #coordinate_address text box
cities = [' Aguila,Arizona']  # addresses to look up
for city in cities:
    # Type the address into the text box, then press the form's button.
    # NOTE(review): the text box is not cleared between iterations, so with
    # more than one city the typed text would accumulate - confirm intent.
    driver.find_element_by_xpath('//*[@id="coordinate_address"]').send_keys(city)  # type the address
    driver.find_element_by_xpath('//*[@id="adminForm"]/fieldset/div/div[2]/button').click()
# Read after the loop has finished, so only the URL the browser shows after
# the last submitted city is used below.
x = driver.current_url
# Form payload for the POST. The whole `cities` list (not a single city) is
# sent as 'coordinate_address'.
# NOTE(review): presumably 'optionCoordinate': '2' mirrors the option selected
# in the browser above - verify against the form's HTML.
Data = {'optionCoordinate': '2', 'coordinate_address': cities}
page = requests.post(x, data=Data)
soup = BeautifulSoup(page.content, 'html.parser')
# First <div class="span5"> in the response; soup.find returns None when no
# such element exists.
div = soup.find('div', attrs={'class': 'span5'})
data = []  # placeholder; reassigned for every <li> in the parsing loop further down
new_list = []  # module-level list; remove_from_list builds its own local `new_list`

def remove_from_list(l, x):
    """Return a new list with every occurrence of ``x`` removed from each string in ``l``.

    The input list is left untouched; each element is processed with
    ``str.replace(x, '')``.
    """
    cleaned = []
    for entry in l:
        cleaned.append(entry.replace(x, ''))
    return cleaned

def flatten(iterable):
    """Expand string-encoded lists found in ``iterable`` into one flat list.

    Every item is handed to ``literal_eval``. When the result is a list, that
    list is flattened recursively and its elements are spliced into the
    output at the item's position. Any other item - one that is not a valid
    Python literal, or whose literal value is not a list - is kept unchanged.

    Args:
        iterable: The items to process.

    Returns:
        A new list with the expanded items, in their original order.
    """
    result = []

    for item in iterable:
        try:
            item_eval = literal_eval(item)
        except (ValueError, SyntaxError, TypeError):
            # Not a usable literal. TypeError is included because
            # literal_eval raises it for inputs such as "{[1]: 2}"
            # (unhashable dict key / set element), which previously crashed.
            result.append(item)
            continue

        if isinstance(item_eval, list):
            result.extend(flatten(item_eval))
        else:
            # A valid literal that is not a list (number, tuple, ...):
            # keep the original item, not its evaluated value.
            result.append(item)

    return result


def shorten(s, subs):
    """Return ``s`` cut off right after the first occurrence of ``subs``.

    Raises:
        ValueError: If ``subs`` does not occur in ``s`` (from ``str.index``).
    """
    # End position of the first match; everything after it is discarded.
    cut = s.index(subs) + len(subs)
    return s[:cut]


# Turn the first six <li> entries of the result panel into a flat list of
# strings, normalise a few fields, and write the list as one CSV row.
csvList = []

try:
    for li in div.find_all('li', limit=6):
        # Drop every non-ASCII character, then split the text on ':'.
        extracted_data = li.get_text().encode("ascii", "ignore")
        data = extracted_data.split(":")
        for junk in (u'\xa0', u'\r', u'\n'):
            data = remove_from_list(data, junk)
        # Compare by value: `is` tests object identity and only happens to
        # work for small integers in CPython.
        if len(data) == 2:
            print(str(data[1]))
            csvList.append(str(data[1]))
        else:
            # Any other field count: keep fields 1-2 as the repr of a list;
            # flatten() below expands that repr back into separate entries.
            print(str(data[1:3]))
            csvList.append(str(data[1:3]))

    csvList = flatten(csvList)
    csvList = [str(value) for value in csvList]

    # Field 0 is parsed as e.g. "September 01, 2017" and rewritten as dd/mm/yy.
    datetime_object = datetime.strptime(csvList[0], '%B %d, %Y').strftime('%d/%m/%y')
    print(datetime_object)
    csvList[0] = datetime_object

    # Keep only the digits of fields 4 and 6. This also removes a decimal
    # point, so "2170.4" becomes "21704".
    csvList[4] = re.sub(r"\D", "", csvList[4])
    csvList[6] = re.sub(r"\D", "", csvList[6])
    # Cut field 5 right after the first "Load"; raises ValueError if absent.
    csvList[5] = shorten(csvList[5], 'Load')

    print(csvList)
    with open('/home/priyanka/Desktop/output.csv', 'w') as out_file:
        wr = csv.writer(out_file, quoting=csv.QUOTE_ALL)
        wr.writerow(csvList)
finally:
    # Always release the browser window, even when parsing fails part-way.
    driver.close()

这段代码可以提取内容。但我需要的是:根据提取到的"海拔:2170.4 英尺",在下面这三条内容中只取对应的一条,即"海拔 ≤ 3,000 英尺:地面雪荷载为 0 psf"。

海拔高度≤3,000英尺:地面雪荷载为0 psf

海拔> 3,000和≤4,500英尺:地面雪荷载为5 psf

海拔> 4,500和≤5,400英尺:地面雪荷载为10 psf

也就是说,应当把提取到的海拔(Elevation)值与上述各条目中的海拔范围(英尺)进行比较,并取出对应的那一条内容。

最后输出应该看起来像

['01/09/17', 'Aguila,Arizona', '33.9428069', ' -113.1740805', '21704', 'ASCE 7* Ground Snow Load', '0']

请帮帮我们。在此先感谢。

0 个答案:

没有答案