我正在尝试从网站上抓取 class 为 "credit-list linelist" 的 div 中的数据。
更新后的代码如下：
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import csv
import re
import json
import sys
import psycopg2
from pyvirtualdisplay import Display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
def resultSetGetter(databasename, ip, username, pw):
    """Fetch distinct (primary_city, state_name) rows for the configured states.

    Opens a PostgreSQL connection on port 5432, runs a parameterized query
    joining ``loc_zip_code_details`` with ``loc_state``, prints the rows and
    returns them.  The connection is always closed, on success and on failure.

    Args:
        databasename: Name of the database to connect to.
        ip: Host of the PostgreSQL server.
        username: Database user.
        pw: Password for ``username``.

    Returns:
        The list of row tuples produced by ``cursor.fetchall()``, or an empty
        list when psycopg2 reports an error (the error is printed).
    """
    States = ['Nevada']
    # NOTE: the query ends with "limit 1", so at most one row comes back.
    queryString = "select distinct(loc_zip_code_details.primary_city),state_name from loc_zip_code_details,loc_state where loc_zip_code_details.state_code = loc_state.state_code and loc_state.state_name IN (%s) ORDER BY state_name,loc_zip_code_details.primary_city limit 1"
    # Expand the single %s into one driver placeholder per state so that the
    # state names themselves are passed as bound parameters, never inlined.
    in_p = ', '.join('%s' for _ in States)
    sql = queryString % in_p

    con = None
    try:
        con = psycopg2.connect(database=databasename, user=username, password=pw, host=ip, port=5432)
        cur = con.cursor()
        cur.execute(sql, States)
        zipCodes = cur.fetchall()
        print(zipCodes)
        return zipCodes
    except psycopg2.Error as leed_Error:
        print(leed_Error)
        # Return an empty result instead of an implicit None so callers can
        # iterate the return value unconditionally.
        return []
    finally:
        # Previously the close() call sat after the return and never ran.
        if con is not None:
            con.close()
def flatten(x):
    """Return a flat list with the leaves of an arbitrarily nested iterable.

    Any element that exposes ``__iter__`` is expanded recursively; text and
    byte strings are treated as leaves and kept whole.

    Args:
        x: An iterable whose elements may themselves be iterables.

    Returns:
        A new list containing the non-iterable (and string) elements of ``x``
        in depth-first, left-to-right order.
    """
    # ``basestring`` only exists on Python 2.  On Python 3 the name lookup
    # fails and ``str`` gains ``__iter__``, so fall back to an explicit tuple
    # of string types to keep strings from being split into characters.
    try:
        string_types = basestring  # noqa: F821 (Python 2 only)
    except NameError:
        string_types = (str, bytes)

    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, string_types):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result
def _digits_to_int(text):
    """Return the integer formed by all digit characters found in ``text``."""
    return int(''.join(ch for ch in text if ch.isdigit()))


def leed_data(zipCodes):
    """Scrape the credit list from usgbc.org/rpc for each location and save it.

    For every entry of ``zipCodes`` a Chrome session is started (using the
    module-level ``chrome_path``), the search form is filled in with the
    location, and each ``<li>`` of the result list is parsed into one row.
    The rows of every location are printed and written to
    ``LEED_NEVADA1.csv``: the first location starts the file afresh and each
    later location is appended, so the file ends up holding all locations.

    Args:
        zipCodes: Iterable of ``"City,State"`` strings; each one is typed into
            the address box and then split on ``,`` to fill the city and
            state columns.

    Returns:
        None.
    """
    # Fixed column order so the CSV layout does not depend on dict ordering.
    columns = ['col1', 'col2', 'col3', 'col4', 'col5']

    for index, i in enumerate(zipCodes):
        driver = webdriver.Chrome(chrome_path)
        try:
            time.sleep(3)
            driver.get("http://www.usgbc.org/rpc")
            driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[1]').click()
            time.sleep(3)
            driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[1]/ul/li[10]').click()
            time.sleep(3)
            driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[2]').click()
            time.sleep(1)
            driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[2]/ul/li[2]').click()
            time.sleep(1)
            driver.find_element_by_xpath('//*[@id="edit-address"]').clear()
            driver.find_element_by_xpath('//*[@id="edit-address"]').send_keys(i)
            time.sleep(3)
            driver.find_element_by_xpath('//*[@id="geocode"]/div/div[1]/div[2]').click()
            time.sleep(3)
            driver.find_element_by_xpath('//*[@id="search-text"]').click()
            time.sleep(3)

            html_list = driver.find_element_by_xpath('//*[@id="mainCol"]/div[5]/ul')
            items = html_list.find_elements_by_tag_name("li")

            # Split the location directly rather than appending it to the
            # scraped lines and re-reading it by position, which only worked
            # when the <li> text had exactly three lines.
            location = i.split(',')
            city, state = location[0], location[1]

            rows = []
            for item in items:
                txt = item.text.split('\n')
                # txt[0] is used as the label; txt[1] and txt[2] are reduced
                # to the digits they contain.
                row = (_digits_to_int(txt[2]), state, _digits_to_int(txt[1]), txt[0], city)
                print(row)
                rows.append(row)

            df = pd.DataFrame(rows, columns=columns)
            print(df)
        finally:
            # quit() also ends the chromedriver process, and running it in
            # ``finally`` keeps a failed lookup from leaking a browser.
            driver.quit()

        # Overwrite once per run, then append, so earlier locations are not
        # lost when the next location is written.
        df.to_csv('LEED_NEVADA1.csv', mode='w' if index == 0 else 'a',
                  index=False, header=False)
if __name__ == '__main__':
    # Connection settings; fill these in before running.
    databasename = ""
    ip = ""
    username = ""
    pw = ""

    # Fall back to an empty list when the lookup yields nothing usable (for
    # example None after a database error) so the comprehension below does
    # not fail with a TypeError.
    zipCodes = resultSetGetter(databasename, ip, username, pw) or []
    # Turn each row into "field,field" with spaces removed from every field.
    zipCodes = [','.join(part.replace(' ', '') for part in y) for y in zipCodes]

    # NOTE(review): these options are built but not handed to any driver in
    # the code visible here -- confirm whether they should be passed on.
    chrome_options = Options()
    chrome_options.add_experimental_option('prefs', {
        'credentials_enable_service': False,
        'profile': {
            'password_manager_enabled': False
        }
    })

    # Read as a module-level name by leed_data() when it starts Chrome.
    chrome_path = r"/usr/bin/chromedriver"
    finalList = leed_data(zipCodes)
我的输入来自数据库（CSV 形式），例如：
"Littlefield,Arizona" "Gilbert,Arizona"
我的输出如下：
0 20 Nevada 30 Annual energy use Alamo
1 3 Nevada 5 Efficient hot water distribution system Alamo
2 1 Nevada 1 No environmental tobacco smoke Alamo
3 2 Nevada 3 Compact development Alamo
4 2 Nevada 3 Construction waste management Alamo
5 7 Nevada 12 Total water use Alamo
6 4 Nevada 6 Indoor water use Alamo
0 20 Nevada 30 Annual energy use AmargosaValley
1 1 Nevada 1 No environmental tobacco smoke AmargosaValley
2 2 Nevada 3 Compact development AmargosaValley
3 1 Nevada 2 Community resources AmargosaValley
4 2 Nevada 3 Construction waste management AmargosaValley
5 7 Nevada 12 Total water use AmargosaValley
6 4 Nevada 6 Indoor water use AmargosaValley
我想把这些输出追加到同一个 CSV 文件中。
答案 0 :(得分:0)
您可以使用 regex + pandas DataFrame，或者 string-digit-check（逐字符判断是否为数字）+ pandas DataFrame：
%%timeit
# Collect the text lines of every <li> found under the result list.
credit_ul = driver.find_element_by_xpath('//*[@id="mainCol"]/div[5]/ul')
entries = [li.text.split('\n') for li in credit_ul.find_elements_by_tag_name("li")]

# Alternative (disabled): precompile a regex and search each line with it.
# pattern = re.compile(r'\d+')
# re.search(pattern, lines[2]).group(0)
# re.search(pattern, lines[1]).group(0)


def digits_of(text):
    """Return the integer made of every digit character in ``text``."""
    return int(''.join(ch for ch in text if ch.isdigit()))


# col1/col3 come from the third/second text line, col4 is the first line;
# col2 is a constant that pandas repeats for every row.
df = pd.DataFrame({
    'col1': [digits_of(lines[2]) for lines in entries],
    'col2': 'Littlefield Arizona',
    'col3': [digits_of(lines[1]) for lines in entries],
    'col4': [lines[0] for lines in entries],
})
col1 col2 col3 col4
0 10 Littlefield Arizona 30 Annual energy use
1 2 Littlefield Arizona 3 Compact development
2 1 Littlefield Arizona 2 Access to transit
3 1 Littlefield Arizona 2 Heat island reduction
4 2 Littlefield Arizona 3 Rainwater management
5 6 Littlefield Arizona 12 Total water use
6 4 Littlefield Arizona 6 Indoor water use
114 ms ± 3.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
我把另一种方法注释掉了：先预编译正则表达式，再用它进行搜索。