我正在搭建一个站点:从多个来源提取数据,将其合并到一个数据库中,然后用这些数据计算各个团队各自的数值。我是 Python、peewee 和 Selenium 的新手。
以下是我的所有代码:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from models import *
import os # File I/O
import time
import shutil
import glob
import configparser
# --- Module-level configuration ---------------------------------------------

# Parse config.ini from the current working directory. ConfigParser.read()
# silently skips files it cannot find, so a missing file is not an error here.
config_parser = configparser.ConfigParser()
config_parser.read("config.ini")

# Three-letter month abbreviations, January through December.
months = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]

# Absolute path of the folder Chrome is told to download reports into.
download_dir = os.path.abspath('./downloads/')

# Supervisor IDs that fetch_data() submits to the report form one at a time.
ids = ['XXXXXX', 'DDDDDD', 'WWWWWW', 'AAAAAAA', 'VVVVVV', 'FFFFFFF']

# Snapshot of the .html files present in download_dir when this module loads.
filelist = glob.glob(download_dir + '/*.html')

# Relative path of the database folder; no active code visible here reads it.
dbpath = './db'
# Index of the first <tr> that holds charge data; rows before it are headings
# (row 18 is the column-header row).
_FIRST_DATA_ROW = 19

# A data row must have at least this many <td> cells, because the highest
# column index read below is 42 (the rate column).
_MIN_CELLS = 43


def _charge_rows(markup):
    """Return the stripped ``<td>`` texts of each data row in *markup*.

    Only the first ``<table>`` of the document is inspected. An empty list is
    returned when the document contains no table at all.
    """
    soup = BeautifulSoup(markup, 'html.parser')
    table = soup.find('table')
    if table is None:
        return []
    return [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')[_FIRST_DATA_ROW:]
    ]


def store_data(download_dir):
    """Load every report file found under *download_dir* into ``Charge``.

    The ``Charge`` table is created first, then each file is parsed as HTML
    and one ``Charge`` row is inserted per data row of its first table. The
    file name up to the first ``.`` is stored as the row's ``owner``.

    Args:
        download_dir: Directory that is walked recursively for report files.

    Raises:
        ValueError: If an hours or rate cell cannot be converted to a number.
        Exception: Whatever peewee raises, e.g. when the table already exists.
    """
    # NOTE: the database file is not removed beforehand, and safe=False makes
    # create_tables() raise if the Charge table already exists.
    database.connect()
    try:
        database.create_tables([Charge], safe=False)
        for root, _dirs, files in os.walk(download_dir):
            for file_name in files:
                print(file_name)
                # Join onto the directory being walked, not the top-level
                # argument, so files in sub-folders resolve correctly.
                path = os.path.abspath(os.path.join(root, file_name))
                with open(path, 'r') as html:
                    file_markup = html.read()
                if file_markup == '':
                    print('ERROR: File was not read')
                    continue
                print('Reading {0} into BS4'.format(file_name))
                rows = _charge_rows(file_markup)
                print('File parsed')
                owner = file_name.split('.')[0]
                # One transaction per file: faster than a commit per row, and
                # a failure part-way through does not leave half a file stored.
                with database.atomic():
                    for cells in rows:
                        if len(cells) < _MIN_CELLS:
                            print('Skipping row with {0} cells in {1}'.format(
                                len(cells), file_name))
                            continue
                        Charge.create(pmt_id=cells[1],
                                      prism_id=cells[2],
                                      owner=owner,
                                      date=cells[11],
                                      reg_hours=float(cells[17]),
                                      ot_hours=float(cells[18]),
                                      rate=int(cells[42]),
                                      resource=cells[14],
                                      pmt_status=cells[24],
                                      resource_status=cells[15])
    finally:
        database.close()
def load_home_page(driver):
    """Open the CMPM main page and click its three start-up inputs in order.

    Uses ``find_element(By.CSS_SELECTOR, ...)``, which exists in Selenium 3
    and 4, instead of ``find_element_by_css_selector``, which Selenium 4.3
    removed.

    Args:
        driver: A Selenium WebDriver instance.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If one of the
            inputs is not present when it is looked up.
    """
    driver.get('https://intra.att.com/cmpm/main.cfm')
    # Each input is located only after the previous one has been clicked.
    for selector in (
        'input[value="Show Options"]',
        'input[value="Enable Link"]',
        'input[name="successOK"]',
    ):
        driver.find_element(By.CSS_SELECTOR, selector).click()
def type_supervisor_id(driver, supervisor):
    """Replace the contents of the ``sattuid`` input with *supervisor*.

    Uses ``find_element(By.CSS_SELECTOR, ...)``, which exists in Selenium 3
    and 4, instead of the ``find_element_by_*`` helper removed in 4.3.

    Args:
        driver: A Selenium WebDriver instance showing the report form.
        supervisor: Text to type into the field.
    """
    field = driver.find_element(By.CSS_SELECTOR, 'input[name="sattuid"]')
    field.clear()
    field.send_keys(supervisor)
def select_date(driver, date):
    """Select an option of the ``startdate`` drop-down by its visible text.

    The previous version looped over all twelve months, shadowing *date*, and
    picked 'Jan 2018' on every pass. The selection is now made once.

    Args:
        driver: A Selenium WebDriver instance showing the report form.
        date: Visible text of the option to select, e.g. ``'Jan 2018'``. Any
            non-string value (the existing caller passes the ``months`` list)
            falls back to ``'Jan 2018'``, which is what was always selected.
    """
    label = date if isinstance(date, str) else 'Jan 2018'
    dropdown = Select(driver.find_element(By.NAME, 'startdate'))
    dropdown.select_by_visible_text(label)
def results_display(driver, results):
    """Select every entry of the ``DontDisplay`` list and click the right arrow.

    Args:
        driver: A Selenium WebDriver instance showing the report form.
        results: Unused; kept so existing callers keep working.
    """
    # Click the list first so the Ctrl+A below goes to it.
    driver.find_element(By.CSS_SELECTOR, 'select[name="DontDisplay"]').click()
    select_all = (
        ActionChains(driver)
        .key_down(Keys.CONTROL)
        .send_keys('a')
        .key_up(Keys.CONTROL)
    )
    select_all.perform()
    # NOTE(review): the arrow presumably moves the selected entries into the
    # displayed columns - confirm against the page.
    driver.find_element(
        By.CSS_SELECTOR, 'img[src="/cmpmrptstatic/images/right.jpg"]'
    ).click()
def _rename_downloaded_report(supervisor_id):
    """Rename each file under download_dir starting with 'XL' to '<id>.html'."""
    target = os.path.abspath(os.path.join(download_dir, supervisor_id + '.html'))
    for root, _dirs, files in os.walk(download_dir):
        for file_name in files:
            if file_name.startswith('XL'):
                print('Renaming {0} to {1}'.format(file_name, supervisor_id))
                # os.replace overwrites a leftover target from an earlier run;
                # os.rename raises on Windows when the target already exists.
                os.replace(os.path.abspath(os.path.join(root, file_name)), target)


def fetch_data():
    """Download one report per entry of ``ids`` into ``download_dir``.

    Opens Chrome with ``download_dir`` as its download folder, loads the CMPM
    home page, opens the Datamart report window and, for every supervisor ID,
    fills in and submits the form, then renames the downloaded ``XL*`` file
    to ``<id>.html``. The browser is always closed, even when a step fails.

    Raises:
        selenium.common.exceptions.TimeoutException: If an expected button
            does not appear within 60 seconds.
    """
    opts = webdriver.ChromeOptions()
    print('Download Directory: {0}'.format(download_dir))
    opts.add_experimental_option(
        'prefs', {'download.default_directory': download_dir})
    print('Opening Chrome')
    # `options=` replaces the deprecated `chrome_options=` keyword
    # (accepted since Selenium 3.8; the old spelling was later removed).
    driver = webdriver.Chrome(options=opts)
    try:
        print('Authenicating')
        load_home_page(driver)
        time.sleep(2)
        print('Load CMPM home')
        print('Opening CMPM Datamart reports')
        print('elem clicked')
        print('Attemting to switch to frame 0')
        driver.switch_to.frame('main')
        driver.find_element(By.CSS_SELECTOR, 'button[name="btndm"]').click()

        print('New window should be opening')
        wait_time = 60
        # Step through every handle so focus ends on the last one, which is
        # expected to be the newly opened window.
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
        print('Waiting for window to load, waiting {0} seconds'.format(wait_time))
        try:
            elem = WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'button[name="btnCDH006"]')))
        except TimeoutException:
            # Without this button the form below cannot be reached, so report
            # the failure and stop instead of carrying on blindly.
            print('Something went wrong')
            raise
        elem.click()
        print('Found elem: {0}'.format(elem))

        for supervisor_id in ids:
            print('Fililng out form for: ' + supervisor_id)
            type_supervisor_id(driver, supervisor_id)
            select_date(driver, months)
            # results_display() ignores its second argument.
            results_display(driver, None)
            driver.find_element(By.CSS_SELECTOR, 'button[name="btnSubmit"]').click()
            print('Sleeping for 5s')
            # Fixed pause to let the browser finish writing the download.
            time.sleep(5)
            _rename_downloaded_report(supervisor_id)
            print('Waiting for window to load, waiting {0} seconds'.format(wait_time))
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'button[name="btnSubmit"]')))
        print('All Data from CMPM has been downloaded')
    finally:
        driver.quit()
# TODO: After we execute all our loading code, be sure to delete downloads/
def clr_dir(directory=None):
    """Delete every ``*.html`` file directly inside the download folder.

    The files are listed when the function is called rather than taken from
    the import-time ``filelist`` snapshot, so reports downloaded during this
    run are removed too.

    Args:
        directory: Folder to clean. Defaults to the module-level
            ``download_dir`` when omitted.
    """
    target = download_dir if directory is None else directory
    # glob.escape keeps special characters in the folder name literal.
    html_files = glob.glob(os.path.join(glob.escape(target), '*.html'))
    if not html_files:
        print(target + ' is empty')
        return
    print('download_dir is not empty! Will now attempt to delete all files')
    for path in html_files:
        os.remove(path)
    print('All files have been removed from ' + target)
#clr_dir() --- WORKS ---
#fetch_data() --- WORKS ---
# Read from the same absolute folder the reports are downloaded into. The
# former literal '/downloads' named a folder at the filesystem root, which
# os.walk() skips silently when it does not exist.
store_data(download_dir)
现在这是我从控制台得到的错误:
C:\Users\daeyiele\Documents\NetBeansProjects\BudgetHome>python cmpm.py
Traceback (most recent call last):
  File "cmpm.py", line 165, in <module>
    store_data('/downloads')
  File "cmpm.py", line 36, in store_data
    database.connect()
  File "C:\Users\daeyiele\AppData\Local\Programs\Python\Python36-32\lib\site-packages\peewee.py", line 2439, in connect
    self._state.set_connection(self._connect())
  File "C:\Users\daeyiele\AppData\Local\Programs\Python\Python36-32\lib\site-packages\peewee.py", line 2666, in _connect
    **self.connect_params)
TypeError: 'threadlocals' is an invalid keyword argument for this function
C:\Users\daeyiele\Documents\NetBeansProjects\BudgetHome
有什么想法吗?
答案 0 :(得分:0)
从数据库类的定义中删除 `threadlocals` 参数。Peewee 3.x 已不再支持它,回溯最后一行的 TypeError 指出的正是这一点。