I have a web scraper that lives inside a class:
import logging
import time

from random_user_agent.params import SoftwareName, OperatingSystem
from random_user_agent.user_agent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class Request:
    logger = logging.getLogger('django.project.requests')
    selenium_retries = 0

    def __init__(self, url):
        self.url = url

    def run_sel(self, class_name):
        try:
            software_names = [SoftwareName.CHROME.value]
            operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
            user_agent_rotator = UserAgent(software_names=software_names,
                                           operating_systems=operating_systems, limit=100)
            # set chrome_options
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            # chromedriver_path() is a helper defined elsewhere in my project
            chrome_path = chromedriver_path()
            driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
            # get the site
            driver.get(self.url)
            time_to_wait = 90
            try:
                WebDriverWait(driver, time_to_wait).until(
                    EC.presence_of_element_located((By.CLASS_NAME, class_name)))
            finally:
                driver.maximize_window()
            # scroll to the bottom
            old_position = 0
            new_position = None
            while new_position != old_position:
                # get old scroll position
                old_position = driver.execute_script(
                    ("return (window.pageYOffset !== undefined) ?"
                     " window.pageYOffset : (document.documentElement ||"
                     " document.body.parentNode || document.body);"))
                # sleep and scroll
                time.sleep(1)
                driver.execute_script((
                    "var scrollingElement = (document.scrollingElement ||"
                    " document.body);scrollingElement.scrollTop ="
                    " scrollingElement.scrollHeight;"))
                # get new position
                new_position = driver.execute_script(
                    ("return (window.pageYOffset !== undefined) ?"
                     " window.pageYOffset : (document.documentElement ||"
                     " document.body.parentNode || document.body);"))
            # construct dictionary --- {link: , name: , fields: }
            # save individual's links
            elements = driver.find_elements_by_css_selector('a._32mo')
            contents = driver.find_elements_by_css_selector('div._pac')
            link = []
            name = []
            field = []
            for i in elements:
                link.append(i.get_attribute('href'))
                name.append(i.get_attribute('text'))
            for e in contents:
                field.append(e.text)
            # build a list of dictionaries from the three lists
            output = [{'scraped_name': n, 'link': l, 'text_string': f}
                      for n, l, f in zip(name, link, field)]
            driver.close()
            # update_ind_scrape_status(i,ts,row_id)
            return output
        except (TimeoutException, WebDriverException):
            # self.logger.error(traceback.format_exc())
            time.sleep(6)
            # self.selenium_retries += 1
            # self.logger.info('Selenium retry # ' + str(self.selenium_retries))
            return self.run_sel(class_name)
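For reference, I instantiate and call it like this (the URL here is a placeholder; '_32mo' is the class the scraper waits for):

req = Request('https://www.example.com/profiles')  # placeholder URL
results = req.run_sel('_32mo')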
I have another function, defined outside the class, that pushes the scrape status to a Google BigQuery table:
def update_scrape_status(i, ts, s):
    # pushes scrape status to a Google BigQuery table
    # parameters:
    #   i  : scraped successfully, 1 or 0
    #   ts : timestamp scraped
    #   s  : the string parameter that is pushed
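For context, the body does a streaming insert along these lines. This is only a sketch: the table ID and column names are placeholders, and it assumes the google-cloud-bigquery client library.

from google.cloud import bigquery

def update_scrape_status(i, ts, s):
    # push one status row to BigQuery
    client = bigquery.Client()
    table_id = 'my_project.my_dataset.scrape_status'  # placeholder table ID
    # ts is serialized to a string so the row stays JSON-compatible
    rows = [{'scraped': i, 'scraped_at': str(ts), 'string_name': s}]
    errors = client.insert_rows_json(table_id, rows)
    if errors:
        raise RuntimeError('BigQuery insert failed: {}'.format(errors))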
My question is: how do I embed this function into the Request class? Also, I need to run it right below driver.close() in run_sel, but I'm a bit confused, because the s argument of update_scrape_status comes from a function defined outside the Request class. Should I embed that one into the Request class as well?
The parameter s is defined as:

s = read_joined_table()[0]['string_name']

def read_joined_table():
    # function that pulls sample data from Google BigQuery
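read_joined_table is just a thin query wrapper, roughly like this (same bigquery client as above; the query itself is a placeholder):

def read_joined_table():
    client = bigquery.Client()
    query = "SELECT string_name FROM `my_project.my_dataset.joined_table` LIMIT 100"
    return [dict(row) for row in client.query(query).result()]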
I'm not very familiar with OOP, but I want to embed the scrape-status function inside this class rather than converting the class back into plain functions.
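To make the question concrete: is something like this the right shape? I've stripped the scraping logic down to a placeholder, and @staticmethod is only my guess at how the function should be attached:

from datetime import datetime

class Request:
    def __init__(self, url):
        self.url = url

    @staticmethod
    def update_scrape_status(i, ts, s):
        # placeholder: would do the BigQuery push shown above
        print(i, ts, s)

    def run_sel(self, class_name):
        output = []  # placeholder for the real scraping logic
        # ... scraping happens here, ending with driver.close() ...
        s = read_joined_table()[0]['string_name']
        self.update_scrape_status(1, datetime.now(), s)
        return output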