I am trying to loop through all pages on Glassdoor and scrape data. The scraping happens inside a function, and since I only append to lists I do not want to return anything, so I used return None. However, it gives me the error stated in the title.
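For context, my understanding is that a helper which only appends to module-level lists can end with return None (or no return at all). A standalone sketch of the pattern I am using:

results = []

def collect(item):
    # Appending mutates the module-level list in place,
    # so nothing needs to be returned.
    results.append(item)
    return None  # equivalent to omitting the return statement entirely

collect("a")
print(results)  # ['a']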
my code:
from splinter import Browser
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

# lists to store scraped data
company = []
location = []
job_desc = []
position = []

# Initialize the browser to use Chrome and show its process.
executable_path = {'executable_path': "chromedriver.exe"}
browser = Browser('chrome', **executable_path, headless=False)

# Glassdoor url
url = "https://www.glassdoor.ca/index.htm"
def scrape_current_page():
    # Get the html of the current page
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    jobs = soup.find_all("li", class_="jl")
    for job in jobs:
        # Store all info into the lists
        position.append(job.find("div", class_="jobTitle").a.text)
        # ex: Tommy – Singapore
        comp_loc = job.find("div", class_="empLoc").div.text
        comp, loc = comp_loc.split("–")
        # print(comp)
        company.append(comp.strip())
        location.append(loc.strip())
        # ------------- Scrape job descriptions within a page -----------
        # The job description is in another html, therefore retrieve it
        # once again after clicking.
        browser.click_link_by_href(job.find("a", class_="jobLink")["href"])
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        job_desc.append(soup.find("div", class_="desc").text)
    return None
def scrape_all():
    # Grab new html, grab page control elements
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    result = soup.find("div", class_="pagingControls").ul
    pages = result.find_all("li")
    # Scrape the first page before going to the next
    scrape_current_page()
    for page in pages:
        # Run only if <a> exists, since un-clickable items have no <a>;
        # this skips < and pg1
        if page.a:
            # Within the <a> tag, click everything except the Next button
            if not page.find("li", class_="Next"):
                try:
                    # Click to go to the next page, then scrape it.
                    browser.click_link_by_href(page.a['href'])
                    # --------- call the scrape function here ---------
                    scrape_current_page()
                except:
                    print("This is the last page")
    # No need to return anything since we appended all data to the lists
    return None
@app.route("/")
def home():
return render_template("index.html")
@app.route("/scrape/<input>")
def test(input):
# With initialized browser, lets visit glassdoor website
browser.visit(url)
title, loc = input.split("!")
print(title, f'location = {loc}')
# Find where we should fill using splinter then fill it up
job_type = browser.find_by_id("KeywordSearch")
print(job_type)
job_type.fill(title)
location = browser.find_by_id("LocationSearch")
location.fill(loc)
# Clicking button
browser.find_by_id("HeroSearchButton").click()
scrape_all()
print(job_desc)
When I input values into the search bar and submit, it runs @app.route("/scrape/<input>"), which opens the browser and fills in the fields appropriately, then calls the scrape_all function to scrape the data.
Everything works fine: I can see Splinter going through each page and scraping data, but at the very end it fails with the error. I just want to append the data to the lists created at the beginning of the code. Any help?
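One thing I noticed while debugging (I am not sure this is the cause): the view function test() ends with print(job_desc) and never returns anything, and Flask requires every view to return a valid response. A minimal standalone sketch of that behavior, assuming the error in the title is Flask's "view function did not return a valid response" TypeError:

from flask import Flask

app = Flask(__name__)

results = []

def scrape():
    # A helper may freely end with return None; only views need a response.
    results.append("data")
    return None

@app.route("/bad")
def bad():
    scrape()
    # Ends without a return, so the view returns None and Flask raises
    # "The view function did not return a valid response" when this
    # route is requested.

@app.route("/good")
def good():
    scrape()
    # Returning any string (or render_template(...)) satisfies Flask.
    return f"Scraped {len(results)} items"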