我试图从此网页中检索所有字段:
https://www.jobsbank.gov.sg/ICMSPortal/portlets/JobBankHandler/SearchDetail.do?id=JOB-2016-0008786
然而,当我运行代码时,它似乎连页面的HTML都没有取回。我想知道是什么原因导致无法获取该页面的HTML(我使用的是PhantomJS.exe无头浏览器,但这应该没有影响),这与iframe有关吗?如果是这样,我该如何使用Selenium来解决这个问题?
我曾使用Selenium配合PhantomJS成功抓取了http://www.indeed.com/resumes/-/in-Singapore,但同样的方法对上述网站似乎不起作用。请参阅下面的代码:
import re
import math
import time
import requests
from lxml import html
import selenium
from selenium import webdriver
def getPageEnd(url):
    """Return the page count derived from the search-result summary.

    Posts the quick-search form to *url*, reads the text of the highlighted
    span inside the ``searchetails`` div, takes the last integer found in
    that text, divides it by 100 and rounds up.

    Args:
        url: search endpoint that accepts the form POST.

    Returns:
        The rounded-up page count as an ``int``.

    Raises:
        IndexError: if the summary span is missing from the response or
            contains no digits.
    """
    payload = {
        "{actionForm.checkValidRequest}": "YES",
        "{actionForm.recordsPerPage}": "20",
        "{actionForm.sortBy}": "1",
        "{actionForm.searchType}": "Quick Search",
        "{actionForm.currentPageNumber}": "1",
    }
    r = requests.post(url, data=payload)
    tree = html.fromstring(r.text)
    summary = tree.xpath('//div[@class="searchetails"]/p/'
                         'span[@style="color: #b41b84;"]/text()')
    if not summary:
        # Same exception type the bare index lookup used to raise, but with
        # a message that says what was actually missing.
        raise IndexError("search summary span not found in response from %s"
                         % url)
    numbers = re.findall(r'\d+', summary[0])
    if not numbers:
        raise IndexError("no number found in search summary %r" % summary[0])
    # NOTE(review): the form asks for 20 records per page but the count is
    # divided by 100 -- confirm which page size this number is meant for.
    page_end = int(numbers[-1]) / 100.0
    return int(math.ceil(page_end))  # round up to include a partial page
def jobScrape(url, pagenum):
    """Return the job-detail links found on one search-result page.

    Posts the quick-search form for page *pagenum* to *url*, retrying every
    five seconds for as long as the request raises ``ConnectionError``, then
    collects the ``href`` of every anchor inside a ``jobDesActive`` cell.

    Args:
        url: search endpoint that accepts the form POST.
        pagenum: page number to request; formatted into the form as text.

    Returns:
        A list of link strings, each href prefixed with the site origin
        (hrefs are assumed to be site-relative paths -- TODO confirm).
    """
    # A bare, valid origin: the previous value carried a stray "__" in front
    # of the scheme, which made every returned link unusable as a URL.
    job_link_url_prepend = "https://www.jobsbank.gov.sg"
    payload = {
        "{actionForm.checkValidRequest}": "YES",
        "{actionForm.recordsPerPage}": "20",
        "{actionForm.sortBy}": "1",
        "{actionForm.searchType}": "Quick Search",
        "{actionForm.currentPageNumber}": "%s" % pagenum,
    }
    while True:
        try:
            r = requests.post(url, data=payload)
        except requests.exceptions.ConnectionError:
            print("Exception ConnectionError was caught, retrying requests...")
            time.sleep(5)
        else:
            break
    tree = html.fromstring(r.text)
    cur_page_job_links = [
        job_link_url_prepend + href
        for href in tree.xpath('//td[@class="jobDesActive"]/a/@href')
    ]
    print("Done scraping page %s" % pagenum)
    return cur_page_job_links
def main():
    """Scrape job links from the search results and print each job page.

    Starts a PhantomJS driver, collects the links for the first result page
    with ``jobScrape``, appends them to ``link.txt``, then loads every link
    in the browser and prints the page source. The driver is always shut
    down, even when a step fails.
    """
    driver = webdriver.PhantomJS(executable_path=r'E:\desktop\phantomjs.exe')
    try:
        driver.set_window_size(1120, 550)
        url = "https://www.jobsbank.gov.sg/ICMSPortal/portlets/JobBankHandler/SearchResult3.do"
        page_start = 1
        # NOTE(review): page_end is computed but the loop below is capped at
        # the first page; switch the bound to page_end + 1 to crawl them all.
        page_end = getPageEnd(url)
        for pagenum in range(page_start, 2):
            cur_page_job_links = jobScrape(url, str(pagenum))
            with open("link.txt", 'a') as f:
                for link in cur_page_job_links:
                    f.write("%s \n" % link)
            for link in cur_page_job_links:
                # The links already carry scheme and host, so nothing is
                # prepended; stray leading underscores are dropped so the
                # browser receives a well-formed URL. A separate name keeps
                # the search endpoint in `url` intact.
                detail_url = link.lstrip("_")
                driver.get(detail_url)
                htmltext = driver.page_source
                print(htmltext)
    finally:
        # Terminate the PhantomJS process instead of leaking it.
        driver.quit()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
我认为这可能与需要获取JobDescription框架(iframe)中的内容有关。如何使用类似window.document.getElementById("frameJobDescription")的命令来获得我想要的内容?