我用PhantomJS启动WebDriver:
def start_webdriver():
    """Start a PhantomJS WebDriver, retrying once without custom capabilities.

    The first attempt passes the externally defined ``dcap`` as
    ``desired_capabilities``. If that raises ``WebDriverException``, a second
    attempt is made with no explicit capabilities.

    Returns:
        The PhantomJS driver, or ``None`` if both attempts raise
        ``WebDriverException``.
    """
    # ``except X as e`` / ``print(...)`` are valid on Python 2.6+ and 3;
    # the old ``except X, e`` / ``print "..."`` forms only parse on Python 2.
    try:
        return webdriver.PhantomJS('./phantomjs', desired_capabilities=dcap)
    except WebDriverException:
        print("Unable to load profile, retrying")

    # Second and last attempt: same binary, no explicit capabilities.
    try:
        return webdriver.PhantomJS('./phantomjs')
    except WebDriverException:
        print("Unable to load profile (again), aborting")
        return None
def webdriver_retry(driver, url):
    """Open *url*, visit every link matching ``keyword`` and save matching pages.

    Links are located with the ``'text'`` XPath from
    ``XPATH_MAPPING_HYPERLINKS`` formatted with the externally defined
    ``keyword``. Each linked page whose source contains ``keyword`` is passed
    to ``save_html`` under the link's text.

    Args:
        driver: Selenium WebDriver used for all navigation.
        url: Page to scan; a protocol-relative URL (``//host/...``) is
            prefixed with ``http:``.

    Returns:
        None. If Selenium rejects the XPath (``InvalidSelectorException``),
        the driver is quit and the function returns early.
    """
    if url.startswith('//'):
        # Protocol-relative URL: add an explicit scheme before loading it.
        url = 'http:' + url
    driver.get(url)

    try:
        links = driver.find_elements_by_xpath(XPATH_MAPPING_HYPERLINKS['text'] % keyword)
    except InvalidSelectorException as e:
        print('{} - Selenium failed to perform XPath query to extract links. Killing Webdriver and moving to next URL'.format(url))
        print(e.msg)
        driver.quit()
        return
    print(links)

    # Read text and href from every element *before* navigating anywhere:
    # once the browser leaves this page the WebElement handles go stale, so
    # clicking the 2nd..Nth element would raise StaleElementReferenceException.
    titles_and_hrefs = [(link.text, link.get_attribute('href')) for link in links]
    print(titles_and_hrefs)

    for title, href in titles_and_hrefs:
        if not href:
            # Anchor without a target URL: nothing to navigate to.
            continue
        driver.get(href)
        page_source = driver.page_source
        if keyword in page_source:
            save_html(title, utf8_encode(page_source))
其中:
# XPath template for <a> elements whose whitespace-normalized text contains
# the substituted '%s'. translate() lower-cases A-Z in the link text, so the
# substituted keyword must itself be lower-case for the match to succeed.
_LINK_TEXT_CONTAINS = (
    "//a[contains(translate(normalize-space(), "
    "'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '%s')]"
)
XPATH_MAPPING_HYPERLINKS = {
    # Selects the matching <a> elements themselves; this is the key that
    # webdriver_retry() looks up, and element lookups need element results.
    'text': _LINK_TEXT_CONTAINS,
    # Selects the href attribute nodes of the same links.
    'href': _LINK_TEXT_CONTAINS + "/@href",
}
keyword = 'manager'
当网址为 http://mcand.co.uk/our-people/current-vacancies/ 时,XPath提取不返回任何内容(因为该页面上没有文本包含'manager'的超链接),但我在ghostdriver日志中看到以下内容:
[INFO - 2016-02-15T20:03:22.587Z] GhostDriver - Main - running on port 53138
[INFO - 2016-02-15T20:03:23.577Z] Session [2b080a10-d41f-11e5-8f67-6f1883fdd5c4] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36","webSecurityEnabled":true}
[INFO - 2016-02-15T20:03:23.577Z] Session [2b080a10-d41f-11e5-8f67-6f1883fdd5c4] - page.customHeaders: - {}
[INFO - 2016-02-15T20:03:23.577Z] Session [2b080a10-d41f-11e5-8f67-6f1883fdd5c4] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.0.0","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"mac-10.9 (Mavericks)-64bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36"}
[INFO - 2016-02-15T20:03:23.577Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 2b080a10-d41f-11e5-8f67-6f1883fdd5c4
[ERROR - 2016-02-15T20:03:59.224Z] Session [2b080a10-d41f-11e5-8f67-6f1883fdd5c4] - page.onError - msg: TypeError: undefined is not a constructor (evaluating '$(document)')
:262 in error
[ERROR - 2016-02-15T20:03:59.225Z] Session [2b080a10-d41f-11e5-8f67-6f1883fdd5c4] - page.onError - stack:
global code (http://mcand.co.uk/wp-content/themes/mcc/js/build/main.min.js:1)
:262 in error
[ERROR - 2016-02-15T20:05:30.672Z] Session [2b080a10-d41f-11e5-8f67-6f1883fdd5c4] - page.onError - msg: TypeError: undefined is not a constructor (evaluating '$(document)')
:262 in error
[ERROR - 2016-02-15T20:05:30.691Z] Session [2b080a10-d41f-11e5-8f67-6f1883fdd5c4] - page.onError - stack:
global code (http://mcand.co.uk/wp-content/themes/mcc/js/build/main.min.js:1)
:262 in error
这是PhantomJS / ghostdriver本身的问题吗?还是PhantomJS检测到了页面JavaScript中的错误?如果页面上确实存在文本包含“manager”的超链接,这个错误是否会导致PhantomJS无法正确执行XPath查询?
作为一名Python开发人员,我的Javascript知识几乎为零。