在过去的一两天里,我一直在研究这个Python脚本,当我使用Firefox webdriver时,一切正常,但当我转而使用像PhantomJS这样的无头浏览器时,它就失败了{ {1}}由于setNumber = parseSetNumber(setName[0])
为空而导致错误Error: list index out of range
。
setName
之前的行只在使用PhantomJS webdriver时没有返回任何内容,如果使用Firefox webdriver它会返回一个很好的值。
只有当我将webdriver从Firefox切换到PhantomJS时才会出现错误。我使用PhantomJS,因为脚本在linux服务器上运行。
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
更新
看起来这些只是不能与PhantomJS一起使用的两行,它们都返回一个空值。
import time
import os.path
import lxml.html as LH
import re
import sys
from selenium import webdriver
from random import randint
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
PARAMS = sys.argv
URL = PARAMS[1]
BASEURL = URL[:URL.rfind('/')+1]
# Parses the set name for the set number
def parseSetNumber(string):
string = string.split(' ')
stringLength = len(string)
string = string[(stringLength - 1)]
if string.replace('.','').isdigit():
return string
else:
return ""
# Returns set reference for this site
def parseRefId(string):
string = string.split('_')
return str(string[2])
try:
PAGE_NUMBER = 1
#--------------------------------------------------
## Get initial page
driver = webdriver.PhantomJS()
driver.get(PARAMS[1])
#--------------------------------------------------
## Get page count
# Give page time to load
time.sleep(2)
PAGE_RAW = driver.page_source
PAGE_RAW = LH.fromstring(PAGE_RAW)
PAGE_COUNT_RAW = PAGE_RAW.xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
PAGE_COUNT = len(PAGE_COUNT_RAW) - 2
#--------------------------------------------------
## Get page if its not page one
while PAGE_NUMBER <= PAGE_COUNT:
#--------------------------------------------------
## Create empty file
FILE_NAME = PARAMS[3] + 'json/' + time.strftime("%Y%m%d%H") + '_' + str(PARAMS[2]) + '_' + str(PAGE_NUMBER) + '.json'
#--------------------------------------------------
## Create JSON file if it doesnt exist
if os.path.exists(FILE_NAME)==False:
JSON_FILE = open(FILE_NAME, "a+", encoding="utf-8")
else:
JSON_FILE = open(FILE_NAME, "w", encoding="utf-8")
JSON_FILE.write("{")
#--------------------------------------------------
# Click page for next page if not page 1
if PAGE_NUMBER > 1:
index = 0
for atag in PAGE_COUNT_RAW:
if index == PAGE_NUMBER:
elements = driver.find_elements_by_xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
if elements:
element = elements[index].find_elements_by_xpath("./a")
if element:
element[0].click()
time.sleep(randint(3,5))
index += 1
#--------------------------------------------------
## Remove survey box if it pops up and log
try:
surveyBox = driver.find_element_by_link_text("No, thanks")
if surveyBox:
surveyBox.click()
print("Store[" + str(PARAMS[2]) + "]: Survey box found on page - " + str(PAGE_NUMBER))
except:
print("Store[" + str(PARAMS[2]) + "]: No survey box on page - " + str(PAGE_NUMBER))
#--------------------------------------------------
## Proces page
# If page is greater then 1 then get the page source of the new page.
if PAGE_NUMBER > 1:
PAGE_RAW = driver.page_source
PAGE_RAW = LH.fromstring(PAGE_RAW)
PAGE_RAW = PAGE_RAW.xpath("//div[contains(@class, 'estore_product_container')]")
index = 0
size = len(PAGE_RAW)
for atag in PAGE_RAW:
if PAGE_NUMBER > 1 and index == 0:
WebDriverWait(driver,10).until(EC.presence_of_element_located((By.XPATH, "./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a")))
setStore = PARAMS[2]
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
setNumber = parseSetNumber(setName[0])
setPrice = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/text()")
setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
setRef = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/@id")
if setRef:
setRef = parseRefId(setRef[0])
if re.search('[0-9\.]+', setPrice[0]) is not None:
JSON_FILE.write("\"" + str(index) + "\":{\"store\":\"" + str(setStore) + "\",\"name\":\"" + str(setName[0]) + "\",\"number\":\"" + str(setNumber) + "\",\"price\":\"" + re.search('[0-9\.]+', setPrice[0]).group() + "\",\"ref\":\"" + str(setRef) + "\",\"link\":\"" + str(setLink[0]) + "\"}")
if index+1 < size:
JSON_FILE.write(",")
index += 1
#--------------------------------------------------
## Close JSON file
JSON_FILE.write("}")
JSON_FILE.close()
#--------------------------------------------------
## Increment page number
PAGE_NUMBER += 1
#--------------------------------------------------
#--------------------------------------------------
## Close webdriver
driver.quit()
#--------------------------------------------------
except Exception as e:
print('Error: ' + str(e.args[0]))
# Remove gecodriver.log file
GHOSTDRIVER_FILE = str(PARAMS[3]) + 'jobs/ghostdriver.log'
if os.path.exists(GHOSTDRIVER_FILE)==True:
os.remove(GHOSTDRIVER_FILE)
答案 0 :(得分:0)
好的,看起来我已经解决了这个问题,我在使用PhantomJS时必须为webdriver添加set_windows_size
选项。
最初:
driver = webdriver.PhantomJS()
driver.get(PARAMS[1])
解决方案:
driver = webdriver.PhantomJS()
driver.set_window_size(1024, 768)
driver.get(PARAMS[1])
现在,PhantomJS webdriver的运行方式与Firefox webdriver的工作方式相同。