我正在尝试访问不同页面,在某些字段中插入名称/电子邮件,然后按一个按钮提交这些字段。
现在,我发现了一种使用webdriver在所有页面上匹配电子邮件/名称的方法,即使它们的html结构不同。我使用以下代码:
import logging
from selenium.common.exceptions import ErrorInResponseException, \
WebDriverException
from selenium.webdriver.common.keys import Keys
from pyvirtualdisplay import Display
from selenium import webdriver
import lxml.html
import urlparse
import time
import re
def subscribe(email, name):
display = Display(visible=0, size=(800, 600))
dom = lxml.html.parse('http://muncheye.com')
url = dom.docinfo.URL
driver = webdriver.Chrome()
failed_urls = []
i = 0
to_visit_urls = dom.xpath('//div[@id="right-column"]//a/@href')
print(len(to_visit_urls))
"""
Visit each url. Check to be alive. Search form.
"""
for link in to_visit_urls:
not_found = False
name_required = True
email_required = True
button_required = True
dom1 = lxml.html.parse(urlparse.urljoin(url, link))
submit_url = dom1.xpath(
'//div[@class="product_info"]//table//tr[7]//td[2]//a/@href')[0]
if re.match('https?://(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.['
'^\s]+\.[^\s]{2,}', submit_url):
time.sleep(10)
try:
driver.get(submit_url)
try:
name_box = driver.find_element_by_xpath(
"//input[@*[contains(translate(., "
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz'), 'name')]]")
name_box.click()
name_box.clear()
name_box.send_keys(email)
except Exception:
not_found = True
try:
email_box = driver.find_element_by_xpath(
"//input[@*[contains(translate(., "
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz'), 'email')]]")
email_box.click()
email_box.clear()
email_box.send_keys(email)
except Exception:
not_found = True
if not_found:
i += 1
print "here" + " = " + str(i) + " link = " + str(submit_url)
for element in driver.find_elements_by_xpath(
"//input[@type='text']"):
if name_required:
try:
name_box = element.find_element_by_xpath(
".[@*[contains(translate(., "
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz'), 'name')]]")
name_box.click()
name_box.clear()
name_box.send_keys(name)
name_required = False
continue
except Exception:
pass
if email_required:
try:
email_box = element.find_element_by_xpath(
".[@*[contains(translate(., "
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz'), 'email')]]")
email_box.click()
email_box.clear()
email_box.send_keys(email)
email_required = False
break
except Exception:
pass
if (not name_required) and (not email_required) and (
not button_required):
break
for element1 in driver.find_elements_by_xpath(
"//*[@type[translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz') = 'submit']]["
"preceding::*[@name[translate(., "
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz') ='email' or translate("
"., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz') ='name']]]"):
if button_required:
try:
button = element1.find_element_by_xpath(
"//*[@type[translate(., "
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz') = 'submit']]["
"preceding::*[@name[translate(., "
"'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz') ='email' or "
"translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
"'abcdefghijklmnopqrstuvwxyz') "
"='name']]]").click()
element1.click()
element1.send_keys(Keys.ENTER)
element1 = False
continue
except Exception:
try:
element1.find_element_by_xpath(
"//*[@name='email' or "
"@name='name']//following::*["
"@type='submit']/a").click()
element1.click()
element1.send_keys(Keys.ENTER)
button_required = False
except Exception:
pass
except WebDriverException:
logging.exception('Chrome crashed')
driver.close()
driver = webdriver.Chrome()
to_visit_urls.append(link)
except Exception as e:
logging.exception("Fail here:{0}".format(submit_url))
failed_urls.append(submit_url)
pass # this 'pass' is here because when the script passed
# from link 33, it gives me fail on all of them
time.sleep(5)
print button_required
return failed_urls
print subscribe('hfbfsdfsdf@freeletter.me', 'hfbfsdfsdf@freeletter.me')
现在,我不知道问题是来自源代码还是webdriver / xpath,但是当按钮尝试提交这些字段时我不认为它在页面上找到,因为我只得到5 /来自100个可用链接的6封电子邮件。
现在,问题是:任何人都可以给我一个更好的xpath表达式,如果页面彼此不同,它们能够按下按钮/填充名称/电子邮件字段吗?
答案 0 :(得分:-1)
更合适的方法是根据每个页面以某种特定方式查找每对元素。这样做直到你真正找到元素,然后用它们做东西。我不知道HTML的样子,所以我无法向您展示实际的代码。