我有一个网站要抓取,其中包含4个级别的嵌套页面。
Level 1
--->Level 2
------>Level 3
--------->Level 4
------>Level 3
--------->Level 4
--->Level 2
因此,我必须来回导航,才能访问每个 Level 1 下的每个 Level 2、每个 Level 2 下的每个 Level 3,以及每个 Level 3 下的每个 Level 4。
为此,我写了如下嵌套循环:
// Walks four levels of link pages: chapter -> topic -> sub-topic -> problem.
// Every level uses the same absolute XPath to collect the anchor elements of the page that is
// currently loaded, then prints each link's text and href and navigates into it.
//
// NOTE: each list below holds WebElement references that were located before the loop body calls
// driver.get(...) and driver.navigate().back(). The loops keep iterating the same list afterwards,
// so from the second iteration on getText()/getAttribute() is invoked on a reference obtained from
// an earlier page load. This matches the reported failure inside RemoteWebElement.getText
// ("Web element reference not seen before").
List<WebElement> chapters = driver.findElements(By.xpath("/html[1]/body[1]/div[2]/div[1]/div[4]/div[3]/div[1]/div[1]/table[1]/tbody[1]/tr[*]/td[3]/a"));
for(WebElement chapter: chapters)
{
String chapter_name = chapter.getText();
String chapter_url = chapter.getAttribute("href");
System.out.println("CHAPTER : " + chapter_name + "URL : " + chapter_url);
// Leaves the chapter list page; the remaining entries of `chapters` were located on that page.
driver.get(chapter_url);
List<WebElement> topics = driver.findElements(By.xpath("/html[1]/body[1]/div[2]/div[1]/div[4]/div[3]/div[1]/div[1]/table[1]/tbody[1]/tr[*]/td[3]/a"));
for(WebElement topic: topics)
{
String topic_name = topic.getText();
String topic_url = topic.getAttribute("href");
System.out.println("\tTOPIC : " + topic_name + "URL : " + topic_url);
// Leaves the chapter page; the remaining entries of `topics` were located on that page.
driver.get(topic_url);
List<WebElement> sub_topics = driver.findElements(By.xpath("/html[1]/body[1]/div[2]/div[1]/div[4]/div[3]/div[1]/div[1]/table[1]/tbody[1]/tr[*]/td[3]/a"));
for(WebElement sub_topic : sub_topics)
{
String sub_topic_name = sub_topic.getText();
String sub_topic_url = sub_topic.getAttribute("href");
System.out.println("\t\tSUBTOPIC : " + sub_topic_name + "URL : " + sub_topic_url);
// Leaves the topic page; the remaining entries of `sub_topics` were located on that page.
driver.get(sub_topic_url);
// Deepest level: the problem links are only printed, never followed.
List<WebElement> problems = driver.findElements(By.xpath("/html[1]/body[1]/div[2]/div[1]/div[4]/div[3]/div[1]/div[1]/table[1]/tbody[1]/tr[*]/td[3]/a"));
for(WebElement problem : problems)
{
System.out.println("\t\t\t"+problem.getText());
}
// Back to the topic page; the next iteration reuses a `sub_topics` entry located before driver.get(...).
driver.navigate().back();
}
// Back to the chapter page; the next iteration reuses a `topics` entry located before driver.get(...).
driver.navigate().back();
}
// Back to the chapter list; the next iteration reuses a `chapters` entry located before driver.get(...).
driver.navigate().back();
}
但是我得到以下异常:
Exception in thread "main" org.openqa.selenium.NoSuchElementException: Web element reference not seen before: dcbb0aef-d165-4450-964c-535fc4577f69
For documentation on this error, please visit: http://seleniumhq.org/exceptions/no_such_element.html
Build info: version: '3.14.0', revision: 'aacccce0', time: '2018-08-02T20:05:20.749Z'
System info: host: 'workstation', ip: '127.0.1.1', os.name: 'Linux', os.arch: 'amd64', os.version: '4.15.0-39-generic', java.version: '1.8.0_181'
Driver info: org.openqa.selenium.firefox.FirefoxDriver
Capabilities {acceptInsecureCerts: true, browserName: firefox, browserVersion: 63.0.3, javascriptEnabled: true, moz:accessibilityChecks: false, moz:geckodriverVersion: 0.23.0, moz:headless: false, moz:processID: 13651, moz:profile: /tmp/rust_mozprofile.gx46rW..., moz:useNonSpecCompliantPointerOrigin: false, moz:webdriverClick: true, pageLoadStrategy: normal, platform: LINUX, platformName: LINUX, platformVersion: 4.15.0-39-generic, rotatable: false, setWindowRect: true, timeouts: {implicit: 0, pageLoad: 300000, script: 30000}, unhandledPromptBehavior: dismiss and notify}
Session ID: 55d3e16e-5920-414d-b047-a24f5483a2c7
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.openqa.selenium.remote.http.W3CHttpResponseCodec.createException(W3CHttpResponseCodec.java:187)
at org.openqa.selenium.remote.http.W3CHttpResponseCodec.decode(W3CHttpResponseCodec.java:122)
at org.openqa.selenium.remote.http.W3CHttpResponseCodec.decode(W3CHttpResponseCodec.java:49)
at org.openqa.selenium.remote.HttpCommandExecutor.execute(HttpCommandExecutor.java:158)
at org.openqa.selenium.remote.service.DriverCommandExecutor.execute(DriverCommandExecutor.java:83)
at org.openqa.selenium.remote.RemoteWebDriver.execute(RemoteWebDriver.java:548)
at org.openqa.selenium.remote.RemoteWebElement.execute(RemoteWebElement.java:276)
at org.openqa.selenium.remote.RemoteWebElement.getText(RemoteWebElement.java:160)
at firstTest.Getlinks.main(Getlinks.java:52)
这可能是因为后退导航会重新加载页面,导致之前获取的元素引用失效。在这种情况下,有什么解决方案或最佳实践?
答案 0 :(得分:0)
这肯定是后退导航造成的。每次返回时,浏览器得到的都是一个新页面,之前保存的元素就不能再交互了。我注意到您所有的 XPath 选取的都是链接(而且它们完全相同),因此下面是我修改后的代码,应该可以解决您的问题:
/**
 * Locator shared by every level of the crawl (chapters, topics, sub-topics, problems).
 *
 * <p>It is an absolute XPath from the document root that selects the {@code a} elements inside
 * the third {@code td} of the table rows. Note that {@code tr[*]} is a predicate, not an index
 * wildcard: it matches every {@code tr} that has at least one child element.
 */
private static final By XPATH = By.xpath("/html[1]/body[1]/div[2]/div[1]/div[4]/div[3]/div[1]/div[1]/table[1]/tbody[1]/tr[*]/td[3]/a");
/**
 * Entry point of the crawl: reads the link texts found by {@code XPATH} on the page that is
 * currently loaded and hands them to {@code scanChapters}.
 */
public void testMethod() {
    // Only the plain text of each link is passed on, not the WebElement itself.
    List<String> chapterTexts = getTextsFromElements(driver.findElements(XPATH));
    scanChapters(chapterTexts);
}
/**
 * Copies the text of each element into a new list of strings.
 *
 * @param els elements whose {@code getText()} value is read, in order
 * @return a new list holding one text per element, in the same order as {@code els}
 */
private List<String> getTextsFromElements(List<WebElement> els) {
    int count = els.size();
    List<String> collected = new ArrayList<>(count);
    for (int idx = 0; idx < count; idx++) {
        String text = els.get(idx).getText();
        collected.add(text);
    }
    return collected;
}
/**
 * Visits every chapter link of the page that is currently loaded and scans its topics.
 *
 * <p>All chapter URLs are resolved up front, while the chapter list is still the loaded page.
 * After that the loop works with plain strings only, so no element has to be located again on
 * a page that was restored through {@code navigate().back()}.
 *
 * @param chapterTexts link texts of the chapters; each must match a link on the current page
 */
private void scanChapters(List<String> chapterTexts) {
    // Resolve every href before the first navigation: element lookups are only reliable
    // against the page they are issued on.
    List<String> chapterUrls = new ArrayList<>();
    for (String chapterText : chapterTexts) {
        chapterUrls.add(driver.findElement(By.linkText(chapterText)).getAttribute("href"));
    }

    int index = 0;
    for (String chapterText : chapterTexts) {
        String chapterUrl = chapterUrls.get(index++);
        System.out.println("CHAPTER : " + chapterText + "URL : " + chapterUrl);
        driver.get(chapterUrl);
        // Copy the topic link texts to strings immediately; the WebElements are not kept.
        List<String> topicTexts = getTextsFromElements(driver.findElements(XPATH));
        scanTopics(topicTexts);
        // Return to the chapter list so the browser ends up on the page where this method started.
        driver.navigate().back();
    }
}
/**
 * Visits every topic link of the page that is currently loaded and scans its sub-topics.
 *
 * <p>All topic URLs are resolved up front, while the chapter page is still the loaded page.
 * After that the loop works with plain strings only, so no element has to be located again on
 * a page that was restored through {@code navigate().back()}.
 *
 * @param topicTexts link texts of the topics; each must match a link on the current page
 */
private void scanTopics(List<String> topicTexts) {
    // Resolve every href before the first navigation: element lookups are only reliable
    // against the page they are issued on.
    List<String> topicUrls = new ArrayList<>();
    for (String topicText : topicTexts) {
        topicUrls.add(driver.findElement(By.linkText(topicText)).getAttribute("href"));
    }

    int index = 0;
    for (String topicText : topicTexts) {
        String topicUrl = topicUrls.get(index++);
        System.out.println("\tTOPIC : " + topicText + "URL : " + topicUrl);
        driver.get(topicUrl);
        // Copy the sub-topic link texts to strings immediately; the WebElements are not kept.
        List<String> subTopicTexts = getTextsFromElements(driver.findElements(XPATH));
        scanSubTopics(subTopicTexts);
        // Return to the chapter page so the browser ends up on the page where this method started.
        driver.navigate().back();
    }
}
/**
 * Visits every sub-topic link of the page that is currently loaded and prints the text of
 * each problem link found on the sub-topic page.
 *
 * <p>All sub-topic URLs are resolved up front, while the topic page is still the loaded page.
 * After that the loop works with plain strings only, so no element has to be located again on
 * a page that was restored through {@code navigate().back()}.
 *
 * @param subTopicTexts link texts of the sub-topics; each must match a link on the current page
 */
private void scanSubTopics(List<String> subTopicTexts) {
    // Resolve every href before the first navigation: element lookups are only reliable
    // against the page they are issued on.
    List<String> subTopicUrls = new ArrayList<>();
    for (String subTopicText : subTopicTexts) {
        subTopicUrls.add(driver.findElement(By.linkText(subTopicText)).getAttribute("href"));
    }

    int index = 0;
    for (String subTopicText : subTopicTexts) {
        String subTopicUrl = subTopicUrls.get(index++);
        System.out.println("\t\tSUBTOPIC : " + subTopicText + "URL : " + subTopicUrl);
        driver.get(subTopicUrl);
        // Deepest level: the problem elements are read right after they are located and are
        // not used again once the browser navigates away.
        List<WebElement> problems = driver.findElements(XPATH);
        for (WebElement problem : problems) {
            System.out.println("\t\t\t" + problem.getText());
        }
        // Return to the topic page so the browser ends up on the page where this method started.
        driver.navigate().back();
    }
}