我想在不使用任何亚马逊 API 的情况下,用 jsoup 抓取一些普通的亚马逊页面,并逐一检查页面中是否存在匹配选择器 "#noResultsTitle" 的元素。
如果页面的 HTML 源代码中存在 "#noResultsTitle",就说明该网址下没有任何产品;否则说明该网址下有产品。
我创建了一个测试文件SpreadSheetTest.java,它位于maven项目的测试文件夹下。
pom.xml
<!-- jsoup 1.10.2: provides the org.jsoup.Jsoup / org.jsoup.nodes.Document classes imported by SpreadSheetTest.java -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
&#13;
SpreadSheetTest.java
package com.ikeepstudying.google;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
/**
 * Downloads a fixed list of Amazon search-result pages with jsoup, stores each page's HTML in a
 * file in the working directory, and logs "Yes" or "No" per page depending on whether the page
 * contains an element matching {@code #noResultsTitle}.
 */
public class SpreadSheetTest {
    private static final Logger LOGGER = LoggerFactory.getLogger(SpreadSheetTest.class);

    /** CSS selector probed on every downloaded page; a match is logged as "Yes". */
    private static final String NO_RESULTS_SELECTOR = "#noResultsTitle";

    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Search URLs to probe; the trailing comments mark where the expected answer flips. */
    private final String[] urls = {
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153342", // yes
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153373",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153502",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153533",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153564",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153748",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153762",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153786",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573164751",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912050561", // no
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912100396",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912100419",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912100457",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912103250",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912103298",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912103335"
    };

    /**
     * Fetches every URL in {@link #urls} and writes the returned HTML (empty when the fetch
     * failed) to a file named after the URL. A failure to write one file is logged and does
     * not stop the remaining URLs from being processed.
     */
    @Test
    public void main() {
        for (String url : urls) {
            File target = new File(toFileName(url));
            try {
                FileUtils.writeStringToFile(target, jsoupDriver(url), "UTF-8");
            } catch (IOException e) {
                LOGGER.error("Could not write page to {}", target, e);
            }
        }
    }

    /**
     * Fetches {@code url} with jsoup using browser-like request headers, logs "Yes" when the
     * page contains an element matching {@code #noResultsTitle} and "No" otherwise, and returns
     * the page HTML.
     *
     * @param url the page to download
     * @return the outer HTML of the parsed document, or an empty string when the request could
     *     not be made or failed (the failure is logged together with its cause)
     */
    public String jsoupDriver(String url) {
        try {
            // Accept-Encoding is deliberately NOT overridden. Advertising "sdch" and "br" lets
            // the server answer with a content encoding this jsoup version does not decode,
            // which yields an unparseable body and an intermittently missed selector.
            // NOTE(review): confirm against the jsoup release in use which encodings it handles.
            Document document = Jsoup.connect(url)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .header("Accept-Language", "en-US,en;q=0.8")
                .header("Cache-Control", "max-age=0")
                .header("Host", "www.amazon.com")
                .header("Upgrade-Insecure-Requests", "1")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
                // Set once: a second header() call with the same name replaces the first.
                .header("Connection", "close")
                .timeout(TIMEOUT_MILLIS)
                .get();

            boolean noResultsMarkerPresent = !document.select(NO_RESULTS_SELECTOR).isEmpty();
            LOGGER.info(noResultsMarkerPresent ? "Yes" : "No");
            return document.outerHtml();
        } catch (IllegalArgumentException e) {
            LOGGER.error("Jsoup Error: IllegalArgumentException for url {}", url, e);
        } catch (IOException e) {
            LOGGER.error("Jsoup Error: IOException while fetching {}", url, e);
        }
        return "";
    }

    /** Turns a URL into a flat file name by replacing URL punctuation with '-' and appending ".html". */
    private static String toFileName(String url) {
        return url.replaceAll("[/?:%&=_.]", "-") + ".html";
    }
}
&#13;
以上网址来自 Amazon 网站,如下图:
正确的结果应该是:控制台先输出 9 个 "Yes",然后输出 7 个 "No"。
问题是jsoup有随机问题,即使对于同一个url,有时是正确的,有时是错误的,我知道ghost webdriver(selenium)可以运行良好,但我不想使用它。我的错误是:
有人有什么好办法可以解决这个问题吗?