JAVA:Jsoup无法抓取亚马逊网站

时间:2017-06-07 18:23:04

标签: java webdriver web-crawler jsoup amazon

我想在不使用任何亚马逊API的情况下,用jsoup抓取一些亚马逊的普通页面,并逐个检查页面中是否存在id为"#noResultsTitle"的标签。

如果页面的html源代码中存在"#noResultsTitle",就意味着该网址下没有任何产品;否则该网址下包含产品。

我创建了一个测试文件SpreadSheetTest.java,它位于maven项目的测试文件夹下。

pom.xml



<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
&#13;
&#13;
&#13;

SpreadSheetTest.java

&#13;
&#13;
package com.ikeepstudying.google;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;

/**
 * Fetches a fixed list of Amazon search-result pages with jsoup, logs whether each page
 * contains an element with id {@code noResultsTitle}, and stores the fetched HTML on disk.
 *
 * <p>Run as a TestNG test; {@link #main()} is the entry point.
 */
public class SpreadSheetTest {

    private static final Logger LOGGER = LoggerFactory.getLogger(SpreadSheetTest.class);

    /** Selector whose presence is reported as "Yes" in the log, absence as "No". */
    private static final String NO_RESULTS_SELECTOR = "#noResultsTitle";

    /** Timeout, in milliseconds, passed to jsoup for each request. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Search URLs to fetch; the trailing comments are the author's expected outcome per group. */
    private static final String[] URLS = {
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153342", //yes
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153373",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153502",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153533",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153564",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153748",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153762",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573153786",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603573164751",

        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912050561", // no
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912100396",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912100419",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912100457",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912103250",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912103298",
        "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=603912103335"
    };

    /**
     * Fetches every configured URL and writes the returned HTML to a file in the working
     * directory. The file name is the URL with URL punctuation replaced by {@code -}, plus
     * {@code .html}. A failed write is logged and does not stop the remaining URLs.
     */
    @Test
    public void main() {
        for (String url : URLS) {
            File target = new File(url.replaceAll("[/?:%&=_.]", "-") + ".html");
            try {
                FileUtils.writeStringToFile(target, jsoupDriver(url), "UTF-8");
            } catch (IOException e) {
                LOGGER.error("Could not write fetched page to {}", target, e);
            }
        }
    }

    /**
     * Downloads {@code url} with browser-like request headers and logs "Yes" when the page
     * contains an element matching {@code #noResultsTitle}, "No" otherwise.
     *
     * @param url the page to fetch
     * @return the document's outer HTML, or an empty string when the URL is rejected by jsoup
     *     or the request fails (the failure is logged, not thrown)
     */
    public String jsoupDriver(String url) {
        String html = "";
        try {
            Document document = Jsoup.connect(url)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                // Advertise only gzip: jsoup cannot decode brotli ("br") or sdch bodies, so
                // offering them lets the server answer with content that parses as garbage
                // and makes the selector check below fail intermittently.
                .header("Accept-Encoding", "gzip")
                .header("Accept-Language", "en-US,en;q=0.8")
                .header("Cache-Control", "max-age=0")
                .header("Host", "www.amazon.com")
                .header("Upgrade-Insecure-Requests", "1")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
                .header("Connection", "close")
                .timeout(TIMEOUT_MILLIS)
                .get();

            html = document.outerHtml();
            LOGGER.info(document.select(NO_RESULTS_SELECTOR).isEmpty() ? "No" : "Yes");
        } catch (IllegalArgumentException e) {
            LOGGER.error("Jsoup Error: IllegalArgumentException for url {}", url, e);
        } catch (IOException e) {
            // Covers timeouts, connection failures and non-2xx responses raised by jsoup.
            LOGGER.error("Jsoup Error: IOException while fetching {}", url, e);
        }

        return html;
    }
}
&#13;
&#13;
&#13;

以上请求头(headers)来自amazon网站,如下图:

enter image description here

正确的结果应该是:控制台上前9个输出"Yes",随后7个输出"No"。

问题是jsoup有随机问题,即使对于同一个url,有时是正确的,有时是错误的,我知道ghost webdriver(selenium)可以运行良好,但我不想使用它。我的错误是:

enter image description here

任何人都有任何好主意要解决它吗?

0 个答案:

没有答案