Crawling amazon.com

Time: 2016-11-06 10:35:16

Tags: java web-crawler amazon

I am crawling Amazon products and, in principle, it works fine.

I have three classes from this nice tutorial:

http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/

I changed the files to the following code (class Spider):

import java.io.FileNotFoundException;
import java.util.*;


public class Spider {
    public static final int MAX_PAGES_TO_SEARCH = 10000;
    private Set<String> pagesVisited = new HashSet<String>();
    private List<String> pagesToVisit = new LinkedList<String>();

    public void search(String url) {
        while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            String currentUrl;
            SpiderLeg leg = new SpiderLeg();
            if (this.pagesToVisit.isEmpty()) {
                // First iteration: start from the seed URL.
                currentUrl = url;
                this.pagesVisited.add(url);
            } else {
                currentUrl = this.nextUrl();
            }
            try {
                Thread.sleep(10000); // wait 10 seconds between requests
                leg.crawl(currentUrl); // Lots of stuff happening here. Look at the crawl method in SpiderLeg.
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

            this.pagesToVisit.addAll(leg.getLinks());
        }
        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");
        SpiderLeg.calcAdjMatrix();
        for (int i = 0; i < SpiderLeg.adjMatrix.length; i++) {
            System.out.println(Arrays.toString(SpiderLeg.adjMatrix[i]));
        }
    }

    private String nextUrl() {
        String nextUrl;
        do {
            if (this.pagesToVisit.isEmpty()) {
                return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
}

Class SpiderLeg:

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;

public class SpiderLeg {
    // We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser.
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
    private static List<String> links = new LinkedList<String>();
    private static String graphLink;
    private Document htmlDocument;
    private static double counter = 0;
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>();
    static int[][] adjMatrix;
    static List<String> mapping;

    public boolean crawl(String url) throws FileNotFoundException {
        if (url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;
            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code, indicating that everything is great.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            if (!connection.response().contentType().contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            // Product links contain "/dp/" (an alternative selector is "a[href*=/gp/product/]");
            // sales rank and category come from the bestseller block on the product page.
            Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
            Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
            Elements category = htmlDocument.select("span.zg_hrsr_ladder a");

            String categoryString = category.html().replace("\n", " ");
            String salesRankString = salesRank.html().replace("\n", " ");
            System.out.println("Found (" + linksOnPage.size() + ") links");

            int beginIndex = url.indexOf(".de/");
            int endIndex = url.indexOf("/dp");
            String title = url.substring(beginIndex + 4, endIndex);

            if (!adjMap.containsKey(title) && categoryString.contains("Horror")) {
                adjMap.put(title, new HashSet<String>());
                StringBuilder sb = new StringBuilder();
                sb.append(title).append(',');
                sb.append(salesRankString).append(',');
                sb.append(categoryString).append(',');
                for (Element link : linksOnPage) {
                    String graphLink = link.attr("abs:href");
                    if (!graphLink.contains("one-click")
                            && !graphLink.contains("Kindle")
                            && !graphLink.contains("unsticky")) {
                        links.add(graphLink);
                        adjMap.get(title).add(cutTitle(graphLink));
                        sb.append(graphLink).append(',');
                    }
                }
                sb.append('\n');
                // Open the CSV only when there is something to write, and always close it.
                PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true));
                pw.write(sb.toString());
                pw.close();
            }

            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request
            System.out.println("Error in our HTTP request " + ioe);
            return false;
        }
    }

    public static void calcAdjMatrix() {
        Set<String> allMyURLs = new HashSet<String>(adjMap.keySet());
        for (String s : adjMap.keySet()) {
            allMyURLs.addAll(adjMap.get(s));
            System.out.println(s + "\t" + adjMap.get(s));
        }

        int dim = allMyURLs.size();
        adjMatrix = new int[dim][dim];
        List<String> nodesList = new ArrayList<>(allMyURLs);

        for (String s : nodesList) {
            Set<String> outEdges = adjMap.get(s);
            int i = nodesList.indexOf(s);
            if (outEdges != null) {
                for (String s1 : outEdges) {
                    int j = nodesList.indexOf(s1);
                    adjMatrix[i][j] = 1;
                }
            }
        }
    }

    public String cutTitle(String url) throws FileNotFoundException {
        int beginIndex = url.indexOf(".de/");
        int endIndex = url.indexOf("/dp");
        String title;
        if (url.contains(".de") && url.contains("/dp")) {
            title = url.substring(beginIndex + 4, endIndex);
        } else {
            title = "wrong url";
        }
        return title;
    }

    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks() {
        return links;
    }
}

Class SpiderTest:

public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}

The problem now is that after roughly 100 URLs, Amazon bans me from the server. The program no longer finds any URLs.

Does anyone know how to solve this?

3 Answers:

Answer 0 (score: 3)

Well then: don't be rude, and crawl politely.

Check their robots.txt (wiki) to see what they allow you to do. If you go where they don't want you to go, don't be surprised if they ban you.
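
For illustration, here is a minimal sketch of checking a path against the wildcard Disallow rules before queuing it, using the jsoup dependency the question already has. The RobotsCheck class and its naive prefix matching are an assumption made for this example, not a full robots.txt parser:

import org.jsoup.Jsoup;

public class RobotsCheck {
    // Naive check of a URL path against the "User-agent: *" Disallow rules.
    public static boolean isAllowed(String host, String path) {
        try {
            String robots = Jsoup.connect(host + "/robots.txt")
                    .ignoreContentType(true) // robots.txt is text/plain, not HTML
                    .execute()
                    .body();
            boolean inWildcardSection = false;
            for (String line : robots.split("\n")) {
                line = line.trim();
                if (line.toLowerCase().startsWith("user-agent:")) {
                    inWildcardSection = line.substring(11).trim().equals("*");
                } else if (inWildcardSection && line.toLowerCase().startsWith("disallow:")) {
                    String rule = line.substring(9).trim();
                    if (!rule.isEmpty() && path.startsWith(rule)) {
                        return false; // the path matches a Disallow prefix
                    }
                }
            }
            return true;
        } catch (Exception e) {
            return false; // be conservative if robots.txt cannot be read
        }
    }

    public static void main(String[] args) {
        System.out.println(isAllowed("https://www.amazon.de", "/dp/1535287004"));
    }
}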

Answer 1 (score: 2)

This problem is very common when you try to crawl big websites that do not want to be crawled. They basically block you for a period of time to prevent their data from being crawled or stolen.

With that said, you have two options: either make every request from a different IP/server, which makes your requests look legitimate and avoids the ban, or take the easiest route and use a service that does this for you.

I have done both. The first is complex, takes time and needs maintenance (you have to build a network of servers); the second option is usually not free, but it is quick to implement and guarantees that all your requests will always return data and that you will not get banned.

There are services on the internet that do exactly this. I have used proxycrawl in the past (it also has a free tier) and it worked very well. They have an API you can call, and you can keep the same code: you only need to change the URL you call.

Here is an example for amazon:

GET https://api.proxycrawl.com?token=yourtoken&url=https://amazon.com

You will get the response back, and you will never get banned even if you crawl 1000 pages per second, because you are calling that proxy instead and it does all the magic for you.
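
As a hedged sketch of how this could plug into the jsoup code from the question: the token is a placeholder, the ProxyApiFetch class is invented for this example, and the query parameters are simply the ones shown in the GET line above, so treat this as an assumption rather than a tested integration.

import java.net.URLEncoder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ProxyApiFetch {
    private static final String TOKEN = "yourtoken"; // placeholder, use your own token

    // Route the fetch through the proxy API instead of hitting the site directly.
    public static Document fetch(String targetUrl) throws Exception {
        String apiUrl = "https://api.proxycrawl.com?token=" + TOKEN
                + "&url=" + URLEncoder.encode(targetUrl, "UTF-8");
        return Jsoup.connect(apiUrl).ignoreContentType(true).get();
    }

    public static void main(String[] args) throws Exception {
        Document doc = fetch("https://www.amazon.de/dp/1535287004");
        System.out.println(doc.title());
    }
}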

I hope it helps :)

Answer 2 (score: 0)

You can try using a proxy server to avoid being blocked. There are services that provide working proxies. I have had good experience with https://gimmeproxy.com, which specifically has proxies that support Amazon.

To get a proxy that works with Amazon, you just need to make the following request:

https://gimmeproxy.com/api/getProxy?api_key=your_api_key&websites=amazon

You will get a JSON response with all the proxy data, which you can then use as needed (see the sketch after the JSON below):

{
  "supportsHttps": true,
  "protocol": "socks5",
  "ip": "116.182.122.182",
  "port": "1915",
  "get": true,
  "post": true,
  "cookies": true,
  "referer": true,
  "user-agent": true,
  "anonymityLevel": 1,
  "websites": {
    "example": true,
    "google": false,
    "amazon": true
  },
  "country": "BR",
  "tsChecked": 1517952910,
  "curl": "socks5://116.182.122.182:1915",
  "ipPort": "116.182.122.182:1915",
  "type": "socks5",
  "speed": 37.78,
  "otherProtocols": {}
}
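
A minimal sketch of plugging the returned proxy into the jsoup calls from the question, assuming jsoup 1.9+ (which added Connection.proxy). The IP and port are just the sample values from the JSON above; in a real run they would be parsed from the API response:

import java.net.InetSocketAddress;
import java.net.Proxy;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ProxiedFetch {
    public static void main(String[] args) throws Exception {
        // Sample values from the JSON above; in practice parse them from the API response.
        Proxy proxy = new Proxy(Proxy.Type.SOCKS,
                new InetSocketAddress("116.182.122.182", 1915));

        Document doc = Jsoup.connect("https://www.amazon.de/dp/1535287004")
                .proxy(proxy)
                .userAgent("Mozilla/5.0")
                .get();
        System.out.println(doc.title());
    }
}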