I am crawling Amazon products and, in principle, it works fine.
I have three classes from this very good tutorial:
http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/
I added file output to the following code (class Spider):
import java.io.FileNotFoundException;
import java.util.*;

public class Spider {

    public static final int MAX_PAGES_TO_SEARCH = 10000;
    private Set<String> pagesVisited = new HashSet<String>();
    private List<String> pagesToVisit = new LinkedList<String>();

    public void search(String url) {
        while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            String currentUrl;
            SpiderLeg leg = new SpiderLeg();
            if (this.pagesToVisit.isEmpty()) {
                // First iteration: start from the seed URL
                currentUrl = url;
                this.pagesVisited.add(url);
            } else {
                currentUrl = this.nextUrl();
            }
            try {
                Thread.sleep(10000); // wait 10 seconds between requests
                leg.crawl(currentUrl); // Lots of stuff happening here. Look at the crawl method in SpiderLeg.
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            this.pagesToVisit.addAll(leg.getLinks());
        }
        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");

        SpiderLeg leg = new SpiderLeg();
        leg.calcAdjMatrix();
        for (int i = 0; i < leg.adjMatrix.length; i++) {
            System.out.println(Arrays.toString(leg.adjMatrix[i]));
        }
    }

    private String nextUrl() {
        String nextUrl;
        do {
            if (this.pagesToVisit.isEmpty()) {
                // Fall back to a fixed seed page when the frontier runs dry
                return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
}
Class SpiderLeg:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;

public class SpiderLeg {

    // We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser.
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";

    private static List<String> links = new LinkedList<String>();
    private static String graphLink;
    private Document htmlDocument;
    private static double counter = 0;

    // Adjacency map: product title -> titles of the products it links to
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>();
    static int[][] adjMatrix;
    static List<String> mapping;
    public boolean crawl(String url) throws FileNotFoundException {
        if (url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;

            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code, indicating that everything is great.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            if (!connection.response().contentType().contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            //Elements linksOnPage = htmlDocument.select("a[href*=/gp/product/]");
            Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
            Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
            Elements category = htmlDocument.select("span.zg_hrsr_ladder a");

            String categoryString = category.html().replace("\n", " ");
            String salesRankString = salesRank.html().replace("\n", " ");

            System.out.println("Found (" + linksOnPage.size() + ") links");

            // The product title sits between ".de/" and "/dp" in the URL
            int beginIndex = url.indexOf(".de/");
            int endIndex = url.indexOf("/dp");
            String title = url.substring(beginIndex + 4, endIndex);

            if (!adjMap.containsKey(title) && categoryString.contains("Horror")) {
                adjMap.put(title, new HashSet<String>());

                StringBuilder sb = new StringBuilder();
                sb.append(title).append(',');
                sb.append(salesRankString).append(',');
                sb.append(categoryString).append(',');

                for (Element link : linksOnPage) {
                    String graphLink = link.attr("abs:href");
                    if (!graphLink.contains("one-click")
                            && !graphLink.contains("Kindle")
                            && !graphLink.contains("unsticky")) {
                        links.add(graphLink);
                        adjMap.get(title).add(cutTitle(graphLink));
                        sb.append(graphLink).append(',');
                    }
                }
                sb.append('\n');

                // try-with-resources makes sure the CSV writer is always closed
                try (PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true))) {
                    pw.write(sb.toString());
                }
            }
            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request
            System.out.println("Error in our HTTP request " + ioe);
            return false;
        }
    }
    public static void calcAdjMatrix() {
        // Collect every title that appears either as a crawled page or as a link target
        Set<String> allMyURLs = new HashSet<String>(adjMap.keySet());
        for (String s : adjMap.keySet()) {
            allMyURLs.addAll(adjMap.get(s));
            System.out.println(s + "\t" + adjMap.get(s));
        }

        int dim = allMyURLs.size();
        adjMatrix = new int[dim][dim];
        List<String> nodes_list = new ArrayList<>(allMyURLs);

        // Mark an edge from node i to node j if page i links to page j
        for (String s : nodes_list) {
            Set<String> outEdges = adjMap.get(s);
            int i = nodes_list.indexOf(s);
            if (outEdges != null) {
                for (String s1 : outEdges) {
                    int j = nodes_list.indexOf(s1);
                    adjMatrix[i][j] = 1;
                }
            }
        }
    }
    public String cutTitle(String url) throws FileNotFoundException {
        // Extract the title part between ".de/" and "/dp" of an Amazon product URL
        if (url.contains(".de/") && url.contains("/dp")) {
            int beginIndex = url.indexOf(".de/");
            int endIndex = url.indexOf("/dp");
            return url.substring(beginIndex + 4, endIndex);
        }
        return "wrong url";
    }

    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks() {
        return links;
    }
}
Class SpiderTest:
public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}
The problem now is that after roughly 100 URLs, Amazon bans me from the server and the program no longer finds any URLs.
Does anyone know how to solve this?
Answer 0 (score: 3)
Well, then don't be rude while crawling.
Check their robots.txt (wiki) to see what they allow you to do. If you go where they don't want you to go, don't be surprised if they ban you.
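As a rough illustration (not part of the original answer), here is a minimal sketch of such a check using the same Jsoup dependency the question already uses. The parsing is deliberately simplified, and the isDisallowed helper is made up for this example:

import org.jsoup.Jsoup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class RobotsTxtCheck {

    // Download robots.txt and collect the "Disallow:" prefixes listed under "User-agent: *".
    static List<String> fetchDisallowedPaths(String baseUrl) throws IOException {
        String robots = Jsoup.connect(baseUrl + "/robots.txt")
                .ignoreContentType(true)
                .execute()
                .body();
        List<String> disallowed = new ArrayList<>();
        boolean inDefaultAgentBlock = false;
        for (String line : robots.split("\n")) {
            line = line.trim();
            if (line.toLowerCase().startsWith("user-agent:")) {
                inDefaultAgentBlock = line.endsWith("*");
            } else if (inDefaultAgentBlock && line.toLowerCase().startsWith("disallow:")) {
                disallowed.add(line.substring("disallow:".length()).trim());
            }
        }
        return disallowed;
    }

    // Hypothetical helper: a URL path is off limits if it starts with a disallowed prefix.
    static boolean isDisallowed(String path, List<String> disallowedPrefixes) {
        for (String prefix : disallowedPrefixes) {
            if (!prefix.isEmpty() && path.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) throws IOException {
        List<String> rules = fetchDisallowedPaths("https://www.amazon.de");
        System.out.println(rules);
        System.out.println(isDisallowed("/gp/cart/view.html", rules));
    }
}

The crawler could call isDisallowed() before adding a URL to pagesToVisit and simply skip anything the site has asked crawlers to stay away from.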
Answer 1 (score: 2)
This problem is very common when you try to crawl big websites that don't want to be crawled. They basically block you for a period of time to prevent their data from being crawled or stolen.
With that said, you have two options: either send each request from a different IP/server, which makes your requests look legitimate and avoids the ban, or take the easier route and use a service that does this for you.
I have done both. The first is complicated, takes time, and needs maintenance (you have to build a network of servers); the second option is usually not free, but it is quick to implement and guarantees that all your requests will always return data and that you won't get banned.
There are services on the internet that do this. I have used proxycrawl in the past (which also has a free tier) and it works very well. They expose an API you can call, so you keep using the same code and only change the URL you call.
Here is an example for Amazon:
GET https://api.proxycrawl.com?token=yourtoken&url=https://amazon.com
You will get the response back, and you will never get banned even if you crawl 1000 pages per second, because you are calling that proxy instead and it does all the magic for you.
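For illustration only (not from the original answer), here is a minimal sketch of pointing the existing Jsoup call at such an API. It assumes the token is a placeholder, that the target URL should be URL-encoded as the url query parameter, and that the API returns the page HTML directly:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

public class ProxyApiFetch {

    // Placeholder; "yourtoken" must be replaced with a real API token.
    private static final String TOKEN = "yourtoken";

    // Wrap the target URL in the proxy API call shown above, then parse the
    // returned HTML with Jsoup exactly as SpiderLeg.crawl() already does.
    static Document fetchThroughProxyApi(String targetUrl) throws IOException {
        String apiUrl = "https://api.proxycrawl.com?token=" + TOKEN
                + "&url=" + URLEncoder.encode(targetUrl, StandardCharsets.UTF_8.name());
        return Jsoup.connect(apiUrl)
                .ignoreContentType(true)
                .get();
    }

    public static void main(String[] args) throws IOException {
        Document doc = fetchThroughProxyApi("https://www.amazon.de/dp/1535287004");
        System.out.println(doc.title());
    }
}
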
I hope it helps :)
Answer 2 (score: 0)
You can try using proxy servers to avoid being blocked. There are services that provide working proxies. I have had good experience with https://gimmeproxy.com, which specifically offers proxies with Amazon support.
To get a proxy that works with Amazon, you just need to make the following request:
https://gimmeproxy.com/api/getProxy?api_key=your_api_key&websites=amazon
You will get a JSON response with all the proxy data, which you can then use as needed:
{
    "supportsHttps": true,
    "protocol": "socks5",
    "ip": "116.182.122.182",
    "port": "1915",
    "get": true,
    "post": true,
    "cookies": true,
    "referer": true,
    "user-agent": true,
    "anonymityLevel": 1,
    "websites": {
        "example": true,
        "google": false,
        "amazon": true
    },
    "country": "BR",
    "tsChecked": 1517952910,
    "curl": "socks5://116.182.122.182:1915",
    "ipPort": "116.182.122.182:1915",
    "type": "socks5",
    "speed": 37.78,
    "otherProtocols": {}
}
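As an illustration (not part of the original answer), a minimal sketch of plugging the ip and port from that response into the crawler's existing Jsoup call. It assumes jsoup 1.9+ (which added Connection.proxy) and skips the JSON parsing step, which could be done with any JSON library:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;

public class ProxyServerFetch {

    // proxyIp and proxyPort come from the "ip" and "port" fields of the JSON above;
    // the example response advertises a SOCKS5 proxy, hence Proxy.Type.SOCKS.
    static Document fetchViaSocksProxy(String targetUrl, String proxyIp, int proxyPort)
            throws IOException {
        Proxy proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress(proxyIp, proxyPort));
        return Jsoup.connect(targetUrl)
                .proxy(proxy)             // route the request through the proxy
                .ignoreContentType(true)
                .get();
    }

    public static void main(String[] args) throws IOException {
        Document doc = fetchViaSocksProxy(
                "https://www.amazon.de/dp/1535287004", "116.182.122.182", 1915);
        System.out.println(doc.title());
    }
}

Requesting a fresh proxy from the API every so often and rotating it into the crawler keeps the requests coming from different IPs.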