So I'm trying to use jsoup to scrape images from Reddit, but when I scrape certain subreddits such as /r/wallpaper, I get a 429 error, and I'm wondering how to fix this. I totally understand that this code is terrible and that this is a very beginner question, but I'm completely new to this. Anyway:
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class javascraper {

    public static void main(String[] args) throws MalformedURLException {
        Scanner scan = new Scanner(System.in);
        System.out.println("Where do you want to store the files?");
        String folderpath = scan.next();
        System.out.println("What subreddit do you want to scrape?");
        String subreddit = scan.next();
        subreddit = "http://reddit.com/r/" + subreddit;
        new File(folderpath + "/" + subreddit).mkdir();

        try {
            // fetch the subreddit page over HTTP
            Document doc = Jsoup.connect(subreddit).timeout(0).get();

            // get the page title
            String title = doc.title();
            System.out.println("title : " + title);

            // get all links
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                // get the value from the href attribute
                String checkLink = link.attr("href");
                if (imgCheck(checkLink)) { // checks whether the link points to an image
                    System.out.println("link : " + link.attr("href"));
                    downloadImages(checkLink, folderpath);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static boolean imgCheck(String http) {
        String png = ".png";
        String jpg = ".jpg";
        String jpeg = "jpeg"; // no period, so this also matches ".jpeg"
        String gif = ".gif";
        return http.contains(png) || http.contains("gfycat") || http.contains(jpg)
                || http.contains(jpeg) || http.contains(gif);
    }

    private static void downloadImages(String src, String folderpath) throws IOException {
        // extract the name of the image from the src attribute
        int indexname = src.lastIndexOf("/");
        if (indexname == src.length() - 1) { // strip a trailing "/"
            src = src.substring(0, indexname);
        }
        indexname = src.lastIndexOf("/");
        String name = src.substring(indexname); // keeps the leading "/" as a path separator
        System.out.println(name);

        // open a URL stream and copy the bytes into the output file
        URL url = new URL(src);
        InputStream in = url.openStream();
        OutputStream out = new BufferedOutputStream(new FileOutputStream(folderpath + name));
        for (int b; (b = in.read()) != -1;) {
            out.write(b);
        }
        out.close();
        in.close();
    }
}
Answer 0 (score: 5)
Your problem is caused by your scraper violating reddit's API rules. Error 429 means "Too Many Requests": you're requesting pages too quickly.
You may make one request every 2 seconds, and you also need to set a proper user agent (the format they recommend is <platform>:<app ID>:<version string> (by /u/<reddit username>)). As it currently stands, your code runs too fast and doesn't specify a user agent, so it will be subject to harsh rate limiting.
To fix it, first add this to the beginning of your class, before the main method:
public static final String USER_AGENT = "<PUT YOUR USER AGENT HERE>";
(Make sure to specify an actual user agent.)
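For illustration only, a value following the recommended format might look like this (the app ID and username here are made up):

public static final String USER_AGENT = "desktop:com.example.imagescraper:v0.1 (by /u/yourusername)";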
Then, change this (in downloadImages):
URL url = new URL(src);
InputStream in = url.openStream();
to this:
URLConnection connection = (new URL(src)).openConnection();
try {
    Thread.sleep(2000); // delay to comply with rate limiting
} catch (InterruptedException e) {
    Thread.currentThread().interrupt(); // sleep throws a checked exception; restore the interrupt flag
}
connection.setRequestProperty("User-Agent", USER_AGENT);
InputStream in = connection.getInputStream();
You'll also need to change this (in main):
Document doc = Jsoup.connect(subreddit).timeout(0).get();
to this:
Document doc = Jsoup.connect(subreddit).userAgent(USER_AGENT).timeout(0).get();
Then your code should stop running into that error.
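If you still hit a 429 despite the delay, note that jsoup reports non-OK statuses as org.jsoup.HttpStatusException, so you could catch it and back off before retrying. A rough sketch (the single retry and the 10-second backoff are arbitrary choices, not anything reddit prescribes):

Document doc;
try {
    doc = Jsoup.connect(subreddit).userAgent(USER_AGENT).timeout(0).get();
} catch (org.jsoup.HttpStatusException e) {
    if (e.getStatusCode() != 429) {
        throw e; // some other HTTP error; don't retry
    }
    // rate limited anyway: back off, then try once more
    try {
        Thread.sleep(10000);
    } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
    }
    doc = Jsoup.connect(subreddit).userAgent(USER_AGENT).timeout(0).get();
}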
Note that using reddit's API (i.e. /r/subreddit.json instead of /r/subreddit) would probably make this project easier, but it isn't required, and your current code will work.
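For example, a minimal sketch of fetching that JSON listing with jsoup, reusing the USER_AGENT constant (actually extracting image URLs from it would still need a JSON library, which is left out here):

String json = Jsoup.connect("http://reddit.com/r/wallpaper.json")
        .userAgent(USER_AGENT)
        .ignoreContentType(true) // jsoup would otherwise reject the non-HTML response
        .execute()
        .body();
System.out.println(json.substring(0, Math.min(200, json.length()))); // peek at the raw listing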
Answer 1 (score: 2)