import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Scraper {
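// filePath: directory downloaded images are written to
// count/after: reddit paging state, advanced by getNextPage()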
private String filePath = "c://reddit//";
private String url;
private int count;
private String after;
private static String subreddit;
public Scraper(String sr) {
url = String.format("http://www.reddit.com/r/%s.xml?limit=100", sr);
}
public static void main(String[] args) {
Scanner input = new Scanner(System.in);
System.out.println("enter subreddit with pics only");
subreddit = input.next();
System.out.println("enter amount of pages to crawl");
int pages = input.nextInt();
Scraper scraper = new Scraper(subreddit);
input.close();
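// crawl the requested number of pages; note that getNextPage() advances the
// url before anything is scraped, so the first page of the listing is skipped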
int i = 0;
while (i < pages) {
scraper.getNextPage();
scraper.getImgur();
scraper.getImgurA();
scraper.getImgurAddI();
i++;
}
}
public void download(String _url, String name) {
/*
* setup streams.. write image as bytes to filePath
*/
InputStream is = null;
OutputStream os = null;
try {
URL url = new URL(_url);
is = url.openStream();
os = new FileOutputStream(filePath + name + ".jpg");
for (int b; (b = is.read()) != -1;) {
os.write(b);
}
} catch (MalformedURLException mue) {
System.out.println("invalid url");
} catch (IOException e) {
System.out.println("no stream");
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (os != null) {
try {
os.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
public void getImgur() {
/*
* grab all imgur's in the context of http://i.imgur.com/. The second
* parameter to download() is the filename
*/
try {
System.out.println("connecting to imgur");
Elements description = getSubreddit();
for (Element imgur : description) {
Pattern pattern = Pattern.compile("http://i\\.imgur\\.com/\\w+");
Matcher matcher = pattern.matcher(imgur.text());
if (matcher.find()) {
System.out.println("downloading image: " + matcher.group());
// "http://i.imgur.com/" is 19 characters long, so substring(19) is just the image id
download(matcher.group() + ".jpg", matcher.group().substring(19));
}
}
} catch (Exception e) {
System.out.println("getImgur() failed");
} finally {
System.out.println("grabbed all imgurs");
}
}
public void getImgurAddI() {
/*
* grab all imgur's in the context of http://imgur.com/, if it is an
* album then skip otherwise add "i" to beginning of imgur in order to
* get image
*/
try {
System.out.println("finding imgurs without prefix i and adding i");
Elements description = getSubreddit();
for (Element imgur : description) {
Pattern pattern = Pattern.compile("http://imgur\\.com/\\w+");
Matcher matcher = pattern.matcher(imgur.text());
if (matcher.find()) {
if (!matcher.group().endsWith("a")) {
// make imgur downloadable by adding 'i' before imgur
String newUrl = matcher.group();
newUrl = "http://i." + newUrl.substring(7);
download(newUrl + ".jpg", newUrl.substring(18));
}
}
}
} catch (Exception e) {
System.out.println("getImgurAddI() failed");
} finally {
System.out.println("grabbed all imgurs by adding I");
}
}
private void getImgurA() {
/*
* grab all albums then call extract() to get each individual image
*/
try {
System.out.println("connecting to imgur album");
Elements description = getSubreddit();
for (Element imgur : description) {
Pattern pattern = Pattern.compile("http://imgur.com/a/\\w+");
Matcher matcher = pattern.matcher(imgur.text());
if (matcher.find()) {
System.out.println("Downloading image album...." + " "
+ matcher.group());
extract(matcher.group());
}
}
} catch (Exception e) {
System.out.println("getImgurA() failed");
} finally {
System.out.println("extracted all imgur albums");
}
}
private void extract(String album) {
/*
* open connection to imgur album and download each individual image,
* validate imgur..if it ends with "s" most likely a thumbnail duplicate
* so skip it
*/
try {
Document doc = Jsoup.connect(album).get();
Elements pics = doc.getElementsByTag("img");
String image = null;
for (Element pic : pics) {
/*
* get all image's inside the data-src attribute, make sure url
* is valid first
*/
image = pic.attr("data-src");
if (image != ""
&& (!image.substring(0, image.length() - 4).endsWith(
"s"))) {
if (image.endsWith(".jpg?1") || image.endsWith(".jpg?2")) {
if (image.substring(2, image.length() - 6)
.endsWith("s")) {
System.out
.println("skipping download of thumbnail/duplicate");
} else {
System.out.println("extracting jpg1/jpg2..... "
+ image.substring(2));
download(
"http://"
+ image.substring(2,
image.length() - 2),
image.substring(14, image.length() - 6));
}
} else {
System.out.println("extracting..... "
+ image.substring(2));
download("http://" + image.substring(2),
image.substring(14));
}
}
}
} catch (IOException e) {
System.out.println("extract() failed");
}
}
public Elements getSubreddit() {
/*
* return an Elements with the information to be scraped
* to caller method, setup user agent
*/
Document doc;
Elements description = null;
try {
doc = Jsoup.connect(url)
.userAgent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
.referrer("http://www.google.com")
.get();
description = doc.getElementsByTag("description");
} catch (IOException e) {
System.out.println("getSubreddit() failed");
}
return description;
}
public void getNextPage() {
/*
* crawls current url to get next url
*/
System.out.println("Crawling next page..............");
Document doc;
try {
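// drop the .xml suffix so the HTML listing (which contains the "nextprev"
// paging link) is fetched; if no next page is found, url stays pointing at the HTML page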
url = url.replace(".xml", "");
doc = Jsoup.connect(url).get();
Elements next = doc.getElementsByTag("span");
for (Element n : next) {
if (n.className().equals("nextprev")) {
Pattern pattern = Pattern.compile("after=\\w+");
Matcher matcher = pattern.matcher(n.toString());
if (matcher.find()) {
after = matcher.group().substring(6);
count += 100;
url = String
.format("http://www.reddit.com/r/%s.xml?limit=100&count=%d&after=%s",
subreddit, count, after);
System.out.println("Crawling page.........: " + url);
}
}
}
} catch (IOException e) {
System.out.println("getNextPage() failed");
}
}
}
Sorry, this is probably hard to read; I haven't broken it up yet because I'm still working through the problems. It scrapes around 9 pages without a single error rather than every page, and after that it fails with either a "socket timeout" or a "connection timeout". Here is sample output from an attempt to crawl 25 pages: http://pastebin.com/sP9UwGk9. The bigger issue is that I originally had this as a multithreaded crawler, but it failed something like 50 times as often, so I slowed it down. I've added a bunch of Thread.sleep calls everywhere the program connects to the site or starts a download, but I still get the errors. Is there something I'm doing wrong? I know reddit has some kind of rate limiter, but I'm not sure that's the problem, since this program is quite slow to begin with (unless I use threads).
Edit: console log http://pastebin.com/fhrjSeKx
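For reference, a minimal sketch of the kind of per-request throttling described above, assuming a fixed delay before each connection and a longer read timeout; the two-second delay, the ten-second timeout and the fetch() helper are arbitrary choices, not anything reddit or jsoup prescribes:

private Document fetch(String pageUrl) throws IOException {
    try {
        Thread.sleep(2000); // crude rate limiting: wait 2 seconds before every request
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
    return Jsoup.connect(pageUrl)
            .userAgent("Mozilla/5.0")
            .timeout(10000) // read timeout in milliseconds
            .get();
}

Every Jsoup.connect(...).get() call in getSubreddit(), getNextPage() and extract() would then go through a helper like this so the delay applies uniformly.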
Answer 0: (score: 0)
Most reddit listings only return about 1000 items. Since your limit parameter is set to 100, you will reach the end of the listing after 10 pages, which is why you can't fetch any more data.

If you need more items from the subreddit, you may have to pull the data from other listings or from the search API.
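For illustration, a hedged sketch of how the constructor above could target a different listing or subreddit search instead of the default (hot) listing; the exact .xml endpoints and query parameters are assumptions based on reddit's public listing/search URLs and should be verified:

public Scraper(String sr) {
    // the default listing caps out around 1000 items (10 pages at limit=100):
    // url = String.format("http://www.reddit.com/r/%s.xml?limit=100", sr);

    // a different listing exposes a different ~1000-item window:
    url = String.format("http://www.reddit.com/r/%s/top.xml?t=all&limit=100", sr);

    // or subreddit-restricted search (assumed endpoint and parameters):
    // url = String.format("http://www.reddit.com/r/%s/search.xml?q=imgur&restrict_sr=on&limit=100", sr);
}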