Mostly finished reddit scraper can't crawl more than 9 or so pages

Asked: 2013-12-24 22:16:49

Tags: java web-crawler reddit

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Scraper {
    private String filePath = "c://reddit//";
    private String url;
    private int count;
    private String after;
    private static String subreddit;

    public Scraper(String sr) {
        url = String.format("http://www.reddit.com/r/%s.xml?limit=100", sr);
    }

    public static void main(String[] args) {
        Scanner input = new Scanner(System.in);
        System.out.println("enter subreddit with pics only");
        subreddit = input.next();
        System.out.println("enter amount of pages to crawl");
        int pages = input.nextInt();
        Scraper scraper = new Scraper(subreddit);
        input.close();
        int i = 0;
        while (i < pages) {
            scraper.getNextPage();
            scraper.getImgur();
            scraper.getImgurA();
            scraper.getImgurAddI();
            i++;
        }

    }

    public void download(String _url, String name) {
        /*
         * setup streams.. write image as bytes to filePath
         */
        InputStream is = null;
        OutputStream os = null;
        try {
            URL url = new URL(_url);
            is = url.openStream();
            os = new FileOutputStream(filePath + name + ".jpg");
            for (int b; (b = is.read()) != -1;) {
                os.write(b);
            }
        } catch (MalformedURLException mue) {
            System.out.println("invalid url");
        } catch (IOException e) {
            System.out.println("no stream");
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (os != null) {
                try {
                    os.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public void getImgur() {
        /*
         * grab all imgur's in the context of http://i.imgur.com/. The second
         * parameter to download() is the filename
         */
        try {
            System.out.println("connecting to imgur");
            Elements description = getSubreddit();
            for (Element imgur : description) {
                Pattern pattern = Pattern
                        .compile("http://i\\.imgur\\.com/\\w+");
                Matcher matcher = pattern.matcher(imgur.text());
                if (matcher.find()) {
                    System.out.println("downloading image: " + " "
                            + matcher.group());
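                    // the match is "http://i.imgur.com/<id>"; substring(18)
                    // keeps "/<id>" to use as the file name under filePath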
                    download((matcher.group() + ".jpg"), matcher.group()
                            .substring(18));
                }
            }
        } catch (Exception e) {
            System.out.println("getImgur() failed");
        } finally {
            System.out.println("grabbed all imgurs");
        }

    }

    public void getImgurAddI() {
        /*
         * grab all imgur's in the context of http://imgur.com/, if it is an
         * album then skip otherwise add "i" to beginning of imgur in order to
         * get image
         */
        try {
            System.out.println("finding imgurs without prefix i and adding i");
            Elements description = getSubreddit();
            for (Element imgur : description) {
                Pattern pattern = Pattern.compile("http://imgur\\.com/\\w+");
                Matcher matcher = pattern.matcher(imgur.text());
                if (matcher.find()) {
                    if (!matcher.group().endsWith("a")) {
                        // make imgur downloadable by adding 'i' before imgur
                        String newUrl = matcher.group();
                        newUrl = "http://i." + newUrl.substring(7);
                        download(newUrl + ".jpg", newUrl.substring(18));
                    }
                }
            }

        } catch (Exception e) {
            System.out.println("getImgurAddI() failed");
        } finally {
            System.out.println("grabbed all imgurs by adding I");
        }

    }

    private void getImgurA() {
        /*
         * grab all albums then call extract() to get each individual image
         */
        try {
            System.out.println("connecting to imgur album");
            Elements description = getSubreddit();
            for (Element imgur : description) {
                Pattern pattern = Pattern.compile("http://imgur.com/a/\\w+");
                Matcher matcher = pattern.matcher(imgur.text());
                if (matcher.find()) {
                    System.out.println("Downloading image album...." + " "
                            + matcher.group());
                    extract(matcher.group());
                }
            }
        } catch (Exception e) {
            System.out.println("getImgurA() failed");
        } finally {
            System.out.println("extracted all imgur albums");
        }
    }

    private void extract(String album) {
        /*
         * open connection to imgur album and download each individual image,
         * validate imgur..if it ends with "s" most likely a thumbnail duplicate
         * so skip it
         */
        try {
            Document doc = Jsoup.connect(album).get();
            Elements pics = doc.getElementsByTag("img");
            String image = null;
            for (Element pic : pics) {
                /*
                 * get all image's inside the data-src attribute, make sure url
                 * is valid first
                 */
                image = pic.attr("data-src");
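                // data-src is a protocol-relative url ("//i.imgur.com/<id>.jpg..."),
                // so substring(2) drops the leading "//" and substring(14) keeps
                // just the image id part for the file name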
                if (image != ""
                        && (!image.substring(0, image.length() - 4).endsWith(
                                "s"))) {
                    if (image.endsWith(".jpg?1") || image.endsWith(".jpg?2")) {
                        if (image.substring(2, image.length() - 6)
                                .endsWith("s")) {
                            System.out
                                    .println("skipping download of thumbnail/duplicate");
                        } else {
                            System.out.println("extracting jpg1/jpg2..... "
                                    + image.substring(2));
                            download(
                                    "http://"
                                            + image.substring(2,
                                                    image.length() - 2),
                                    image.substring(14, image.length() - 6));
                        }
                    } else {
                        System.out.println("extracting..... "
                                + image.substring(2));
                        download("http://" + image.substring(2),
                                image.substring(14));
                    }
                }
            }
        } catch (IOException e) {
            System.out.println("extract() failed");
        }

    }

    public Elements getSubreddit() {
        /*
         * return an Elements with the information to be scraped
         *  to caller method, setup user agent
         */
        Document doc;
        Elements description = null;
        try {
            doc = Jsoup
                    .connect(url)
                    .userAgent(
                            "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
                    .referrer("http://www.google.com").get();
            description = doc.getElementsByTag("description");
        } catch (IOException e) {
            System.out.println("getSubreddit() failed");
        }
        return description;
    }

    public void getNextPage() {
        /*
         * crawls current url to get next url
         */
        System.out.println("Crawling next page..............");
        Document doc;
        try {
            url = url.replace(".xml", "");
            doc = Jsoup.connect(url).get();
            Elements next = doc.getElementsByTag("span");
            for (Element n : next) {
                if (n.className().equals("nextprev")) {
                    Pattern pattern = Pattern.compile("after=\\w+");
                    Matcher matcher = pattern.matcher(n.toString());
                    if (matcher.find()) {
                        after = matcher.group().substring(6);
                        count += 100;
                        url = String
                                .format("http://www.reddit.com/r/%s.xml?limit=100&count=%d&after=%s",
                                        subreddit, count, after);
                        System.out.println("Crawling page.........: " + url);

                    }
                }
            }
        } catch (IOException e) {
            System.out.println("getNextPage() failed");

        }
    }
}

Sorry, this is probably hard to read; I haven't broken it up yet because I'm still working through the problem. It seems to crawl about 9 pages without a single error rather than every page I ask for, and after that it fails with "socket timeout" or "connect timeout". Here is sample output from an attempt to crawl 25 pages: http://pastebin.com/sP9UwGk9. The bigger issue is that I originally had this as a multithreaded crawler, but it failed roughly 50 times faster, so I slowed it down. I've added a bunch of Thread.sleep calls every time it connects to the site or starts a download, but I still hit the errors. Is there something I'm doing wrong? I know reddit has some kind of rate limiter, but I'm not sure that's the problem, since this program is quite slow to begin with (unless I use threads).
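
For reference, the kind of throttling I mean is roughly the sketch below: a single fetch helper with a longer timeout and an exponential backoff between retries. This is just an illustration, not my exact code; the class name, user agent, timeout and delay values are placeholders.

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetcher {
    private static final int MAX_RETRIES = 3;
    private static final long BASE_DELAY_MS = 2000;

    /*
     * fetch a page with a generous timeout; on a socket/connect timeout,
     * wait 2s, 4s, 8s... and try again before giving up
     */
    public static Document fetch(String url) throws IOException, InterruptedException {
        IOException last = null;
        for (int attempt = 0; attempt < MAX_RETRIES; attempt++) {
            try {
                return Jsoup.connect(url)
                        .userAgent("my-image-scraper/0.1")
                        .timeout(30000) // jsoup's default timeout is much shorter
                        .get();
            } catch (IOException e) {
                last = e;
                Thread.sleep(BASE_DELAY_MS * (1L << attempt));
            }
        }
        throw last;
    }
}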

Edit: console log http://pastebin.com/fhrjSeKx

1 Answer:

Answer 0 (score: 0)

Most reddit listings only return about 1000 items. Since your limit parameter is set to 100, you reach the end of the listing after about 10 pages, which is why you can't get any more data.
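
One way to make the crawl stop cleanly instead of timing out is to have getNextPage() report whether reddit actually returned another "after" token. This is only a sketch of that idea, reusing the fields and regex from your Scraper class:

    public boolean getNextPage() {
        /*
         * same logic as the original getNextPage(), but returns false when no
         * "after" token is found, i.e. the listing (roughly 1000 items) is
         * exhausted, so the caller can stop instead of re-requesting the same
         * url and timing out
         */
        try {
            Document doc = Jsoup.connect(url.replace(".xml", "")).get();
            for (Element n : doc.getElementsByTag("span")) {
                if (n.className().equals("nextprev")) {
                    Matcher matcher = Pattern.compile("after=\\w+").matcher(n.toString());
                    if (matcher.find()) {
                        after = matcher.group().substring(6);
                        count += 100;
                        url = String.format(
                                "http://www.reddit.com/r/%s.xml?limit=100&count=%d&after=%s",
                                subreddit, count, after);
                        return true;
                    }
                }
            }
        } catch (IOException e) {
            System.out.println("getNextPage() failed");
        }
        return false; // no "after" link found: end of the listing
    }

The main loop then becomes while (i < pages && scraper.getNextPage()) { ... } so it stops as soon as the listing runs out.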

If you need more items from the subreddit, you may have to pull data from other listings or from the search API.
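
For example, crawling a few different listings of the same subreddit (hot, new, top over different time ranges) and skipping duplicates gets you past the cap of any single listing. Below is a rough sketch of building those URLs in the same format the Scraper already uses; the listing paths follow reddit's usual URL scheme but are worth double-checking against the current API documentation.

import java.util.Arrays;
import java.util.List;

public class ListingUrls {
    /*
     * each listing is capped at ~1000 items, so crawl several of them and
     * de-duplicate the image urls on your side
     */
    public static List<String> forSubreddit(String subreddit) {
        return Arrays.asList(
                String.format("http://www.reddit.com/r/%s.xml?limit=100", subreddit),
                String.format("http://www.reddit.com/r/%s/new.xml?limit=100", subreddit),
                String.format("http://www.reddit.com/r/%s/top.xml?limit=100&t=all", subreddit),
                String.format("http://www.reddit.com/r/%s/top.xml?limit=100&t=year", subreddit));
    }
}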