使用 Jsoup(Java)按给定的选择器格式扫描网站链接

时间:2015-10-15 04:43:50

标签: java jsoup

我想按照选择器的层级扫描网页中的所有链接,各层级的选择器我已经给出。

下面是我的代码。目前选择器的层级是以固定方式写死的;我希望改用循环、递归或其他任何方式来读取,使代码更灵活,因为将来选择器的层级可能会超过 2 级。

/**
 * Entry point: gathers the links reachable from the java2s.com home page via
 * {@code website} and prints the resulting newline-separated list to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
        System.out.println(website("http://www.java2s.com/"));
    }

    /**
     * Collects links starting at {@code url} by applying one CSS selector per crawl depth.
     *
     * <p>The first selector is applied to {@code url}; every link it yields is recorded and
     * then fetched, the second selector is applied to that page, and so on. To crawl deeper,
     * append another selector to the array below; no further code changes are needed.
     *
     * <p>If fetching any page fails with an {@link IOException}, the failure is logged and the
     * crawl stops; the links gathered up to that point are still returned.
     *
     * @param url absolute URL of the page to start from
     * @return the collected absolute links, each followed by {@code "\n"}
     */
    private static String website(String url) {
        // One selector per depth: index 0 for the start page, index 1 for pages linked from it.
        String[] selectors = {"div.col-md-9 li a", "div#sidebar ul li a"};
        StringBuilder lstLink = new StringBuilder();
        try {
            collectLinks(url, selectors, 0, lstLink);
        } catch (IOException ex) {
            Logger.getLogger(AWebsite.class.getName()).log(Level.SEVERE, null, ex);
        }
        return lstLink.toString();
    }

    /**
     * Appends to {@code out} every link matched by {@code selectors[depth]} on {@code url},
     * descending into each link with the next selector until the selectors are exhausted.
     *
     * @param url       absolute URL of the page to fetch
     * @param selectors CSS selectors, one per depth
     * @param depth     index into {@code selectors} for this page
     * @param out       accumulator shared by the whole crawl
     * @throws IOException if a page cannot be fetched
     */
    private static void collectLinks(String url, String[] selectors, int depth, StringBuilder out)
            throws IOException {
        if (depth >= selectors.length) {
            return;
        }
        Document page = Jsoup.connect(url).get();
        for (Element anchor : page.select(selectors[depth])) {
            // "abs:href" resolves relative links against the page's base URI. The raw "href"
            // value may be relative and cannot be passed to Jsoup.connect as-is.
            String link = anchor.attr("abs:href");
            if (link.isEmpty()) {
                // Missing or unresolvable href: there is nothing to record or follow.
                continue;
            }
            out.append(link).append("\n");
            collectLinks(link, selectors, depth + 1, out);
        }
    }

1 个答案:

答案 0 :(得分:4)

请看下面的递归实现。

    static String levels[] = {"div.col-md-9 li a", "div#sidebar ul li a"};

    /**
     * Recursively collects links, applying {@code levels[level]} to the page at {@code href}
     * and descending into every matched link with the next selector.
     *
     * <p>If a page cannot be fetched, the stack trace is printed and whatever was collected
     * from that page so far is returned; sibling pages are still crawled.
     *
     * @param href  absolute URL of the page to fetch
     * @param level index into {@code levels} selecting the CSS selector for this page
     * @return the collected absolute links, each followed by {@code "\n"}; an empty string
     *         when {@code href} is null or empty or {@code level} is outside {@code levels}
     */
    private static String getRecursive(String href, int level) {
        // Nothing to crawl without a URL, or once the selector list is exhausted.
        if (href == null || href.isEmpty() || level < 0 || level >= levels.length) {
            return "";
        }

        StringBuilder links = new StringBuilder();
        try {
            Document doc = Jsoup.connect(href).get();
            Elements elements = doc.select(levels[level]);

            for (Element element : elements) {
                // Test the resolved URL rather than the raw attribute: a non-empty href can
                // still resolve to an empty "abs:href", which must not be recorded or fetched.
                String target = element.attr("abs:href");
                if (target.isEmpty()) {
                    continue;
                }
                links.append(target).append("\n");
                links.append(getRecursive(target, level + 1));
            }
        } catch (IOException e1) {
            // Best effort: report the failed page and keep what was gathered so far.
            e1.printStackTrace();
        }
        return links.toString();
    }



/**
 * Entry point: crawls from the java2s.com home page starting at selector level 0
 * and prints the collected links to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    System.out.println(getRecursive("http://www.java2s.com/", 0));
}