我想从网站的所有链接中获取所有文章内容

时间:2015-12-01 07:00:54

标签: java url web-crawler html-parsing jsoup

我想使用任意一种网络爬取/抓取方法,从网站中提取所有文章内容。

问题是我只能从单个页面获取内容,而无法从重定向后的链接中获取内容。请给我一个合适的解决方案。

import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;

import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Main3 {
  public static void main(String[] argv) throws Exception {
    HTMLDocument doc = new HTMLDocument() {
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };

    URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream("ram.txt"), "UTF-8");

    EditorKit kit = new HTMLEditorKit();
    kit.read(rd, doc, 0);
    try {
        Document docs = Jsoup.connect("http://tamilblog.ishafoundation.org/").get();

         Elements links = docs.select("a[href]");

         Elements elements = docs.select("*");
         System.out.println("Total Links :"+links.size());



         for (Element element : elements) {
             System.out.println(element.ownText());
         }
         for (Element link : links) {
            System.out.println(" * a: link :"+ link.attr("a:href"));
             System.out.println(" * a: text :"+ link.text());

            System.out.println(" * a: text :"+ link.text());
          System.out.println(" * a: Alt :"+ link.attr("alt"));
        System.out.println(link.attr("p"));
        }


    } catch (Exception e) {
        e.printStackTrace();
    }


  }
  }`

2 个答案:

答案 0 :(得分:0)

您应该使用现有的抓取工具,例如 Apache Nutch 或 StormCrawler。

答案 1 :(得分:0)

这是解决方案:

package com.github.davidepastore.stackoverflow34014436;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;

import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Stackoverflow 34014436 question.
 *
 */
public class App {
    /**
     * Fetches http://tamilblog.ishafoundation.org/, prints every text node via
     * a Swing HTML parser callback, then uses jsoup to print each element's
     * own text and every non-empty, non-anchor link, writing
     * "link text => URL" lines to ram.txt.
     *
     * @param args unused command-line arguments
     * @throws URISyntaxException   if the hard-coded URL is malformed
     * @throws IOException          on network or file I/O failure
     * @throws BadLocationException if the Swing editor kit cannot insert text
     */
    public static void main(String[] args) throws URISyntaxException,
            IOException, BadLocationException {
        // Swing HTML document whose parser callback echoes every text node
        // encountered while the page is being read.
        HTMLDocument doc = new HTMLDocument() {
            public HTMLEditorKit.ParserCallback getReader(int pos) {
                return new HTMLEditorKit.ParserCallback() {
                    public void handleText(char[] data, int pos) {
                        System.out.println(data);
                    }
                };
            }
        };

        URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
        URLConnection conn = url.openConnection();

        // try-with-resources closes the reader (leaked in the original) as
        // well as the writer; the explicit "UTF-8" on the reader matches the
        // writer's charset instead of using the platform default.
        try (Reader rd = new InputStreamReader(conn.getInputStream(), "UTF-8");
                OutputStreamWriter writer = new OutputStreamWriter(
                        new FileOutputStream("ram.txt"), "UTF-8")) {

            EditorKit kit = new HTMLEditorKit();
            kit.read(rd, doc, 0);

            try {
                Document docs = Jsoup.connect(
                        "http://tamilblog.ishafoundation.org/").get();

                Elements links = docs.select("a[href]");
                Elements elements = docs.select("*");
                System.out.println("Total Links :" + links.size());

                // Dump the own-text of every element on the page.
                for (Element element : elements) {
                    System.out.println(element.ownText());
                }
                for (Element link : links) {
                    String hrefUrl = link.attr("href");
                    // Skip empty hrefs and in-page "#" anchors.
                    if (!"#".equals(hrefUrl) && !hrefUrl.isEmpty()) {
                        System.out.println(" * a: link :" + hrefUrl);
                        System.out.println(" * a: text :" + link.text());
                        writer.write(link.text() + " => " + hrefUrl + "\n");
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}

这里我们使用 writer 将每个链接的文本和 URL 写入 ram.txt 文件。