如何在网站上获取所有超链接及其段落?

时间:2015-12-10 08:41:26

标签: java web-scraping web-crawler jsoup urlconnection

我想抓取页面上的所有超链接并保存到一个 .txt 文件中,同时把每个超链接指向的文章里的所有段落提取出来,按文章标题分别保存为文本文件。

下面是我的代码,这个问题我已经折腾了两个月,始终写不出正确的爬取/抓取逻辑。希望有人能帮我修改一下。

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;

import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

     public class App {
     public static void main(String[] args) throws URISyntaxException,
        IOException, BadLocationException {
    HTMLDocument doc = new HTMLDocument() {
        public HTMLEditorKit.ParserCallback getReader(int pos) {
            return new HTMLEditorKit.ParserCallback() {
                public void handleText(char[] data, int pos) {
                    System.out.println(data);
                }
            };
        }
    };

    URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    OutputStreamWriter writer = new OutputStreamWriter(
            new FileOutputStream("ram.txt"), "UTF-8");

    EditorKit kit = new HTMLEditorKit();
    kit.read(rd, doc, 0);
    try {
        Document docs = Jsoup.connect(
                "http://tamilblog.ishafoundation.org/").get();

        Elements links = docs.select("a[href]");

        Elements elements = docs.select("*");
        System.out.println("Total Links :" + links.size());

        for (Element element : elements) {
            System.out.println(element.ownText());
        }
        for (Element link : links) {
            String hrefUrl = link.attr("href");
            if (!"#".equals(hrefUrl) && !hrefUrl.isEmpty()) {
                System.out.println(" * a: link :" + hrefUrl);
                System.out.println(" * a: text :" + link.text());
                Document document = Jsoup.connect(hrefUrl)
                        .timeout(0) //Infinite timeout
                        .get();
                String html = document.toString();
                writer.write(html);
            }
        }

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
    }
}
 }

1 个答案:

答案 0(得分:0)

尝试这样的事情

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class NewClass {

    /**
     * Fetches the blog front page and, for every article inside the main
     * content section, prints its title (anchor text) and entry summary.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        final String startPage = "http://tamilblog.ishafoundation.org";
        Document page = Jsoup.connect(startPage).get();

        // Narrow to the content section first, then pick out each article.
        Elements contentSection = page.select("section#content");
        Elements articles = contentSection.select("article");

        for (Element article : articles) {
            String title = article.select("a").text();
            String summary = article.select("div.entry-summary").text();
            System.out.println("Title : \n" + title);
            System.out.println("Article summary: \n" + summary);
        }
    }
}