I want to extract all of the article content from a website using some web scraping/crawling method.
The problem is that I can get the content from a single page, but not from the redirected/linked pages. Can anyone please suggest a proper solution? Here is my code:
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Main3 {
    public static void main(String[] argv) throws Exception {
        HTMLDocument doc = new HTMLDocument() {
            public HTMLEditorKit.ParserCallback getReader(int pos) {
                return new HTMLEditorKit.ParserCallback() {
                    public void handleText(char[] data, int pos) {
                        System.out.println(data);
                    }
                };
            }
        };
        URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
        URLConnection conn = url.openConnection();
        Reader rd = new InputStreamReader(conn.getInputStream());
        // Note: this writer is opened but never written to or closed in this version.
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream("ram.txt"), "UTF-8");
        EditorKit kit = new HTMLEditorKit();
        kit.read(rd, doc, 0);
        try {
            Document docs = Jsoup.connect("http://tamilblog.ishafoundation.org/").get();
            Elements links = docs.select("a[href]");
            Elements elements = docs.select("*");
            System.out.println("Total Links :" + links.size());
            for (Element element : elements) {
                System.out.println(element.ownText());
            }
            for (Element link : links) {
                // "a:href" is not a valid jsoup attribute key; "abs:href" returns the resolved absolute URL
                System.out.println(" * a: link :" + link.attr("abs:href"));
                System.out.println(" * a: text :" + link.text());
                System.out.println(" * a: alt :" + link.attr("alt"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
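For reference, jsoup follows HTTP redirects by default, so the more common reason linked pages come back empty is that the href values are relative and are used unresolved. A minimal sketch of the two relevant jsoup calls (reusing the same blog URL from the question):

// Redirects are followed by default, but it can be made explicit:
Document page = Jsoup.connect("http://tamilblog.ishafoundation.org/")
        .followRedirects(true)
        .get();
// absUrl("href") resolves relative links against the page's base URL,
// which a plain attr("href") does not do:
for (Element link : page.select("a[href]")) {
    System.out.println(link.absUrl("href"));
}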
Answer 0 (score: 0)
You should use an existing crawling framework such as Apache Nutch or StormCrawler; they already handle link extraction, redirect following, and politeness, which is what hand-rolled code like this tends to miss.
Answer 1 (score: 0)
Here is the solution:
package com.github.davidepastore.stackoverflow34014436;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* Stackoverflow 34014436 question.
*
*/
public class App {
    public static void main(String[] args) throws URISyntaxException,
            IOException, BadLocationException {
        HTMLDocument doc = new HTMLDocument() {
            public HTMLEditorKit.ParserCallback getReader(int pos) {
                return new HTMLEditorKit.ParserCallback() {
                    public void handleText(char[] data, int pos) {
                        System.out.println(data);
                    }
                };
            }
        };
        URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
        URLConnection conn = url.openConnection();
        Reader rd = new InputStreamReader(conn.getInputStream());
        OutputStreamWriter writer = new OutputStreamWriter(
                new FileOutputStream("ram.txt"), "UTF-8");
        EditorKit kit = new HTMLEditorKit();
        kit.read(rd, doc, 0);
        try {
            Document docs = Jsoup.connect(
                    "http://tamilblog.ishafoundation.org/").get();
            Elements links = docs.select("a[href]");
            Elements elements = docs.select("*");
            System.out.println("Total Links :" + links.size());
            for (Element element : elements) {
                System.out.println(element.ownText());
            }
            for (Element link : links) {
                // Read the real href attribute and skip empty or "#" anchors
                String hrefUrl = link.attr("href");
                if (!"#".equals(hrefUrl) && !hrefUrl.isEmpty()) {
                    System.out.println(" * a: link :" + hrefUrl);
                    System.out.println(" * a: text :" + link.text());
                    writer.write(link.text() + " => " + hrefUrl + "\n");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always close the writer so ram.txt is flushed to disk
            writer.close();
        }
    }
}
Here we use the writer to write the text and URL of every link into the ram.txt file.
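Note that this code only lists the links found on the start page; it does not download the articles behind them, which is what the question actually asks for. A minimal sketch of a recursive jsoup crawler that visits each link and appends the page text to ram.txt could look like this (the depth limit, the same-site filter, and the crawl method name are illustrative assumptions, not part of the original answer):

import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ArticleCrawler {
    private static final Set<String> visited = new HashSet<String>();

    public static void main(String[] args) throws IOException {
        // UTF-8 matches the encoding the original code used for ram.txt
        PrintWriter out = new PrintWriter("ram.txt", "UTF-8");
        try {
            crawl("http://tamilblog.ishafoundation.org/", out, 2);
        } finally {
            out.close();
        }
    }

    // Fetch a page, save its text, then recurse into its on-site links.
    private static void crawl(String url, PrintWriter out, int depth) {
        if (depth == 0 || !visited.add(url)) {
            return; // depth exhausted or page already seen
        }
        try {
            Document page = Jsoup.connect(url).get();
            out.println("==== " + url + " ====");
            out.println(page.body().text());
            for (Element link : page.select("a[href]")) {
                String next = link.absUrl("href"); // resolve relative hrefs
                // Stay on the same site; this filter is an illustrative assumption
                if (next.startsWith("http://tamilblog.ishafoundation.org")) {
                    crawl(next, out, depth - 1);
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url + ": " + e.getMessage());
        }
    }
}

A real crawl would also want politeness delays and an overall URL limit, which is exactly what the frameworks suggested in Answer 0 provide out of the box.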