目前,我可以打印网页中所有链接的地址(href),但无法打印链接上显示的文字。
例如:
<a class="fbl" href="/preferences?hl=en" jsaction="foot.cst" id="fsettl">Settings</a>
代码只能打印“/preferences?hl=en”,但不能打印链接文本,即“Settings”。
/**
 * Collects the href value of every anchor tag in the HTML page at the given address.
 *
 * Each non-null href is printed to standard output and appended to the result list.
 * Only the tag's attributes are inspected; the visible text between the opening and
 * closing anchor tags is never read, so link captions are not part of the output.
 *
 * NOTE(review): this excerpt stops right after the try block. The catch/finally
 * clause, the return statement and the method's closing brace are not shown here.
 *
 * @param uriStr address of the page to fetch; it is parsed with {@code new URI(uriStr)}
 * @return presumably the {@code result} list of href strings; the return statement
 *         lies outside this excerpt, so confirm against the full source
 */
public static List getLinks(String uriStr) {
// Raw List on the left-hand side: element types are not checked at compile time.
List result = new ArrayList<String>();
// Create a reader on the HTML content
try{
System.out.println("in the getlinks try");
URL url = new URI(uriStr).toURL();
URLConnection conn = url.openConnection();
// No charset is passed to the reader, and the reader is not closed within this excerpt.
Reader rd = new InputStreamReader(conn.getInputStream());
// Parse the HTML
EditorKit kit = new HTMLEditorKit();
HTMLDocument doc = (HTMLDocument)kit.createDefaultDocument();
kit.read(rd, doc, 0);
// Find all the A elements in the HTML document
HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
while (it.isValid()) {
// The iterator exposes the tag's attribute set; only HREF is looked up here.
SimpleAttributeSet s = (SimpleAttributeSet)it.getAttributes();
String link = (String)s.getAttribute(HTML.Attribute.HREF);
// Anchors without an href attribute are skipped.
if (link != null) {
// Add the link to the result list
System.out.println(link);
//System.out.println("link print finished");
result.add(link);
}
//System.out.println(link);
it.next();
}
}
我如何才能同时打印出链接的文本内容?
答案 0 :(得分:0)
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.Iterator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Prints the target address and the visible text of every link on a web page.
 *
 * The page is downloaded and parsed with jsoup; for each {@code <a>} element the
 * value of its {@code href} attribute and its text content are written to
 * standard output.
 */
public class PrintURL {

    /** Page that is fetched when no address is given on the command line. */
    private static final String DEFAULT_URL = "https://www.google.co.in/";

    /**
     * Fetches a page and prints every link together with its caption.
     *
     * @param args optional; {@code args[0]} is the address of the page to scan.
     *             When absent, {@link #DEFAULT_URL} is used.
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception
    {
        String pageUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;

        // A single jsoup request both downloads and parses the page. The earlier
        // version also opened a second, never-read stream to the same address.
        Document page = Jsoup.connect(pageUrl).get();
        Elements anchors = page.getElementsByTag("a");

        for (Element anchor : anchors) {
            // attr("href") returns the attribute as written in the markup;
            // text() returns the caption shown between <a> and </a>.
            System.out.println("\nLink: " + anchor.attr("href"));
            System.out.println("Link Name: " + anchor.text());
        }
    }
}