在获取 HTML 页面的全文(使用 Tika 或 jsoup)时,是否可以在每个 `li` 元素之间插入换行?
目前我得到的所有文本都是紧凑地连在一起的。
谢谢
答案 0 :(得分:0)
这是Andrew Phillips的改进版本。
**Java**
package com.github.davidepastore.stackoverflow33947074;
import java.io.IOException;
import java.io.InputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
/**
 * Stack Overflow 33947074: prints the text of an HTML page, starting a new
 * line before the content of every {@code li} element.
 */
public class App
{
    /**
     * Loads {@code file.html} from the context class loader, parses it with
     * jsoup and prints the extracted text prefixed with {@code "Result: "}.
     *
     * @param args Command-line arguments (unused).
     * @throws IOException If the resource is missing or cannot be read.
     */
    public static void main( String[] args ) throws IOException {
        ClassLoader classloader = Thread.currentThread()
                .getContextClassLoader();
        // try-with-resources releases the stream even when parsing throws.
        try (InputStream is = classloader.getResourceAsStream("file.html")) {
            // getResourceAsStream signals "not found" with null, not an exception.
            if (is == null) {
                throw new IOException("Resource not found on classpath: file.html");
            }
            Document document = Jsoup.parse(is, "UTF-8", "");
            Element element = document.select("html").first();
            String text = getText(element);
            System.out.println("Result: " + text);
        }
    }

    /**
     * Get the custom text from the given {@link Element}: the text of all
     * descendant text nodes in document order, with a {@code "\n"} inserted
     * before the content of each {@code li} element.
     *
     * @param element The {@link Element} from which get the custom text.
     * @return Returns the custom text.
     */
    private static String getText(Element element) {
        StringBuilder buffer = new StringBuilder();
        appendText(element, buffer);
        return buffer.toString();
    }

    /**
     * Recursively appends the custom text of {@code element} to {@code buffer}.
     * A single shared buffer avoids re-copying the accumulated string at every
     * recursion level.
     *
     * @param element The {@link Element} whose child nodes are visited.
     * @param buffer  The accumulator receiving the text.
     */
    private static void appendText(Element element, StringBuilder buffer) {
        for (Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                buffer.append(((TextNode) child).text());
            } else if (child instanceof Element) {
                Element childElement = (Element) child;
                // Each list item starts on its own line.
                if (childElement.tag().getName().equalsIgnoreCase("li")) {
                    buffer.append("\n");
                }
                appendText(childElement, buffer);
            }
        }
    }
}
**file.html**
<!-- Sample input page: a title, a paragraph containing a link, and an
     unordered list of three items (plain text, a link, and text followed
     by a bold span). -->
<html>
<head>
<title>Try jsoup</title>
</head>
<body>
<p>This is <a href="http://jsoup.org/">jsoup</a>.</p>
<ul>
<li>First element</li>
<li><a href="#">Second element</a></li>
<li>Third element <b>Additional for third element</b></li>
</ul>
</body>
</html>
**输出**
Result: Try jsoup This is jsoup.
First element
Second element
Third element Additional for third element