我正在使用带有Whitelist
自定义配置的jsoup 1.7.3。
显然它会清理文档中的所有HTML注释(<!-- ... -->
)。
它还会清理<!DOCTYPE ...>
元素。
Whitelist
按原样发表评论?!DOCTYPE
元素定义为包含任何属性的允许元素?答案 0 :(得分:3)
标准的JSoup类不可能这样做,它不依赖于WhiteList。它是org.jsoup.safety.Cleaner
。清理器使用Node traverser,它只允许元素和文本节点。也只解析了身体。所以head和doctype完全被忽略了。因此,要实现这一目标,您必须创建一个自定义清洁器。例如,如果你有像
<!DOCTYPE html>
<html>
<head>
<!-- This is a script -->
<script type="text/javascript">
function newFun() {
alert(1);
}
</script>
</head>
<body>
<map name="diagram_map">
<area id="area1" />
<area id="area2" />
</map>
<!-- This is another comment. -->
<div>Test</div>
</body>
</html>
您将首先创建一个复制原始清洁工的自定义清洁工。但请注意,软件包应org.jsoup.safety
,因为清理程序使用与白名单相关联的一些受保护方法。此外,没有必要扩展Cleaner,因为几乎所有方法都是私有的,内部节点遍历是最终的。
package org.jsoup.safety;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
public class CustomCleaner {
private Whitelist whitelist;
public CustomCleaner(Whitelist whitelist) {
Validate.notNull(whitelist);
this.whitelist = whitelist;
}
public Document clean(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
Document clean = Document.createShell(dirtyDocument.baseUri());
copyDocType(dirtyDocument, clean);
if (dirtyDocument.head() != null)
copySafeNodes(dirtyDocument.head(), clean.head());
if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body.
copySafeNodes(dirtyDocument.body(), clean.body());
return clean;
}
private void copyDocType(Document dirtyDocument, Document clean) {
dirtyDocument.traverse(new NodeVisitor() {
public void head(Node node, int depth) {
if (node instanceof DocumentType) {
clean.prependChild(node);
}
}
public void tail(Node node, int depth) { }
});
}
public boolean isValid(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
Document clean = Document.createShell(dirtyDocument.baseUri());
int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
return numDiscarded == 0;
}
private final class CleaningVisitor implements NodeVisitor {
private int numDiscarded = 0;
private final Element root;
private Element destination; // current element to append nodes to
private CleaningVisitor(Element root, Element destination) {
this.root = root;
this.destination = destination;
}
public void head(Node source, int depth) {
if (source instanceof Element) {
Element sourceEl = (Element) source;
if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
ElementMeta meta = createSafeElement(sourceEl);
Element destChild = meta.el;
destination.appendChild(destChild);
numDiscarded += meta.numAttribsDiscarded;
destination = destChild;
} else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
numDiscarded++;
}
} else if (source instanceof TextNode) {
TextNode sourceText = (TextNode) source;
TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
destination.appendChild(destText);
} else if (source instanceof Comment) {
Comment sourceComment = (Comment) source;
Comment destComment = new Comment(sourceComment.getData(), source.baseUri());
destination.appendChild(destComment);
} else if (source instanceof DataNode) {
DataNode sourceData = (DataNode) source;
DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri());
destination.appendChild(destData);
} else { // else, we don't care about comments, xml proc instructions, etc
numDiscarded++;
}
}
public void tail(Node source, int depth) {
if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
destination = destination.parent(); // would have descended, so pop destination stack
}
}
}
private int copySafeNodes(Element source, Element dest) {
CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
NodeTraversor traversor = new NodeTraversor(cleaningVisitor);
traversor.traverse(source);
return cleaningVisitor.numDiscarded;
}
private ElementMeta createSafeElement(Element sourceEl) {
String sourceTag = sourceEl.tagName();
Attributes destAttrs = new Attributes();
Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
int numDiscarded = 0;
Attributes sourceAttrs = sourceEl.attributes();
for (Attribute sourceAttr : sourceAttrs) {
if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
destAttrs.put(sourceAttr);
else
numDiscarded++;
}
Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
destAttrs.addAll(enforcedAttrs);
return new ElementMeta(dest, numDiscarded);
}
private static class ElementMeta {
Element el;
int numAttribsDiscarded;
ElementMeta(Element el, int numAttribsDiscarded) {
this.el = el;
this.numAttribsDiscarded = numAttribsDiscarded;
}
}
}
一旦你们两个都可以正常清洁。像
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.CustomCleaner;
import org.jsoup.safety.Whitelist;
public class CustomJsoupSanitizer {
public static void main(String[] args) {
try {
Document doc = Jsoup.parse(new File("t2.html"), "UTF-8");
CustomCleaner cleaner = new CustomCleaner(Whitelist.relaxed().addTags("script"));
Document doc2 = cleaner.clean(doc);
System.out.println(doc2.html());
} catch (IOException e) {
e.printStackTrace();
}
}
}
这将为您提供上面html的清理输出
<!DOCTYPE html>
<html>
<head>
<!-- This is a script -->
<script>
function newFun() {
alert(1);
}
</script>
</head>
<body>
<!-- This is another comment. -->
<div>
Test
</div>
</body>
</html>
您可以自定义清洁剂以符合您的要求。即避免头节点或脚本标记等...
答案 1 :(得分:0)
Jsoup Cleaner在这里没有给你机会(l.100):
} else { // else, we don't care about comments, xml proc instructions, etc
numDiscarded++;
}
只有Element
和TextNode
的实例可能会保留在已清理的HTML中。
您唯一的机会可能是解析文档,使用特殊的白名单替换注释和doctype,清理文档然后再次解析和替换特殊标记。