我正在使用HtmlCleaner来解析html文档并遇到了一个小问题:
HtmlCleaner的属性指南(properties guide)中说,如果把useCdata标志(即useCdataForScriptAndStyle)设置为false,它就会在script和style标签内部查找并解析HTML。于是我做了如下尝试:
scala> val cleanerProps = new CleanerProperties()
cleanerProps: org.htmlcleaner.CleanerProperties = org.htmlcleaner.CleanerProperties@203e9b48
scala> cleanerProps.setUseCdataForScriptAndStyle(false)
scala> val clnr = new HtmlCleaner(cleanerProps)
clnr: org.htmlcleaner.HtmlCleaner = org.htmlcleaner.HtmlCleaner@4a5800a9
scala> val test = """<script language="javascript">
| document.write('<h1>Obviously a heading</h1>')
| </script>"""
test: java.lang.String =
<script language="javascript">
document.write('<h1>Obviously a heading</h1>')
</script>
scala> clnr.clean(test).getElementsByName("h1", true)
res61: Array[org.htmlcleaner.TagNode] = Array()
按照上面的设置,HtmlCleaner不是应该能找到h1吗?更让人困惑的是,下面这种写法却能正常找到:
scala> val test2 = """document.write('<h1>Obviously a heading</h1>')"""
test2: java.lang.String =
"document.write('<h1>Obviously a heading</h1>')"
scala> clnr.clean(test2).getElementsByName("h1", true)
res62: Array[org.htmlcleaner.TagNode] = Array(h1)
或者,把script标签换成style标签也可以:
scala> clnr.clean(test.replaceAllLiterally("script","style")).getElementsByName("h1", true)
res65: Array[org.htmlcleaner.TagNode] = Array(h1)
???