我在使用 javax.swing.text.ElementIterator 时遇到了奇怪的行为:它从不显示全部元素,而且显示的元素数量会随我使用的 ParserCallback 类型而变化。下面的测试使用的是我个人资料中的网站,但用任何其他较大的 HTML 文件都可以重现。
// some imports shown in case it's an import mix-up
import javax.swing.text.AttributeSet;
import javax.swing.text.BadLocationException;
import javax.swing.text.ChangedCharSetException;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTMLEditorKit.Parser;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Prints a description of {@code element} to standard output and then does the
 * same, recursively, for each of its child elements.
 *
 * <p>For every element one summary line is printed (string form, name, child
 * count, attribute count, leaf flag), followed by one line per attribute. When
 * the attribute being listed is the name attribute and its value is
 * {@link HTML.Tag#CONTENT}, the trimmed document text covered by the element is
 * printed as well.
 *
 * @param htmlDoc document the element belongs to; used to fetch the text
 *                between the element's start and end offsets
 * @param element element to describe
 * @throws BadLocationException if {@code htmlDoc.getText} rejects the
 *                              element's offset range
 */
public void printElement(HTMLDocument htmlDoc, Element element)
        throws BadLocationException
{
    AttributeSet attributes = element.getAttributes();
    System.out.println("element: '" + element.toString().trim() + "', name: '" + element.getName() + "', children: " + element.getElementCount() + ", attributes: " + attributes.getAttributeCount() + ", leaf: " + element.isLeaf());

    // getAttributeNames() is declared as Enumeration<?>; use the wildcard
    // form instead of the raw type so no unchecked warning is produced.
    Enumeration<?> attrEnum = attributes.getAttributeNames();
    while (attrEnum.hasMoreElements())
    {
        Object attr = attrEnum.nextElement();
        System.out.println("\tAttribute: '" + attr + "', Val: '" + attributes.getAttribute(attr) + "'");

        // Identity comparison is intentional: both sides are shared constants.
        if (attr == StyleConstants.NameAttribute
                && attributes.getAttribute(StyleConstants.NameAttribute) == HTML.Tag.CONTENT)
        {
            int startOffset = element.getStartOffset();
            int endOffset = element.getEndOffset();
            int length = endOffset - startOffset;
            System.out.printf("\t\tContent (%d-%d): '%s'\n", startOffset, endOffset, htmlDoc.getText(startOffset, length).trim());
        }
    }

    // Descend into the children so the whole subtree is printed.
    for (int i = 0; i < element.getElementCount(); i++)
    {
        Element child = element.getElement(i);
        printElement(htmlDoc, child);
    }
}
/**
 * Parses the HTML file {@code filename} into a fresh {@link HTMLDocument} and
 * prints every element the {@link ElementIterator} yields for that document.
 *
 * @param filename path of the HTML file to parse
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException           if reading or closing the file fails
 * @throws BadLocationException  if flushing the parsed content into the
 *                               document or printing an element fails
 */
public void tryParse(String filename)
        throws FileNotFoundException, IOException, BadLocationException
{
    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
    try
    {
        Parser parser = new ParserDelegator();
        HTMLEditorKit htmlKit = new HTMLEditorKit();
        HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();

        // Callback that builds htmlDoc's element structure from parser events.
        ParserCallback callback2 = htmlDoc.getReader(0);
        // Alternative no-op callback kept for experimentation. It is not
        // connected to htmlDoc, so parsing with it leaves the document unchanged.
        ParserCallback callback1 =
            new HTMLEditorKit.ParserCallback()
            {
            };

        parser.parse(in, callback2, true);
        // The document reader buffers the elements it creates; without this
        // flush the last pending batch never reaches htmlDoc and the iterator
        // below sees an incomplete tree.
        callback2.flush();

        // NOTE(review): if printElement itself recurses into children, elements
        // below the root are printed more than once here, because the iterator
        // also visits them individually.
        ElementIterator iterator = new ElementIterator(htmlDoc);
        Element element;
        while ((element = iterator.next()) != null)
        {
            printElement(htmlDoc, element);
        }
    }
    finally
    {
        // Close the file even when parsing or printing throws.
        in.close();
    }
}
在上面的测试中,使用 callback1 和使用 callback2 得到的结果并不相同。更奇怪的是,如果我在回调中实现相应的方法并让它们输出一些内容,可以看到解析器确实处理了整个网页,但 ElementIterator 仍然没有遍历到全部元素。
我也尝试使用htmlKit.read()而不是parser.parse(),但它仍然不起作用。
虽然我现在通过解析器回调函数(此处未显示)已经得到了想要的结果,但我以后可能还会用到 ElementIterator,所以仍然想知道它为什么没有按预期工作。如果这里有人有使用 ElementIterator 的经验,希望能够解答。
更新:完整的Java Source上传到这里: http://home.snafu.de/tilman/tmp/Main.java
答案 0 :(得分:1)
使用原文链接(here)中所示的方法,我没有发现你描述的问题。我添加了 println(),所有元素似乎都在。
附录:我不确定你的 tryParse() 为什么会失败,但你的 printElement() 配合下面的 main() 似乎可以正常工作:
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Enumeration;
import javax.swing.text.AttributeSet;
import javax.swing.text.BadLocationException;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Loads an HTML file into an {@link HTMLDocument} and prints every element an
 * {@link ElementIterator} yields for it, together with each element's
 * attributes.
 *
 * @see <a href="https://stackoverflow.com/questions/2882782">Stack Overflow question 2882782</a>
 */
public class NewMain {

    /** File that is parsed when no path is given on the command line. */
    private static final String DEFAULT_FILE = "test.html";

    /**
     * Parses an HTML file and prints all elements of the resulting document.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to
     *             parse, defaulting to {@code test.html} in the working directory
     * @throws Exception if the file cannot be read or an element's text cannot
     *                   be fetched from the document
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;

        HTMLEditorKit htmlKit = new HTMLEditorKit();
        HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();

        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            htmlKit.read(reader, htmlDoc, 0);
        } finally {
            // Release the file handle even when reading fails.
            reader.close();
        }

        ElementIterator iterator = new ElementIterator(htmlDoc);
        Element element;
        while ((element = iterator.next()) != null) {
            printElement(htmlDoc, element);
        }
    }

    /**
     * Prints one summary line for {@code element}, then one line per attribute.
     * When the attribute being listed is the name attribute and its value is
     * {@link HTML.Tag#CONTENT}, the trimmed document text covered by the element
     * is printed too. Child elements are not visited here; the caller's
     * iterator supplies them.
     *
     * @param htmlDoc document used to fetch the element's text
     * @param element element to describe
     * @throws BadLocationException if {@code htmlDoc.getText} rejects the
     *                              element's offset range
     */
    private static void printElement(HTMLDocument htmlDoc, Element element)
            throws BadLocationException {
        AttributeSet attrSet = element.getAttributes();
        System.out.println(""
                + "Element: '" + element.toString().trim()
                + "', name: '" + element.getName()
                + "', children: " + element.getElementCount()
                + ", attributes: " + attrSet.getAttributeCount()
                + ", leaf: " + element.isLeaf());

        // The name attribute does not change while listing; look it up once.
        Object tag = attrSet.getAttribute(StyleConstants.NameAttribute);

        // Wildcard form matches the declared return type and avoids a raw type.
        Enumeration<?> attrNames = attrSet.getAttributeNames();
        while (attrNames.hasMoreElements()) {
            Object attr = attrNames.nextElement();
            System.out.println("    Attribute: '" + attr + "', Value: '"
                    + attrSet.getAttribute(attr) + "'");

            // Identity comparison is intentional: both sides are shared constants.
            if (attr == StyleConstants.NameAttribute
                    && tag == HTML.Tag.CONTENT) {
                int startOffset = element.getStartOffset();
                int endOffset = element.getEndOffset();
                int length = endOffset - startOffset;
                System.out.printf("        Content (%d-%d): '%s'\n", startOffset,
                        endOffset, htmlDoc.getText(startOffset, length).trim());
            }
        }
    }
}