我的HTML包含以下格式的标记:
<div class="author"><a href="/user/1" title="View user profile.">Apple</a> - October 22, 2009 - 01:07</div>
我想从每个这样的标签中提取日期,例如 "October 22, 2009 - 01:07"。
我已经实现了javax.swing.text.html.HTMLEditorKit.ParserCallback,如下所示:
/**
 * Parser callback from the question: collects text chunks reported by the
 * parser once a {@code <div class="author">} start tag has been seen.
 *
 * As written it does not isolate the date; see the review notes below.
 */
class HTMLParseListerInner extends HTMLEditorKit.ParserCallback {
// Every text chunk collected while isDivLink is set.
private ArrayList<String> foundDates = new ArrayList<String>();
// Set to true by handleStartTag on an author div; nothing in this class ever sets it back to false.
private boolean isDivLink = false;
/**
 * Stores the reported text chunk when the author-div flag is set.
 *
 * @param data characters of the text chunk
 * @param pos  position reported by the parser (unused here)
 */
public void handleText(char[] data, int pos) {
if(isDivLink)
// The first chunk reported inside the div is the link text, and because the
// flag is never cleared, every later chunk is added as well.
foundDates.add(new String(data)); // Extracts "Apple" instead of the date.
}
/**
 * Sets the author-div flag when a div start tag carries class="author".
 *
 * @param t   tag being opened
 * @param a   attributes of the tag
 * @param pos position reported by the parser (unused here)
 */
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
// The class attribute is read for every start tag, not only for div.
String divValue = (String)a.getAttribute(HTML.Attribute.CLASS);
// NOTE(review): == compares String references, not contents; comparing the tag
// itself (t == HTML.Tag.DIV) or using equals() would not depend on interning.
if (t.toString() == "div" && divValue != null && divValue.equals("author"))
isDivLink = true;
}
}
但是,上面的解析器返回“Apple”,它位于标记内的超链接内。如何修复解析器以提取日期?
答案 0 :(得分:1)
覆盖 handleEndTag 并检查 "a" 标签?
不过,这个HTML解析器出自90年代早期,这些回调方法的行为也没有明确的规范说明。
答案 1 :(得分:0)
import java.io.*;
import java.util.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
/**
 * Parser callback that prints the text found directly inside each
 * {@code <div class="author">} element, leaving out any text that belongs to
 * a hyperlink ({@code <a>}) inside that div.
 *
 * <p>For {@code <div class="author"><a ...>Apple</a> - October 22, 2009 - 01:07</div>}
 * the printed line is {@code " - October 22, 2009 - 01:07"}.
 */
public class ParserCallbackDiv extends HTMLEditorKit.ParserCallback
{
    /** True while the parser is between the start and end tag of an author div. */
    private boolean isDivLink = false;

    /** True while the parser is inside an anchor within the author div. */
    private boolean insideAnchor = false;

    /** Number of currently open divs nested inside the author div. */
    private int nestedDivDepth = 0;

    /** Text collected so far for the author div currently being parsed. */
    private final StringBuilder divText = new StringBuilder();

    /**
     * Prints the collected text when the author div itself is closed.
     *
     * @param tag tag being closed
     * @param pos position reported by the parser (unused)
     */
    @Override
    public void handleEndTag(HTML.Tag tag, int pos)
    {
        // End tags outside an author div must not print anything.
        if (!isDivLink)
        {
            return;
        }

        if (tag.equals(HTML.Tag.A))
        {
            insideAnchor = false;
        }
        else if (tag.equals(HTML.Tag.DIV))
        {
            if (nestedDivDepth > 0)
            {
                // Closing a div nested inside the author div, not the author div itself.
                nestedDivDepth--;
            }
            else
            {
                System.out.println(divText.toString());
                isDivLink = false;
                insideAnchor = false;
            }
        }
    }

    /**
     * Starts collecting when a {@code <div class="author">} opens, and tracks
     * nested divs and anchors while inside it.
     *
     * @param tag tag being opened
     * @param a   attributes of the tag
     * @param pos position reported by the parser (unused)
     */
    @Override
    public void handleStartTag(HTML.Tag tag, MutableAttributeSet a, int pos)
    {
        if (tag.equals(HTML.Tag.DIV))
        {
            if (isDivLink)
            {
                nestedDivDepth++;
            }
            else if ("author".equals(a.getAttribute(HTML.Attribute.CLASS)))
            {
                isDivLink = true;
                insideAnchor = false;
                nestedDivDepth = 0;
                // Drop anything left over from a previous author div.
                divText.setLength(0);
            }
        }
        else if (isDivLink && tag.equals(HTML.Tag.A))
        {
            insideAnchor = true;
        }
    }

    /**
     * Appends a text chunk when it lies inside the author div but outside any
     * anchor; all other text is ignored.
     *
     * @param data characters of the text chunk
     * @param pos  position reported by the parser (unused)
     */
    @Override
    public void handleText(char[] data, int pos)
    {
        if (isDivLink && !insideAnchor)
        {
            divText.append(data);
        }
    }

    /**
     * Parses a sample author div and prints the text extracted from it.
     *
     * @param args ignored
     * @throws IOException if reading the sample markup fails
     */
    public static void main(String[] args)
        throws IOException
    {
        // The space before title= keeps the two attributes separated.
        String file = "<div class=\"author\"><a href=\"/user/1\" " +
            "title=\"View user profile.\">Apple</a> - October 22, 2009 - 01:07</div>";
        StringReader reader = new StringReader(file);
        ParserCallbackDiv parser = new ParserCallbackDiv();
        // An IOException propagates through the declared throws clause instead of being swallowed.
        new ParserDelegator().parse(reader, parser, true);
    }
}