/**
 * Processes one fetched page.
 *
 * <p>If the page URL matches the forum listing pattern, collects links from the
 * page document and returns them. Otherwise writes the raw page content to a
 * numbered file and returns {@code null}.
 *
 * @param page the fetched page
 * @return the collected links for a listing page, or {@code null} for any other page
 */
public Links visitAndGetNextLinks(Page page) {
    String pageUrl = page.getUrl();
    boolean isForumListing = Pattern.matches("http://bbs.gter.net/forum-1033-.*", pageUrl);

    if (!isForumListing) {
        // Not a listing page: dump the raw bytes to disk instead of extracting links.
        byte[] rawBytes = page.getContent();
        try {
            // `id` is declared outside this method; presumably an atomic counter
            // that gives each saved page a distinct file name.
            String targetPath = "/Users/gary/Dropbox/hooom/program/html" + id.incrementAndGet() + ".txt";
            FileUtils.writeFileWithParent(targetPath, rawBytes);
            System.out.println("save page " + page.getUrl());
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return null;
    }

    Links nextLinks = new Links();
    // NOTE(review): the second argument is a URL regex passed as a plain String.
    // The error reported for this code suggests this overload parses the string
    // as a selector query rather than a regex - confirm against the library docs.
    nextLinks.addAllFromDocument(page.getDoc(), "http://bbs.gter.net/thread.*");
    System.out.println(nextLinks + "\n");
    return nextLinks;
}
运行上面的代码之后,我得到了下面的错误:
“无法解析查询 'http://bbs.gter.net/thread.*':在 '://bbs.gter.net/thread.*' 处遇到意外的标记”
我该怎么办?
答案 0 :(得分:0)
你应该这样做:
// Rule object that holds the URL regex; it is passed to addAllFromDocument in
// visitAndGetNextLinks instead of a raw pattern string.
// NOTE(review): presumably a field of the crawler class - the enclosing class
// header is not visible here.
RegexRule regexRule = new RegexRule();
{
// Initializer block: registers the thread-URL pattern on the rule.
regexRule.addRule("http://bbs.gter.net/thread.*");
}
/**
 * Processes one fetched page.
 *
 * <p>If the page URL matches the forum listing pattern, collects links from the
 * page document using {@code regexRule} and returns them. Otherwise writes the
 * raw page content to a numbered file and returns {@code null}.
 *
 * @param page the fetched page
 * @return the collected links for a listing page, or {@code null} for any other page
 */
public Links visitAndGetNextLinks(Page page) {
    String pageUrl = page.getUrl();
    boolean isForumListing = Pattern.matches("http://bbs.gter.net/forum-1033-.*", pageUrl);

    if (!isForumListing) {
        // Not a listing page: dump the raw bytes to disk instead of extracting links.
        byte[] rawBytes = page.getContent();
        try {
            // `id` is declared outside this method; presumably an atomic counter
            // that gives each saved page a distinct file name.
            String targetPath = "/Users/gary/Dropbox/hooom/program/html" + id.incrementAndGet() + ".txt";
            FileUtils.writeFileWithParent(targetPath, rawBytes);
            System.out.println("save page " + page.getUrl());
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return null;
    }

    Links nextLinks = new Links();
    // The link filter is supplied as a RegexRule object rather than a pattern string.
    nextLinks.addAllFromDocument(page.getDoc(), regexRule);
    System.out.println(nextLinks + "\n");
    return nextLinks;
}