该计划的基础知识; 根据用户在Controller(main)中指定的PerentUrl和Keyword运行webcrawler。如果在页面文本中找到关键字,则会将Url保存到数组列表中;
ArrayList UrlHits = new ArrayList();
爬网完成后,程序将调用main中WriteFile类的方法来编写包含所有UrlHits的html文件。
WriteFile f = new WriteFile();
f.openfile(Search);
f.StartHtml();
f.addUrl(UrlHits);
f.EndHtml();
f.closeFile();
除了f.addUrl以外的所有工作都正常,创建一个具有正确名称和目录的html文件。但是ArrayList中没有任何字符串输出到文件。
public static void main(String[] args) throws Exception {
RobotstxtConfig robotstxtConfig2 = new RobotstxtConfig();
String crawlStorageFolder = "/Users/Jake/Documents/sem 2/FYP/Crawler/TestData";
int numberOfCrawlers = 1;
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
config.setMaxDepthOfCrawling(21);
config.setMaxPagesToFetch(24);
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
Scanner perentUrl = new Scanner(System.in);
System.out.println("Enter full perant Url... example. http://www.domain.co.uk/");
String Url = perentUrl.nextLine();
Scanner keyword = new Scanner(System.in);
System.out.println("Enter search term... example. Pies");
String Search = keyword.nextLine();
System.out.println("Searching domain :" + Url);
System.out.println("Keyword:" + Search);
ArrayList<String> DomainsToInv = new ArrayList<String>();
ArrayList<String> SearchTerms = new ArrayList<String>();
ArrayList<String> UrlHits = new ArrayList<String>();
DomainsToInv.add(Url);
SearchTerms.add(Search);
controller.addSeed(Url);
controller.setCustomData(DomainsToInv);
controller.setCustomData(SearchTerms);
controller.start(Crawler.class, numberOfCrawlers);
WriteFile f = new WriteFile();
f.openfile(Search);
f.StartHtml();
f.addUrl(UrlHits);
f.EndHtml();
f.closeFile();
}
}
public class Crawler扩展了WebCrawler {
@Override
public void visit(Page page) {
int docid = page.getWebURL().getDocid();
String url = page.getWebURL().getURL();
String domain = page.getWebURL().getDomain();
String path = page.getWebURL().getPath();
String subDomain = page.getWebURL().getSubDomain();
String parentUrl = page.getWebURL().getParentUrl();
String anchor = page.getWebURL().getAnchor();
System.out.println("Docid: " + docid);
System.out.println("URL: " + url);
System.out.println("Domain: '" + domain + "'");
System.out.println("Sub-domain: '" + subDomain + "'");
System.out.println("Path: '" + path + "'");
System.out.println("Parent page: " + parentUrl);
System.out.println("Anchor text: " + anchor);
if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
String text = htmlParseData.getText();
String html = htmlParseData.getHtml();
List<WebURL> links = htmlParseData.getOutgoingUrls();
System.out.println("Text length: " + text.length());
System.out.println("Html length: " + html.length());
System.out.println("Number of outgoing links: " + links.size());
}
Header[] responseHeaders = page.getFetchResponseHeaders();
if (responseHeaders != null) {
System.out.println("Response headers:");
for (Header header : responseHeaders) {
System.out.println("\t" + header.getName() + ": " + header.getValue());
}
}
System.out.println("=============");
ArrayList<String> SearchTerms = (ArrayList<String>) this.getMyController().getCustomData();
ArrayList<String> UrlHits = (ArrayList<String>) this.getMyController().getCustomData();
for (String Keyword : SearchTerms) {
System.out.println("Searching Keyword: " + Keyword);
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
int KeywordCounter = 0;
String pagetext = htmlParseData.getText();
Pattern pattern = Pattern.compile(Keyword);
Matcher match1 = pattern.matcher(pagetext);
if (match1.find()) {
while (match1.find()) {
KeywordCounter++;
}
System.out.println("FOUND " + Keyword + " in page text. KeywordCount: " + KeywordCounter);
UrlHits.add(url);
for (int i = 0; i < UrlHits.size(); i++) {
System.out.print(UrlHits.get(i) + "\n");
System.out.println("=============");
}
} else {
System.out.println("Keyword search was unsuccesful");
System.out.println("=============");
}
}
}
}
public class WriteFile {
private Formatter x;
public void openfile(String keyword) {
try {
x = new Formatter(keyword + ".html");
} catch (Exception e) {
System.out.println("ERROR");
}
}
public void StartHtml() {
x.format("%s %n %s %n %s %n %s %n %s %n ", "<html>", "<head>", "</head>", "<body>", "<center>");
}
public void addUrl(ArrayList<String> UrlHits) {
for (String list : UrlHits) {
x.format("%s%s%s%s%s%n%s%n", "<a href=\"", list, "\" target=\"_blank\">", list, "</a>", "<br>");
}
}
public void EndHtml() {
x.format("%s %n %s %n %s %n", "</center>", "</body>", "</html>");
}
public void closeFile() {
x.close();
}
}
代码之外的类标题的道歉阻止它有点繁琐。我尝试了一些不同的“for”语句来获取输出数组列表的方法,但似乎没有它。字符串被添加到数组列表中,因为我可以在main中使用for循环调用它们。但是当我将数组列表传递给方法addUrl时,它会出现下蹲。是否有更简单的方法来使用格式化程序和.format?
来使用arraylists谢谢你的帮助