Printing an ArrayList to an HTML file with Crawler4j?

Asked: 2014-03-13 17:35:46

Tags: java arraylist crawler4j

The basics of the program: it runs the web crawler using the PerentUrl and Keyword the user specifies in the Controller (main). If the keyword is found in the page text, the URL is saved to an ArrayList:

ArrayList&lt;String&gt; UrlHits = new ArrayList&lt;String&gt;();

Once the crawl has finished, the program calls the methods of the WriteFile class from main to write an HTML file containing all the UrlHits:

    WriteFile f = new WriteFile();
    f.openfile(Search);
    f.StartHtml();
    f.addUrl(UrlHits);
    f.EndHtml();
    f.closeFile();

Everything works except f.addUrl: an HTML file is created with the correct name and directory, but none of the strings from the ArrayList are output to the file.

public static void main(String[] args) throws Exception {

    String crawlStorageFolder = "/Users/Jake/Documents/sem 2/FYP/Crawler/TestData";
    int numberOfCrawlers = 1;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    config.setMaxDepthOfCrawling(21);
    config.setMaxPagesToFetch(24);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();

    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    Scanner perentUrl = new Scanner(System.in);
    System.out.println("Enter full perant Url... example. http://www.domain.co.uk/");
    String Url = perentUrl.nextLine();

    Scanner keyword = new Scanner(System.in);
    System.out.println("Enter search term... example. Pies");
    String Search = keyword.nextLine();

    System.out.println("Searching domain :" + Url);
    System.out.println("Keyword:" + Search);

    ArrayList<String> DomainsToInv = new ArrayList<String>();
    ArrayList<String> SearchTerms = new ArrayList<String>();
    ArrayList<String> UrlHits = new ArrayList<String>();

    DomainsToInv.add(Url);
    SearchTerms.add(Search);

    controller.addSeed(Url);

    controller.setCustomData(DomainsToInv);
    controller.setCustomData(SearchTerms);
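    // Note: setCustomData holds a single Object, so the second call above
    // replaced the DomainsToInv list with SearchTerms.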
    controller.start(Crawler.class, numberOfCrawlers);

    WriteFile f = new WriteFile();
    f.openfile(Search);
    f.StartHtml();
    f.addUrl(UrlHits);
    f.EndHtml();
    f.closeFile();
}

}
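A note on the likely failure: crawler4j's setCustomData stores a single object, so the empty UrlHits list built in main is never the list the crawler fills. A minimal sketch of one workaround, bundling both lists into one custom-data object so that main and the crawler share the same UrlHits instance (the bundle shape is my assumption, not something crawler4j prescribes; it needs java.util.List and java.util.Collections imports):

    // Sketch: one shared custom-data bundle instead of two setCustomData calls.
    List<String> searchTerms = new ArrayList<String>();
    searchTerms.add(Search);
    // synchronizedList guards the list if numberOfCrawlers is ever raised above 1
    List<String> urlHits = Collections.synchronizedList(new ArrayList<String>());

    List<Object> crawlData = new ArrayList<Object>();
    crawlData.add(searchTerms);
    crawlData.add(urlHits);
    controller.setCustomData(crawlData);            // one call, one shared object

    controller.start(Crawler.class, numberOfCrawlers);  // blocks until the crawl ends

    WriteFile f = new WriteFile();
    f.openfile(Search);
    f.StartHtml();
    f.addUrl(new ArrayList<String>(urlHits));       // filled by the crawler above
    f.EndHtml();
    f.closeFile();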

public class Crawler extends WebCrawler {

@Override
public void visit(Page page) {

    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    String domain = page.getWebURL().getDomain();
    String path = page.getWebURL().getPath();
    String subDomain = page.getWebURL().getSubDomain();
    String parentUrl = page.getWebURL().getParentUrl();
    String anchor = page.getWebURL().getAnchor();

    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Domain: '" + domain + "'");
    System.out.println("Sub-domain: '" + subDomain + "'");
    System.out.println("Path: '" + path + "'");
    System.out.println("Parent page: " + parentUrl);
    System.out.println("Anchor text: " + anchor);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();

        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }

    Header[] responseHeaders = page.getFetchResponseHeaders();
    if (responseHeaders != null) {
        System.out.println("Response headers:");
        for (Header header : responseHeaders) {
            System.out.println("\t" + header.getName() + ": " + header.getValue());
        }
    }
    System.out.println("=============");

    ArrayList<String> SearchTerms = (ArrayList<String>) this.getMyController().getCustomData();
    ArrayList<String> UrlHits = (ArrayList<String>) this.getMyController().getCustomData();
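    // Note: getCustomData() returns the one object last passed to
    // setCustomData(), so SearchTerms and UrlHits above are the SAME list,
    // and neither is the UrlHits instance that main writes to the file.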

    for (String Keyword : SearchTerms) {

        System.out.println("Searching Keyword: " + Keyword);

        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

        int KeywordCounter = 0;
        String pagetext = htmlParseData.getText();
        Pattern pattern = Pattern.compile(Keyword);
        Matcher match1 = pattern.matcher(pagetext);
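        // (Keyword is compiled as a regular expression; wrapping it in
        // Pattern.quote(Keyword) would make the search literal.)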

        while (match1.find()) {
            KeywordCounter++;
        }
        if (KeywordCounter > 0) {
            System.out.println("FOUND " + Keyword + " in page text. KeywordCount: " + KeywordCounter);

            UrlHits.add(url);
            for (int i = 0; i < UrlHits.size(); i++) {
                System.out.print(UrlHits.get(i) + "\n");

                System.out.println("=============");
            }

        } else {
            System.out.println("Keyword search was unsuccesful");

            System.out.println("=============");
        }

    }

}

}
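Inside visit(), the counterpart to that sketch would unpack the same bundle once, so the list being appended to is exactly the instance main writes out afterwards:

    // Sketch: unpack the shared custom-data bundle set up in main.
    List<Object> crawlData = (List<Object>) this.getMyController().getCustomData();
    List<String> searchTerms = (List<String>) crawlData.get(0);
    List<String> urlHits = (List<String>) crawlData.get(1);  // same instance as in main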

public class WriteFile {

private Formatter x;

public void openfile(String keyword) {

    try {
        x = new Formatter(keyword + ".html");
    } catch (Exception e) {

        System.out.println("ERROR");
    }
}

public void StartHtml() {
    x.format("%s %n %s %n %s %n %s %n %s %n ", "<html>", "<head>", "</head>", "<body>", "<center>");
}

public void addUrl(ArrayList<String> UrlHits) {

    for (String list : UrlHits) {
        x.format("%s%s%s%s%s%n%s%n", "<a href=\"", list, "\" target=\"_blank\">", list, "</a>", "<br>");
    }
}

public void EndHtml() {
    x.format("%s %n %s %n %s %n", "</center>", "</body>", "</html>");
}

public void closeFile() {
    x.close();
}

}

Apologies for the class headers sitting outside the code blocks; getting them in was a bit fiddly. I have tried a few different "for" statements to get the method to output the ArrayList, but I can't seem to get it to work. The strings are definitely added to the ArrayList, because I can print them with a for loop in main. But when I pass the ArrayList to the addUrl method, it comes up with nothing. Is there a simpler way to use the Formatter and .format with ArrayLists?
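As for a simpler alternative to Formatter: a PrintWriter with printf covers the same ground with less ceremony. A minimal sketch (HtmlReport and writeHits are illustrative names, not part of the original code):

    // Sketch: write the hits as an HTML link list without java.util.Formatter.
    import java.io.PrintWriter;
    import java.util.List;

    public class HtmlReport {
        public static void writeHits(String keyword, List<String> urlHits) throws Exception {
            // try-with-resources closes the file even if writing fails
            try (PrintWriter out = new PrintWriter(keyword + ".html")) {
                out.println("<html><head></head><body><center>");
                for (String url : urlHits) {
                    out.printf("<a href=\"%s\" target=\"_blank\">%s</a>%n<br>%n", url, url);
                }
                out.println("</center></body></html>");
            }
        }
    }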

Thanks for any help.

0 Answers:

There are no answers.