使用jsoup离线保存网页

时间:2016-06-24 14:22:38

标签: java jsoup

我正在尝试更改源文件和图像链接的引用，以便在本地查看它们，就像浏览器的“另存为”功能那样。我已经把相关文件保存在名为“w3schools_files”的目录下，但输出文件仍然与“另存为”的结果不一样。html 文件是通过 Wireshark 捕获的。

public static void main(String[] args) throws IOException {


     String sub,i,imgaddr,linkaddr,website,impaddr;
     website="http://www.w3schools.com";
     File input = new File("E:/w3schools.html");
  Document doc = Jsoup.parse(input, "UTF-8");

    Elements images;
    images = doc.select("img");
    //extract images and links and css files 
    for (Element image : images) {

        i = image.attr("src");
        sub = i.substring(i.lastIndexOf("/"));
        imgaddr = "./" + "w3schools_files" + sub;
        image.attr("src", imgaddr);

    }
    Elements links = doc.select("a");
    for (Element link : links) {

        i = link.attr("href");

        if (i.startsWith("/")) {
            linkaddr = website + i;

            link.attr("href", linkaddr);

        }
    }
    Elements imports = doc.select("link[href]");

    for (Element imp : imports) {
        String relat = imp.attr("rel");
        if (relat.equals("stylesheet")) {
            i = imp.attr("href");
            sub = i.substring(i.lastIndexOf("/"));
            impaddr = "./" + "w3schools_files" + sub;
            imp.attr("src", impaddr);
            imp.attr("src", impaddr);

        }
        i = imp.attr("href");
        sub = i.substring(i.lastIndexOf("/"));

        if (i.endsWith(".ico")) {

            String fav = website + sub;
            imp.attr("href", fav);

        } else {
            impaddr = "./" + "w3schools_files" + sub;
            imp.attr("src", impaddr);

        }//writing back in  files
        BufferedWriter htmlwriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(input), "UTF-8"));
        htmlwriter.write(doc.toString())   
    }

}

0 个答案:

没有答案