Jsoup:如何从本地驱动器解析多个HTML文件?

时间:2014-10-16 14:50:30

标签: html jsoup

我的硬盘上有多个 HTML 文件需要用 Jsoup 解析。我已经能够解析单个文件,但还不会解析多个文件。我想解析某个文件夹中的所有文件。

我编写了下面这段代码,它从文件夹“C:/html”中名为“file.htm”的 HTML 文件里提取文本(位于某些特定 ID 的元素中):

package jsouptest;

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Parses one local HTML file with Jsoup and prints the text of selected paragraphs.
 */
public class Main {

    /**
     * Reads {@code C:/html/file.htm} as UTF-8 and prints the text of every {@code <p>}
     * element nested inside a {@code <div>} whose id starts with {@code desk}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("C:/html/file.htm");

        try {
            // Third argument is the base URI; it is left empty here.
            Document doc = Jsoup.parse(input, "UTF-8", "");

            Elements ids = doc.select("div[id^=desk] p");

            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report the failure instead of swallowing it, so a missing or unreadable
            // file does not look like a file without matching elements.
            System.err.println("Could not read " + input + ": " + e.getMessage());
        }
    }

}

如何将此代码应用于“C:/html”文件夹中的所有文件?谢谢。

2 个答案:

答案 0 :(得分:0)

把解析 HTML 的代码提取到一个方法中;然后列出目录的内容,并对每个文件调用该解析方法。

   File input = new File("C:/html");
   // listFiles() returns null when the path is not a directory or cannot be read,
   // so guard before iterating.
   File[] st = input.listFiles();
   if (st != null) {
       for (File candidate : st) {
           if (candidate.isFile()) { // other condition like name ends in html
               parse(candidate);
           }
       }
   }

因此您的代码应如下所示:

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Parses every regular file directly inside {@code C:/html} with Jsoup and prints the
 * text of selected paragraphs.
 */
public class Main {

    /**
     * Lists the contents of {@code C:/html} and calls {@link #parse(File)} on each
     * regular file; sub-directories are skipped.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("C:/html");
        File[] st = input.listFiles();
        if (st == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("Cannot list directory: " + input);
            return;
        }
        for (File candidate : st) {
            if (candidate.isFile()) { // other condition like name ends in html
                parse(candidate);
            }
        }
    }

    /**
     * Parses one HTML file as UTF-8 and prints the text of every {@code <p>} element
     * nested inside a {@code <div>} whose id starts with {@code desk}.
     *
     * <p>A read failure is reported on standard error and does not stop the caller
     * from moving on to the next file.
     *
     * @param input the HTML file to parse
     */
    private static void parse(File input) {
        try {
            // Third argument is the base URI; it is left empty here.
            Document doc = Jsoup.parse(input, "UTF-8", "");

            Elements ids = doc.select("div[id^=desk] p");

            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report instead of swallowing, so unreadable files are visible.
            System.err.println("Could not read " + input + ": " + e.getMessage());
        }
    }
}

答案 1 :(得分:0)

我编写了一个程序,用于读取给定路径下的文件夹及其子文件夹,并将结果写入 CSV 文件。

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Walks a directory tree, prints an indented listing of it to the console, and writes
 * the title and body text of every file found to a CSV file.
 */
public class fixingCode {

    /**
     * Entry point: walks the hard-coded source folder and writes the extracted titles
     * and texts to the hard-coded CSV output path.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        FileWriter writer = null;

        System.out.println("--------------------------Program started--------------------------");

        File input = new File(
                "C:\\My Web Sites\\\\library\\math");// root folder to read from

        try {
            writer = new FileWriter("c:\\Temp\\results.csv");// CSV output path
            Process(input, writer);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // writer is still null when the FileWriter constructor itself failed.
            if (writer != null) {
                try {
                    writer.close(); // close() also flushes pending output
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        System.out.println("--------------------------Program End--------------------------");
    }

    /** Current recursion depth of {@code Process}; used only to indent the console listing. */
    static int spc_count = -1;

    /**
     * Prints {@code aFile} in the console listing and, when it is a directory, writes a
     * CSV summary for each regular file directly inside it and then recurses into
     * every child exactly once.
     *
     * <p>Failures are printed and do not abort the walk.
     *
     * @param aFile  the file or directory to process
     * @param writer destination for the CSV rows
     */
    static void Process(File aFile, FileWriter writer) {
        spc_count++;

        try {
            StringBuilder indent = new StringBuilder();
            for (int i = 0; i < spc_count; i++) {
                indent.append(' ');
            }
            String spcs = indent.toString();

            if (aFile.isFile()) {
                System.out.println(spcs + "[FILE] " + aFile.getName());
            } else if (aFile.isDirectory()) {
                System.out.println(spcs + "[DIR] " + aFile.getName());

                File[] listOfFiles = aFile.listFiles();
                if (listOfFiles == null) {
                    // listFiles() returns null when the directory cannot be read.
                    System.out.println(spcs + " [ACCESS DENIED]");
                    return;
                }

                // Pass 1: extract title and text of each regular file in this directory.
                for (int i = 0; i < listOfFiles.length; i++) {
                    if (listOfFiles[i].isFile()) {// other condition like name ends in html
                        try {
                            writeSummary(listOfFiles[i], i, writer);
                        } catch (Exception e) {
                            // One unreadable file must not abort the rest of the directory.
                            e.printStackTrace();
                        }
                    }
                }

                // Pass 2: list each child once; sub-directories are walked recursively.
                for (int j = 0; j < listOfFiles.length; j++) {
                    Process(listOfFiles[j], writer);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always restore the depth, including on the early return above.
            spc_count--;
        }
    }

    /** Parses one file with Jsoup, echoes its title and body text, and appends both as CSV rows. */
    private static void writeSummary(File htmlFile, int index, FileWriter writer) throws IOException {
        // A null charset is passed so that Jsoup determines the encoding itself.
        Document doc = Jsoup.parse(htmlFile, null);

        String title = doc.title();
        String text = doc.body().text();

        System.out.println("title : " + "[" + index + "]" + title);
        System.out.println("text" + text);

        writer.append("title : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(title));
        writer.append('\n');

        writer.append("text : " + "[" + index + "]");
        writer.append(',');
        writer.append(csvField(text));
        writer.append('\n');
    }

    /** Wraps a value in double quotes, doubling embedded quotes, so commas stay inside one CSV field. */
    private static String csvField(String value) {
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

}