我的硬盘上有多个 HTML 文件需要用 Jsoup 解析。我已经能够解析单个文件,但还不知道如何解析多个文件。我想解析某个文件夹中的所有文件。
我编写了下面这段代码,它从一个 HTML 文件(位于文件夹 “C:/html” 中,名为 “file.htm”)里提取文本(位于某些特定 id 的元素中):
package jsouptest;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Parses a single local HTML file with jsoup and prints the text of every
 * {@code <p>} element nested inside a {@code <div>} whose id starts with
 * {@code desk}.
 */
public class Main {

    /**
     * Reads {@code C:/html/file.htm} as UTF-8 and prints each matching
     * paragraph, preceded by a blank line, to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("C:/html/file.htm");
        try {
            Document doc = Jsoup.parse(input, "UTF-8", "");
            Elements ids = doc.select("div[id^=desk] p");
            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report the failure instead of swallowing it: a missing or
            // unreadable file would otherwise look like "no matches found".
            System.err.println("Could not read " + input + ": " + e.getMessage());
        }
    }
}
如何把这段代码应用到 “C:/html” 文件夹中的所有文件?谢谢。
答案 0 :(得分:0)
把解析 HTML 的代码提取到一个单独的方法中;然后列出目录的内容,并对其中的每个文件调用该解析方法:
File input = new File("C:/html");
File[] st = input.listFiles();
// listFiles() returns null when the path is not a directory or cannot be
// read, so guard against it before iterating.
if (st != null) {
    for (File candidate : st) {
        if (candidate.isFile()) { // other condition like name ends in html
            parse(candidate);
        }
    }
}
因此您的代码应如下所示:
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Parses every regular file in a directory with jsoup and prints the text of
 * each {@code <p>} element nested inside a {@code <div>} whose id starts with
 * {@code desk}.
 */
public class Main {

    /**
     * Lists the directory and hands each regular file to {@link #parse(File)}.
     *
     * @param args optional; {@code args[0]} overrides the default directory
     *             {@code C:/html}
     */
    public static void main(String[] args) {
        File input = new File(args.length > 0 ? args[0] : "C:/html");
        File[] st = input.listFiles();
        if (st == null) {
            // listFiles() returns null when the path is not a directory or
            // an I/O error occurs; without this check st.length would throw.
            System.err.println("Cannot list directory: " + input);
            return;
        }
        for (File candidate : st) {
            if (candidate.isFile()) { // other condition like name ends in html
                parse(candidate);
            }
        }
    }

    /**
     * Parses one HTML file as UTF-8 and prints every matching paragraph,
     * each preceded by a blank line. A file that cannot be read is reported
     * on standard error and skipped.
     *
     * @param input the HTML file to parse
     */
    private static void parse(File input) {
        try {
            Document doc = Jsoup.parse(input, "UTF-8", "");
            Elements ids = doc.select("div[id^=desk] p");
            for (Element id : ids) {
                System.out.println("\n" + id.text());
            }
        } catch (IOException e) {
            // Report instead of swallowing so unreadable files are visible.
            System.err.println("Could not read " + input + ": " + e.getMessage());
        }
    }
}
答案 1 :(得分:0)
我编写了一个程序,它会递归读取给定路径下的文件夹及其子文件夹,并将结果写入 CSV 文件:
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Walks a directory tree, parses every regular file with jsoup and writes
 * each document's title and body text to a CSV file, echoing progress to
 * standard output.
 */
public class fixingCode {

    /** Current recursion depth of {@link #Process}; controls console indentation. */
    static int spc_count = -1;

    /**
     * Opens the CSV output file, walks the hard-coded source directory and
     * closes the writer afterwards.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        FileWriter writer = null;
        System.out.println("--------------------------Program started--------------------------");
        File input = new File(
                "C:\\My Web Sites\\\\library\\math"); // root folder to read from
        try {
            writer = new FileWriter("c:\\Temp\\results.csv"); // CSV output path
            Process(input, writer);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The FileWriter constructor may have thrown, leaving writer null;
            // flushing it unconditionally would raise a NullPointerException.
            if (writer != null) {
                try {
                    writer.flush();
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("--------------------------Program End--------------------------");
    }

    /**
     * Prints {@code aFile} indented by the current depth. For a directory it
     * also writes one title row and one text row per contained regular file
     * and then recurses into every entry exactly once.
     *
     * <p>Any exception is printed and not propagated.
     *
     * @param aFile  file or directory to process
     * @param writer destination for the CSV rows
     */
    static void Process(File aFile, FileWriter writer) {
        spc_count++;
        try {
            StringBuilder indent = new StringBuilder();
            for (int i = 0; i < spc_count; i++) {
                indent.append(" ");
            }
            String spcs = indent.toString();

            if (aFile.isFile()) {
                System.out.println(spcs + "[FILE] " + aFile.getName());
            } else if (aFile.isDirectory()) {
                System.out.println(spcs + "[DIR] " + aFile.getName());
                File[] listOfFiles = aFile.listFiles();
                if (listOfFiles == null) {
                    // Checked before any dereference: listFiles() returns
                    // null when the directory cannot be read.
                    System.out.println(spcs + " [ACCESS DENIED]");
                } else {
                    for (int i = 0; i < listOfFiles.length; i++) {
                        if (listOfFiles[i].isFile()) { // other condition like name ends in html
                            writeRecord(listOfFiles[i], i, writer);
                        }
                    }
                    // Recurse once per entry, after the file loop. Nesting
                    // this inside the loop above re-walked every child once
                    // per directory entry.
                    for (File child : listOfFiles) {
                        Process(child, writer);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            spc_count--;
        }
    }

    /**
     * Parses one file and appends its title and body text as two CSV rows,
     * also echoing them to standard output. A failure on this file is
     * printed and does not stop the remaining files from being processed.
     *
     * @param file   the file to parse
     * @param index  position of the file within its directory listing, used
     *               in the row labels
     * @param writer destination for the CSV rows
     */
    private static void writeRecord(File file, int index, FileWriter writer) {
        try {
            // A null charset asks jsoup to detect the encoding itself.
            Document doc = Jsoup.parse(file, null);
            String title = doc.title();
            String text = doc.body().text();

            System.out.println("title : " + "[" + index + "]" + title);
            System.out.println("text" + text);

            writer.append("title : " + "[" + index + "]");
            writer.append(',');
            writer.append(csvField(title));
            writer.append('\n');

            writer.append("text : " + "[" + index + "]");
            writer.append(',');
            writer.append(csvField(text));
            writer.append('\n');
        } catch (Exception e) {
            // Broad on purpose: one malformed or unreadable file must not
            // abort the rest of the directory.
            e.printStackTrace();
        }
    }

    /**
     * Escapes a value for use as a single CSV field: values containing a
     * comma, double quote or line break are wrapped in double quotes with
     * embedded quotes doubled; other values are returned unchanged.
     *
     * @param value raw field value, may be null
     * @return the escaped field, or an empty string for null
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuotes = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuotes) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}