使用Apache POI解析文本文档并在其中搜索关键字
答案 0 :(得分:0)
我写了一个简单的简历解析器:它会遍历指定文件夹及其所有子文件夹,解析其中的每个 Word 文档(.doc、.docx),并在文档内容中查找给定的关键字。注意:HWPFDocument 只能读取旧版二进制 .doc 格式,.docx 文件需要改用 XWPFDocument;详情请参阅 HWPFDocument 的 Javadoc。
这是代码
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.poi.hwpf.HWPFDocument;
/**
 * Walks a folder tree, opens every Word document found in it and records
 * which of the configured keywords occur in each document's text.
 *
 * <p>Constructing an instance immediately runs the scan and prints one line
 * per parsed document to standard output.
 */
public class ResumeParser {
    /** Document most recently opened by {@link #parseDocFile(String)}. */
    HWPFDocument document;
    /**
     * Stream most recently opened by {@link #parseDocFile(String)}. It is
     * closed as soon as that method returns, so it must not be read from.
     */
    FileInputStream fileInputStream;
    /** Folder at which the recursive scan starts. */
    String rootFolderPath;
    /** Maps each parsed file's absolute path to the keywords found in it (lower-cased). */
    Map<String, List<String>> displayContent = new HashMap<String, List<String>>();
    /** Keywords to look for; matching is case-insensitive. */
    List<String> keywordList;

    /**
     * Stores the configuration and runs the scan straight away.
     *
     * @param rootFolderPath folder to scan, including all sub folders
     * @param keywordList    keywords to search for in each document
     * @throws IOException declared by {@link #startParsing()}
     */
    public ResumeParser(String rootFolderPath, List<String> keywordList) throws IOException {
        this.rootFolderPath = rootFolderPath;
        this.keywordList = keywordList;
        startParsing();
    }

    /**
     * Scans {@link #rootFolderPath} and prints the collected matches.
     *
     * @throws IOException kept in the signature for existing callers
     */
    public void startParsing() throws IOException {
        createDisplayContent(rootFolderPath);
        printContent(displayContent);
    }

    /**
     * Fills {@link #displayContent} by parsing every {@code .doc}/{@code .docx}
     * file under the given folder, descending into sub folders recursively.
     *
     * @param rootFolderPath the folder to scan
     */
    public void createDisplayContent(String rootFolderPath) {
        File[] entries = new File(rootFolderPath).listFiles();
        if (entries == null) {
            // listFiles() yields null when the path is not a directory or an
            // I/O error occurs; report it instead of failing with a NullPointerException.
            System.err.println("Cannot list folder: " + rootFolderPath);
            return;
        }
        for (File entry : entries) {
            if (entry.isFile()) {
                if (hasWordExtension(entry.getName())) {
                    parseDocFile(entry.getAbsolutePath());
                }
            } else if (entry.isDirectory()) {
                createDisplayContent(entry.getAbsolutePath());
            }
        }
    }

    /**
     * Tells whether a file name ends in {@code .doc} or {@code .docx}, ignoring case.
     * Only the text after the LAST dot counts, so {@code john.smith.doc} qualifies
     * while a dot-less file that merely happens to be called {@code doc} does not.
     */
    private static boolean hasWordExtension(String fileName) {
        int lastDot = fileName.lastIndexOf('.');
        if (lastDot < 0) {
            return false;
        }
        String extension = fileName.substring(lastDot + 1);
        return extension.equalsIgnoreCase("DOC") || extension.equalsIgnoreCase("DOCX");
    }

    /**
     * Opens one document, extracts its text and stores the keywords it contains
     * in {@link #displayContent} under the file's path. A file that cannot be
     * read or parsed is reported on standard error and skipped.
     *
     * @param filePath path of the document to parse
     */
    public void parseDocFile(String filePath) {
        // try-with-resources guarantees the stream is released even when parsing fails.
        try (FileInputStream input = new FileInputStream(filePath)) {
            fileInputStream = input;
            document = new HWPFDocument(input);
            // Locale.ROOT keeps case folding independent of the machine's default locale.
            String text = document.getText().toString().toUpperCase(Locale.ROOT);
            List<String> matchedKeywords = new ArrayList<String>();
            for (String keyword : keywordList) {
                String upperKeyword = keyword.toUpperCase(Locale.ROOT);
                if (text.contains(upperKeyword)) {
                    matchedKeywords.add(upperKeyword.toLowerCase(Locale.ROOT));
                }
            }
            displayContent.put(filePath, matchedKeywords);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } catch (IllegalArgumentException unsupported) {
            // HWPFDocument only understands the binary .doc format; per the POI
            // documentation an OOXML (.docx) or otherwise unsupported file is
            // rejected with an IllegalArgumentException subclass. Skip that one
            // file rather than aborting the whole scan.
            System.err.println("Skipping unsupported document " + filePath + ": "
                    + unsupported.getMessage());
        }
    }

    /**
     * Prints one line per entry: the file path followed by its matched keywords.
     *
     * @param displayContent map of file path to matched keywords
     */
    public void printContent(Map<String, List<String>> displayContent) {
        for (Map.Entry<String, List<String>> entry : displayContent.entrySet()) {
            System.out.println("File Name: " + entry.getKey() + " Match Keywords: " + entry.getValue());
        }
    }

    /**
     * Sample entry point.
     *
     * @param args optional: {@code args[0]} is the folder to scan; when absent
     *             the original default {@code D:\Doc} is used
     * @throws IOException propagated from the constructor
     */
    public static void main(String args[]) throws IOException {
        List<String> keywords = new ArrayList<String>();
        keywords.add("Java");
        keywords.add("PHP");
        // Was misspelled "Andriod", which could never match the word "Android".
        keywords.add("Android");
        keywords.add("John");
        String rootFolder = args.length > 0 ? args[0] : "D:\\Doc";
        new ResumeParser(rootFolder, keywords);
    }
}