我正在使用 Apache Tika 解析 Word 文档，但它提取的是文档的全部内容，而我想分别提取标题和正文文本。以下是我正在尝试的代码：
/**
 * Small command-line example that runs Apache Tika's {@code TXTParser} over a
 * fixed file and prints the extracted content followed by the metadata entries.
 *
 * <p>NOTE(review): the input path ends in {@code .txt} and the parser is
 * {@code TXTParser}; if the intended input is a Word document, confirm that
 * this is the right parser and file.
 */
public class TextParser {

    /**
     * Parses {@code C:\TikaExamples\example.txt}, then writes the handler's
     * collected text and each metadata name/value pair to standard output.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException declared by the parser's {@code parse} call
     * @throws TikaException declared by the parser's {@code parse} call
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();

        // Text document parser
        TXTParser textParser = new TXTParser();

        // try-with-resources guarantees the file handle is released even when
        // parsing throws; the original never closed the stream.
        try (FileInputStream inputstream =
                new FileInputStream(new File("C:\\TikaExamples\\example.txt"))) {
            textParser.parse(inputstream, handler, metadata, pcontext);
        }

        System.out.println("Contents of the document:" + handler.toString());
        System.out.println("Metadata of the document:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            System.out.println(name + " : " + metadata.get(name));
        }
    }
}