我正在尝试读取 docx 文件的内容，并使用 Apache POI API 将其保存到数据库中。我能够读取纯文本并提取图片，但对于通过某些 OLE 公式编辑器嵌入的公式，我找不到读取它们的方法。我尝试了很多解决方案，但仍然无法读取这些公式。以下是我的代码：
/**
 * Small command-line harness that extracts the text of paragraphs located inside
 * text boxes of a .docx file, using Apache POI (XWPF) and XmlBeans.
 */
public class TestMyCode {

    /**
     * XPath (with its namespace declarations) that selects every {@code w:p} element
     * sitting directly under the {@code w:txbxContent} of a {@code wps:txbx} text box.
     */
    private static final String TEXT_BOX_PARAGRAPHS_XPATH =
            "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p";

    /**
     * Opens {@code testTextBox.docx} (resolved against the working directory) and prints
     * the text returned by {@link #extractDataDocx(FileInputStream)}.
     *
     * @param args ignored
     * @throws InvalidFormatException if the file cannot be opened as an OPC package
     * @throws IOException            if the file cannot be read
     */
    public static void main(String[] args) throws InvalidFormatException,
            IOException {
        TestMyCode t = new TestMyCode();
        String fileSelected = "testTextBox.docx";
        File file = new File(fileSelected);
        // try-with-resources: the stream was previously never closed.
        try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
            System.out.println(t.extractDataDocx(fis));
        }
    }

    /**
     * Reads a .docx document from the given stream and returns the text of every
     * paragraph found inside a text box, one paragraph per line, in document order.
     *
     * <p>The document opened here is closed before returning. This method does not
     * close {@code fis} itself; the caller should close it.
     *
     * @param fis stream positioned at the start of a .docx file
     * @return the collected text-box paragraph text; empty if no text box was found
     * @throws IOException            if reading or closing the document fails
     * @throws InvalidFormatException if the stream is not a valid OPC package
     */
    public String extractDataDocx(FileInputStream fis) throws IOException,
            InvalidFormatException {
        StringBuilder text = new StringBuilder();
        try (XWPFDocument wordDoc = new XWPFDocument(OPCPackage.open(fis))) {
            for (XWPFParagraph p : wordDoc.getParagraphs()) {
                appendTextBoxContents(p, text);
            }
        }
        return text.toString();
    }

    /**
     * Appends the text of every text-box paragraph embedded in {@code paragraph} to
     * {@code out}, each followed by a line separator.
     *
     * @param paragraph body paragraph whose XML is searched for text boxes
     * @param out       accumulator receiving the extracted text
     */
    private void appendTextBoxContents(XWPFParagraph paragraph, StringBuilder out) {
        // The XPath already yields the w:p elements themselves. Asking those nodes for
        // further w:p children (as the earlier version did) matches nothing, because a
        // paragraph has no direct paragraph children.
        XmlObject[] textBoxParagraphs = paragraph.getCTP().selectPath(
                TEXT_BOX_PARAGRAPHS_XPATH);
        for (XmlObject textBoxParagraph : textBoxParagraphs) {
            try {
                XWPFParagraph embeddedPara = new XWPFParagraph(
                        CTP.Factory.parse(textBoxParagraph.xmlText()),
                        paragraph.getBody());
                out.append(embeddedPara.getText()).append(System.lineSeparator());
            } catch (XmlException e) {
                // Best effort: report the unparsable paragraph and keep going instead
                // of silently dropping it.
                System.err.println("Skipping text-box paragraph that could not be parsed: " + e);
            }
        }
    }
}