无法在Apache POI中按Word文档(docx)的顺序读取所有内容

时间:2019-08-30 12:26:50

标签: java ms-word apache-poi

我一直试图从Word文档中读取所有内容(包括表格、图片和段落)。我可以使用getBodyElementsIterator()读取表格和段落,但是它不会读取文档中存在的图片。尽管我可以使用getAllPictures()单独读取图片,但是我需要按文档中的顺序读取所有内容。

我尝试在getBodyElementsIterator()中循环时寻找XWPFPicture实例,但是我找不到任何图像实例。

// Walk the body elements in document order and print one line per element,
// prefixed with the kind of element that was matched.
// NOTE(review): the output reported for this loop lists only paragraphs and
// tables, so the XWPFPictureData branch was apparently never taken.
for (Iterator<IBodyElement> it = xdoc.getBodyElementsIterator(); it.hasNext(); ) {
    IBodyElement element = it.next();
    final String line;
    if (element instanceof XWPFParagraph) {
        // Paragraphs are reported by class rather than by their string form.
        line = "para - " + element.getClass();
    } else if (element instanceof XWPFTable) {
        line = "table - " + element;
    } else if (element instanceof XWPFPictureData) {
        line = "picture - " + element;
    } else {
        // Anything that is not one of the three kinds above.
        line = "else - " + element;
    }
    System.out.println(line);
}

这是我得到的输出。

paraorg.apache.poi.xwpf.usermodel.XWPFParagraph@4d3167f4
paraorg.apache.poi.xwpf.usermodel.XWPFParagraph@ed9d034
tableorg.apache.poi.xwpf.usermodel.XWPFTable@6121c9d6
paraorg.apache.poi.xwpf.usermodel.XWPFParagraph@87f383f
paraorg.apache.poi.xwpf.usermodel.XWPFParagraph@4eb7f003

它包含段落和表格,但不包含任何图片

1 个答案:

答案 0 :(得分:1)

正如评论中已经指出的,“如何使用Apache POI按顺序读取Word文档(docx)中的所有内容”这个问题范围太广,无法在此处完整回答。*.docx是一个ZIP格式的Office Open XML存档。它包含存放文档正文的document.xml。这是非常复杂的XML,需要遍历。此外,该document.xml可能包含对*.docx ZIP存档中其他资源的引用,这些资源同样需要遍历。

我能提供的是一个展示该遍历过程大致形式的模板。它从XWPFDocument开始,首先遍历其中的所有IBodyElement,然后根据找到的IBodyElement的类型继续向下遍历。

import java.io.FileInputStream;

import org.apache.poi.xwpf.usermodel.*;

import java.util.List;

/**
 * Template for walking the content of a Word (*.docx) document in document
 * order with Apache POI XWPF and printing every element that is visited.
 *
 * <p>The walk starts at the body elements of an {@code XWPFDocument} and
 * descends into paragraphs (runs and their embedded pictures) and tables
 * (rows, cells and the body elements nested inside each cell). Structured
 * document tags (SDTs) are printed but their content is not traversed yet.
 */
public class WordReadAllContent {

  /**
   * Prints each picture followed by the picture data it refers to.
   *
   * @param pictures the pictures embedded in a single run
   * @throws Exception declared for signature compatibility with the other traversal methods
   */
  static void traversePictures(List<XWPFPicture> pictures) throws Exception {
    for (XWPFPicture picture : pictures) {
      System.out.println(picture);
      XWPFPictureData pictureData = picture.getPictureData();
      System.out.println(pictureData);
    }
  }

  /**
   * Prints each run element of a paragraph and descends into the pictures
   * embedded in runs.
   *
   * @param runElements the run elements of one paragraph, in document order
   * @throws Exception if traversing the embedded pictures fails
   */
  static void traverseRunElements(List<IRunElement> runElements) throws Exception {
    for (IRunElement runElement : runElements) {
      if (runElement instanceof XWPFRun) {
        // XWPFFieldRun and XWPFHyperlinkRun are subclasses of XWPFRun and were
        // handled by identical branches; printing the runtime class name keeps
        // them distinguishable in the output.
        XWPFRun run = (XWPFRun) runElement;
        System.out.println(run.getClass().getName());
        System.out.println(run);
        traversePictures(run.getEmbeddedPictures());
      } else if (runElement instanceof XWPFSDT) {
        XWPFSDT sDT = (XWPFSDT) runElement;
        System.out.println(sDT);
        System.out.println(sDT.getContent());
        //ToDo: The SDT may have traversable content too.
      }
    }
  }

  /**
   * Prints each cell of a table row and descends into the body elements of
   * ordinary table cells.
   *
   * @param tableICells the cells of one table row, in document order
   * @throws Exception if traversing the nested body elements fails
   */
  static void traverseTableCells(List<ICell> tableICells) throws Exception {
    for (ICell tableICell : tableICells) {
      if (tableICell instanceof XWPFSDTCell) {
        XWPFSDTCell sDTCell = (XWPFSDTCell) tableICell;
        System.out.println(sDTCell);
        //ToDo: The SDTCell may have traversable content too.
      } else if (tableICell instanceof XWPFTableCell) {
        XWPFTableCell tableCell = (XWPFTableCell) tableICell;
        System.out.println(tableCell);
        // A cell holds body elements of its own, so the walk recurses here.
        traverseBodyElements(tableCell.getBodyElements());
      }
    }
  }

  /**
   * Prints each row of a table and descends into its cells.
   *
   * @param tableRows the rows of one table, in document order
   * @throws Exception if traversing the cells fails
   */
  static void traverseTableRows(List<XWPFTableRow> tableRows) throws Exception {
    for (XWPFTableRow tableRow : tableRows) {
      System.out.println(tableRow);
      traverseTableCells(tableRow.getTableICells());
    }
  }

  /**
   * Prints each body element and descends into paragraphs and tables.
   * Elements of any other type are skipped silently.
   *
   * @param bodyElements the body elements of a document or table cell, in document order
   * @throws Exception if traversing a paragraph or table fails
   */
  static void traverseBodyElements(List<IBodyElement> bodyElements) throws Exception {
    for (IBodyElement bodyElement : bodyElements) {
      if (bodyElement instanceof XWPFParagraph) {
        XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
        System.out.println(paragraph);
        traverseRunElements(paragraph.getIRuns());
      } else if (bodyElement instanceof XWPFSDT) {
        XWPFSDT sDT = (XWPFSDT) bodyElement;
        System.out.println(sDT);
        System.out.println(sDT.getContent());
        //ToDo: The SDT may have traversable content too.
      } else if (bodyElement instanceof XWPFTable) {
        XWPFTable table = (XWPFTable) bodyElement;
        System.out.println(table);
        traverseTableRows(table.getRows());
      }
    }
  }

  /**
   * Opens a *.docx file and prints its content in document order.
   *
   * @param args optional; {@code args[0]} is the path of the document to read,
   *     defaulting to {@code ./WordDocument.docx} when absent
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String[] args) throws Exception {

    String inFilePath = (args != null && args.length > 0) ? args[0] : "./WordDocument.docx";

    // try-with-resources closes both the document and the underlying stream,
    // even when the traversal throws.
    try (FileInputStream in = new FileInputStream(inFilePath);
         XWPFDocument document = new XWPFDocument(in)) {
      traverseBodyElements(document.getBodyElements());
    }
  }

}

这只是一份工作草案,我肯定还遗漏了一些内容。