我已经编写了一个java基本程序,它使用Apache POI将3种文件(ppt,doc,txt)嵌入到Excel工作表中。现在我要以原始格式导出此文件。怎么做?
参考链接为 Embed files into Excel using Apache POI,我的程序就是参照该链接编写的。
简而言之,我希望实现嵌入文件的导出功能。
我使用下面的代码尝试了上述问题,但它不能用于在Excel工作表中导出嵌入式文件:
这是试图解决的代码:
xlsread(file.csv);
那么,导出功能该如何实现?
答案 0 :(得分:2)
这个问题与 How to get pictures with names from an xls file using Apache POI 部分重复,我曾为那个问题编写了最初的代码片段(paste)。
根据请求,我还添加了一个如何在OLE 1.0 packager的帮助下添加和嵌入的示例 - 与此同时我已将代码添加到POI,因此现在更容易。对于基于OOXML的文件,请查看this answer。
因此代码遍历DrawingPatriarch的所有形状并提取图片和嵌入文件。
我已经将完整的代码 - 而不是代码段 - 添加到了这个答案中,因为我预计下一个“为什么我不能导出这种嵌入”很快就会出现......
package poijartest;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.poi.ddf.EscherComplexProperty;
import org.apache.poi.ddf.EscherOptRecord;
import org.apache.poi.ddf.EscherProperty;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hssf.usermodel.HSSFClientAnchor;
import org.apache.poi.hssf.usermodel.HSSFObjectData;
import org.apache.poi.hssf.usermodel.HSSFPatriarch;
import org.apache.poi.hssf.usermodel.HSSFPicture;
import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.hssf.usermodel.HSSFShape;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFSimpleShape;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.usermodel.AutoShape;
import org.apache.poi.sl.usermodel.ShapeType;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFPicture;
import org.apache.poi.xssf.usermodel.XSSFPictureData;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTPicture;
/**
* Tested with POI 3.16-beta1
*
* 17.12.2014: original version for
* http://apache-poi.1045710.n5.nabble.com/How-to-get-the-full-file-name-of-a-picture-in-xls-file-td5717205.html
*
* 17.12.2016: added sample/dummy data for
* https://stackoverflow.com/questions/41101012/how-to-export-embeded-file-which-from-excel-using-poi
*/
public class EmbeddedReader {
private File excel_file;
private ImageReader image_reader;
/**
 * Demo entry point: creates the sample workbook "bla.xls", extracts every
 * embedding found in it and writes each one to a file named after the
 * embedding in the current working directory.
 *
 * @param args ignored
 * @throws Exception if creating, reading or writing any of the files fails
 */
public static void main(String[] args) throws Exception {
    File sample = new File("bla.xls");
    getSampleEmbedded(sample);
    ImageReader ir = new ImageReader(sample);
    try {
        for (EmbeddedData ed : ir.embeddings) {
            System.out.println(ed.filename);
            FileOutputStream fos = new FileOutputStream(ed.filename);
            try {
                IOUtils.copy(ed.is, fos);
            } finally {
                // close the target file even when copying fails
                fos.close();
            }
        }
    } finally {
        // always release the workbook and the embedded streams
        ir.close();
    }
}
/**
 * Writes a sample .xls workbook that contains one embedded OLE package
 * ("dummy.ppt") shown with a PNG preview picture.
 *
 * @param sample the file the workbook is written to
 * @throws IOException if the sample data cannot be created or written
 */
static void getSampleEmbedded(File sample) throws IOException {
    HSSFWorkbook wb = new HSSFWorkbook();
    try {
        int storageId = wb.addOlePackage(getSamplePPT(), "dummy.ppt", "dummy.ppt", "dummy.ppt");
        int picId = wb.addPicture(getSamplePng(), HSSFPicture.PICTURE_TYPE_PNG);
        HSSFSheet sheet = wb.createSheet();
        HSSFPatriarch pat = sheet.createDrawingPatriarch();
        HSSFClientAnchor anc = pat.createAnchor(0, 0, 0, 0, 1, 1, 3, 6);
        HSSFObjectData od = pat.createObjectData(anc, storageId, picId);
        od.setNoFill(true);
        wb.write(sample);
    } finally {
        // release the workbook even when building or writing it fails
        wb.close();
    }
}
/**
 * Produces a small PNG image used as preview picture for the sample embedding.
 * <p>
 * The image is taken from a Swing icon looked up through the context class
 * loader. That resource may not be resolvable on every runtime, so a blank
 * 16x16 image is used when it cannot be located or decoded instead of failing
 * with an {@link IllegalArgumentException}.
 *
 * @return the PNG encoded image bytes
 * @throws IOException if the image cannot be read or no PNG writer is available
 */
static byte[] getSamplePng() throws IOException {
    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    URL imgUrl = (cl == null)
        ? null
        : cl.getResource("javax/swing/plaf/metal/icons/ocean/directory.gif");
    // ImageIO.read(URL) rejects null and returns null when no reader matches
    BufferedImage img = (imgUrl == null) ? null : ImageIO.read(imgUrl);
    if (img == null) {
        img = new BufferedImage(16, 16, BufferedImage.TYPE_INT_ARGB);
    }
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    if (!ImageIO.write(img, "PNG", bos)) {
        throw new IOException("No PNG image writer available");
    }
    return bos.toByteArray();
}
/**
 * Builds a one-slide sample presentation (a red 32-point star) and returns it
 * as the bytes of an OLE2 file whose root storage class id is set to
 * {@code ClassID.PPT_SHOW}.
 *
 * @return the serialized presentation
 * @throws IOException if the presentation cannot be written
 */
static byte[] getSamplePPT() throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    HSLFSlideShow ppt = new HSLFSlideShow();
    try {
        Slide<?,?> slide = ppt.createSlide();
        AutoShape<?,?> sh1 = slide.createAutoShape();
        sh1.setShapeType(ShapeType.STAR_32);
        sh1.setAnchor(new java.awt.Rectangle(50, 50, 100, 200));
        sh1.setFillColor(Color.red);
        ppt.write(bos);
    } finally {
        ppt.close();
    }
    // re-open the written bytes as a file system to stamp the root class id
    POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(bos.toByteArray()));
    try {
        poifs.getRoot().setStorageClsid(ClassID.PPT_SHOW);
        bos.reset();
        poifs.writeFilesystem(bos);
    } finally {
        poifs.close();
    }
    return bos.toByteArray();
}
/**
 * Creates a reader for the workbook at the given path and immediately
 * collects its embeddings.
 *
 * @param excel_path path of the Excel file to read
 * @throws IOException declared by the underlying {@code ImageReader} constructor
 */
public EmbeddedReader(String excel_path) throws IOException {
    this.excel_file = new File(excel_path);
    this.image_reader = new ImageReader(this.excel_file);
}
/**
 * Lists the file names of all collected embeddings, in collection order.
 *
 * @return one entry per embedding; empty when nothing was found
 */
public String[] get_file_names() {
    List<EmbeddedData> found = image_reader.embeddings;
    String[] names = new String[found.size()];
    for (int i = 0; i < names.length; i++) {
        names[i] = found.get(i).filename;
    }
    return names;
}
/**
 * Looks up the data stream of the first embedding with the given file name.
 *
 * @param file_name the name to search for
 * @return the stream of the first match, or {@code null} when no embedding has that name
 */
public InputStream get_stream(String file_name) {
    for (EmbeddedData candidate : image_reader.embeddings) {
        if (file_name.equals(candidate.filename)) {
            return candidate.is;
        }
    }
    return null;
}
/**
 * Opens an Excel workbook and collects its pictures and embedded objects as
 * {@link EmbeddedData} entries in {@link #embeddings}.
 * <p>
 * For .xls workbooks the embedded OLE objects of the whole workbook are
 * collected. Pictures are read only from the sheet named "Receipt images";
 * a workbook without that sheet yields no pictures.
 */
static class ImageReader implements Closeable {
    /** Object names of the form ARGF1234.pdf, which addSheetPics replaces by the shape name. */
    private static final Pattern GENERATED_PDF_NAME = Pattern.compile("^[A-Z0-9]{8}\\.[pdfPDF]{3}$");
    /** Fixed charset for the PDF marker bytes so the search does not depend on the platform default. */
    private static final Charset ASCII = Charset.forName("US-ASCII");

    EmbeddedExtractor extractors[] = {
        new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
    };
    List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
    Workbook wb;

    /**
     * Opens the workbook and collects its embeddings.
     *
     * @param excelfile the workbook file
     * @throws IOException if the workbook cannot be opened or an embedding cannot be read;
     *         the workbook is closed again before the exception is propagated
     */
    public ImageReader(File excelfile) throws IOException {
        try {
            wb = WorkbookFactory.create(excelfile);
        } catch (IOException e) {
            throw e;
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            // any other checked failure reported by the factory
            throw new IOException("Cannot open workbook " + excelfile, e);
        }
        boolean complete = false;
        try {
            Sheet receiptImages = wb.getSheet("Receipt images");
            if (wb instanceof XSSFWorkbook) {
                addSheetPicsAndEmbedds((XSSFSheet)receiptImages);
            } else {
                addAllEmbedds((HSSFWorkbook)wb);
                addSheetPics((HSSFSheet)receiptImages);
            }
            complete = true;
        } finally {
            if (!complete) {
                try {
                    releaseAll();
                } catch (IOException ignored) {
                    // the extraction failure is the one worth reporting
                }
            }
        }
    }

    /**
     * Adds every picture of the given .xlsx sheet to {@link #embeddings}.
     * Does nothing when the sheet is {@code null}.
     */
    protected void addSheetPicsAndEmbedds(XSSFSheet sheet) throws IOException {
        if (sheet == null) {
            return;
        }
        XSSFDrawing draw = sheet.createDrawingPatriarch();
        for (XSSFShape shape : draw.getShapes()) {
            if (!(shape instanceof XSSFPicture)) {
                continue;
            }
            XSSFPicture picture = (XSSFPicture)shape;
            XSSFPictureData pd = picture.getPictureData();
            PackagePart pp = pd.getPackagePart();
            CTPicture ctPic = picture.getCTPicture();
            String filename = null;
            try {
                filename = ctPic.getNvPicPr().getCNvPr().getName();
            } catch (Exception ignored) {
                // best effort: fall back to the package part name below
            }
            if (filename == null || "".equals(filename)) {
                filename = new File(pp.getPartName().toString()).getName();
            }
            EmbeddedData ed = new EmbeddedData();
            ed.filename = fileNameWithoutPath(filename);
            ed.is = pp.getInputStream();
            embeddings.add(ed);
        }
    }

    /**
     * Extracts all embedded objects of an .xls workbook using the first
     * extractor that accepts the object's directory.
     * <p>
     * The shape name, when available, replaces the extracted name; otherwise
     * the extracted name is kept, so the file name is never {@code null}.
     */
    protected void addAllEmbedds(HSSFWorkbook hwb) throws IOException {
        for (HSSFObjectData od : hwb.getAllEmbeddedObjects()) {
            String alternativeName = getAlternativeName(od);
            if (!od.hasDirectoryEntry()) {
                continue;
            }
            DirectoryNode src = (DirectoryNode)od.getDirectory();
            for (EmbeddedExtractor ee : extractors) {
                if (!ee.canExtract(src)) {
                    continue;
                }
                EmbeddedData ed = ee.extract(src);
                // trimmed like the picture names in addSheetPics
                String shapeName = (alternativeName == null) ? "" : alternativeName.trim();
                if (shapeName.length() > 0) {
                    ed.filename = shapeName;
                } else if (ed.filename == null) {
                    ed.filename = src.getName();
                }
                ed.filename = fileNameWithoutPath(ed.filename);
                ed.source = "object";
                embeddings.add(ed);
                break;
            }
        }
    }

    /**
     * Reads the "groupshape.shapename" property of a shape.
     *
     * @return the property decoded as UTF-16LE, or {@code null} when the shape has no such property
     */
    protected String getAlternativeName(HSSFShape shape) {
        EscherOptRecord eor = reflectEscherOptRecord(shape);
        if (eor == null) {
            return null;
        }
        for (EscherProperty ep : eor.getEscherProperties()) {
            if ("groupshape.shapename".equals(ep.getName()) && ep.isComplex()) {
                return new String(((EscherComplexProperty)ep).getComplexData(),
                    Charset.forName("UTF-16LE"));
            }
        }
        return null;
    }

    /**
     * Adds the pictures of the given .xls sheet to {@link #embeddings}.
     * <p>
     * EMF pictures are only added when they carry a PDF (searched via the
     * "%PDF-" and "%%EOF" markers). An EMF picture without PDF data is
     * treated as the preview of an embedded object and may only rename that
     * object. Names already present in {@link #embeddings} are skipped.
     */
    protected void addSheetPics(HSSFSheet sheet) {
        if (sheet == null) {
            return;
        }
        HSSFPatriarch patriarch = sheet.getDrawingPatriarch();
        if (patriarch == null) {
            return;
        }
        int picIdx = 0;
        int emfIdx = 0;
        byte[] pdfStart = "%PDF-".getBytes(ASCII);
        byte[] pdfEnd = "%%EOF".getBytes(ASCII);
        for (HSSFShape shape : patriarch.getChildren()) {
            if (!(shape instanceof HSSFPicture)) {
                continue;
            }
            HSSFPicture picture = (HSSFPicture) shape;
            if (picture.getShapeType() != HSSFSimpleShape.OBJECT_TYPE_PICTURE) {
                continue;
            }
            HSSFPictureData pd = picture.getPictureData();
            if (pd == null) {
                continue;
            }
            byte pictureBytes[] = pd.getData();
            int pictureBytesOffset = 0;
            int pictureBytesLen = pictureBytes.length;
            String filename = picture.getFileName();
            // try to find an alternative name
            if (filename == null || "".equals(filename)) {
                filename = getAlternativeName(picture);
            }
            if (filename != null) {
                filename = filename.trim();
            }
            // default to dummy name
            if (filename == null || "".equals(filename)) {
                filename = "picture"+(picIdx++);
            }
            // check for emf+ embedded pdf (poor mans style :( )
            // Mac Excel 2011 embeds pdf files with this method.
            boolean validFile = true;
            if (pd.getFormat() == Workbook.PICTURE_TYPE_EMF) {
                validFile = false;
                int idxStart = indexOf(pictureBytes, 0, pdfStart);
                if (idxStart != -1) {
                    int idxEnd = indexOf(pictureBytes, idxStart, pdfEnd);
                    if (idxEnd != -1) {
                        pictureBytesOffset = idxStart;
                        // the 5 marker bytes plus one more byte; ByteArrayInputStream
                        // clamps the length to the end of the array
                        pictureBytesLen = idxEnd-idxStart+6;
                        validFile = true;
                    }
                } else {
                    // This shape was not a Mac Excel 2011 embedded pdf file.
                    // So this is a shape related to a regular embedded object
                    // Lets update the object filename with the shapes filename
                    // if the object filename is of format ARGF1234.pdf
                    if (emfIdx < embeddings.size()) {
                        EmbeddedData ed_obj = embeddings.get(emfIdx);
                        if (ed_obj.filename != null
                                && GENERATED_PDF_NAME.matcher(ed_obj.filename).matches()) {
                            ed_obj.filename = filename;
                        }
                    }
                    emfIdx += 1;
                }
            }
            if (!validFile) {
                continue;
            }
            String name = fileNameWithoutPath(filename);
            if (!fileNotInEmbeddings(name)) {
                continue;
            }
            EmbeddedData ed = new EmbeddedData();
            ed.filename = name;
            ed.is = new ByteArrayInputStream(pictureBytes, pictureBytesOffset, pictureBytesLen);
            embeddings.add(ed);
        }
    }

    /**
     * Calls the non-public {@code getOptRecord} method of a shape via reflection.
     *
     * @return the record, or {@code null} when the reflective call fails
     */
    private static EscherOptRecord reflectEscherOptRecord(HSSFShape shape) {
        try {
            Method m = HSSFShape.class.getDeclaredMethod("getOptRecord");
            m.setAccessible(true);
            return (EscherOptRecord)m.invoke(shape);
        } catch (Exception e) {
            // todo: log ... well actually "should not happen" ;)
            return null;
        }
    }

    /** Strips any leading directory part, accepting both '\' and '/' as separator. */
    private String fileNameWithoutPath(String filename) {
        if (filename == null) {
            return null;
        }
        int last_index = Math.max(filename.lastIndexOf('\\'), filename.lastIndexOf('/'));
        return filename.substring(last_index + 1);
    }

    /** Returns {@code true} when no collected embedding carries the given file name yet. */
    private boolean fileNotInEmbeddings(String filename) {
        for (EmbeddedData ed : embeddings) {
            boolean same = (filename == null) ? ed.filename == null : filename.equals(ed.filename);
            if (same) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes all embedded streams and the workbook. Every resource is
     * attempted even when an earlier one fails.
     *
     * @throws IOException the first failure encountered while closing
     */
    public void close() throws IOException {
        releaseAll();
    }

    /** Closes streams and workbook; remembers the first failure and throws it at the end. */
    private void releaseAll() throws IOException {
        IOException first = null;
        for (EmbeddedData ed : embeddings) {
            if (ed.is == null) {
                continue;
            }
            try {
                ed.is.close();
            } catch (IOException e) {
                if (first == null) {
                    first = e;
                }
            }
        }
        if (wb != null) {
            try {
                wb.close();
            } catch (IOException e) {
                if (first == null) {
                    first = e;
                }
            }
        }
        if (first != null) {
            throw first;
        }
    }
}
/** Plain holder describing one extracted picture or embedded object. */
static class EmbeddedData {
// file name of the embedding
String filename;
// stream delivering the bytes of the embedding
InputStream is;
// origin marker; set to "object" for entries created from embedded OLE objects
String source;
}
/**
 * Base class of the strategies that turn the directory of an embedded OLE
 * object into an {@link EmbeddedData}.
 */
static abstract class EmbeddedExtractor {
    /** Tells whether this extractor can handle the given directory. */
    abstract boolean canExtract(DirectoryNode dn);

    /** Extracts the embedded data from the given directory. */
    abstract EmbeddedData extract(DirectoryNode dn) throws IOException;

    /**
     * Copies the directory into a new, stand-alone OLE2 file system and
     * returns its serialized bytes under the given name.
     *
     * @param dn the directory to copy
     * @param filename the file name to assign to the result
     * @return the extracted data backed by an in-memory stream
     * @throws IOException if copying or serializing fails
     */
    protected EmbeddedData extractFS(DirectoryNode dn, String filename) throws IOException {
        assert(canExtract(dn));
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        POIFSFileSystem dest = new POIFSFileSystem();
        try {
            copyNodes(dn, dest.getRoot());
            dest.writeFilesystem(bos);
        } finally {
            // release the temporary file system even when copying fails
            dest.close();
        }
        EmbeddedData ed = new EmbeddedData();
        ed.filename = filename;
        ed.is = new ByteArrayInputStream(bos.toByteArray());
        return ed;
    }
}
static class Ole10Extractor extends EmbeddedExtractor {
public boolean canExtract(DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return ClassID.OLE10_PACKAGE.equals(clsId);
}
public EmbeddedData extract(DirectoryNode dn) throws IOException {
try {
Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
EmbeddedData ed = new EmbeddedData();
ed.filename = new File(ole10.getFileName()).getName();
ed.is = new ByteArrayInputStream(ole10.getDataBuffer());
return ed;
} catch (Ole10NativeException e) {
throw new IOException(e);
}
}
}
static class PdfExtractor extends EmbeddedExtractor {
static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");
public boolean canExtract(DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return (PdfClassID.equals(clsId)
|| dn.hasEntry("CONTENTS"));
}
public EmbeddedData extract(DirectoryNode dn) throws IOException {
EmbeddedData ed = new EmbeddedData();
ed.is = dn.createDocumentInputStream("CONTENTS");
ed.filename = dn.getName()+".pdf";
return ed;
}
}
static class WordExtractor extends EmbeddedExtractor {
public boolean canExtract(DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return (ClassID.WORD95.equals(clsId)
|| ClassID.WORD97.equals(clsId)
|| dn.hasEntry("WordDocument"));
}
public EmbeddedData extract(DirectoryNode dn) throws IOException {
return extractFS(dn, dn.getName()+".doc");
}
}
static class ExcelExtractor extends EmbeddedExtractor {
public boolean canExtract(DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return (ClassID.EXCEL95.equals(clsId)
|| ClassID.EXCEL97.equals(clsId)
|| dn.hasEntry("Workbook") /*...*/);
}
public EmbeddedData extract(DirectoryNode dn) throws IOException {
return extractFS(dn, dn.getName()+".xls");
}
}
static class FsExtractor extends EmbeddedExtractor {
public boolean canExtract(DirectoryNode dn) {
return true;
}
public EmbeddedData extract(DirectoryNode dn) throws IOException {
return extractFS(dn, dn.getName()+".dat");
}
}
/**
 * Recursively copies all entries of {@code src} into {@code dest}.
 * Sub-directories keep their storage class id; documents are copied by content.
 *
 * @param src the directory to read from
 * @param dest the directory to create the copies in
 * @throws IOException if an entry cannot be read or created
 */
private static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
    for (Entry e : src) {
        if (e instanceof DirectoryNode) {
            DirectoryNode srcDir = (DirectoryNode)e;
            DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
            destDir.setStorageClsid(srcDir.getStorageClsid());
            copyNodes(srcDir, destDir);
        } else {
            InputStream is = src.createDocumentInputStream(e);
            try {
                dest.createDocument(e.getName(), is);
            } finally {
                // close the source stream even when creating the document fails
                is.close();
            }
        }
    }
}
/**
 * Knuth-Morris-Pratt Algorithm for Pattern Matching.
 * Finds the first occurrence of the pattern in the data, starting the search
 * at the given offset.
 *
 * @param data the bytes to search in
 * @param offset index to start searching at; negative values are treated as 0
 * @param pattern the bytes to search for
 * @return the index of the first match at or after {@code offset}, or -1 when
 *         there is no match or when {@code data} or {@code pattern} is
 *         {@code null} or empty
 */
private static int indexOf(byte[] data, int offset, byte[] pattern) {
    if (data == null || pattern == null || data.length == 0 || pattern.length == 0) {
        // an empty pattern would otherwise index past the end of the pattern array
        return -1;
    }
    int[] failure = computeFailure(pattern);
    int j = 0;
    for (int i = Math.max(offset, 0); i < data.length; i++) {
        while (j > 0 && pattern[j] != data[i]) {
            j = failure[j - 1];
        }
        if (pattern[j] == data[i]) {
            j++;
        }
        if (j == pattern.length) {
            return i - pattern.length + 1;
        }
    }
    return -1;
}
/**
 * Builds the KMP failure table of a pattern by matching the pattern against
 * itself: entry {@code i} is the length of the longest proper prefix of
 * {@code pattern[0..i]} that is also a suffix of it.
 *
 * @param pattern the pattern bytes
 * @return a table with one entry per pattern byte (empty for an empty pattern)
 */
private static int[] computeFailure(byte[] pattern) {
    final int len = pattern.length;
    final int[] table = new int[len];
    int matched = 0;
    int pos = 1;
    while (pos < len) {
        final byte current = pattern[pos];
        // fall back to shorter prefixes until one can be extended
        while (matched > 0 && pattern[matched] != current) {
            matched = table[matched - 1];
        }
        if (pattern[matched] == current) {
            matched++;
        }
        table[pos] = matched;
        pos++;
    }
    return table;
}
}
答案 1 :(得分:2)
{{1}}
答案 2 :(得分:1)
必需的jar文件列表:
这是我的整个代码实现:
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.poi.ddf.EscherComplexProperty;
import org.apache.poi.ddf.EscherOptRecord;
import org.apache.poi.ddf.EscherProperty;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hssf.usermodel.HSSFClientAnchor;
import org.apache.poi.hssf.usermodel.HSSFObjectData;
import org.apache.poi.hssf.usermodel.HSSFPatriarch;
import org.apache.poi.hssf.usermodel.HSSFPicture;
import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.hssf.usermodel.HSSFShape;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFSimpleShape;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.usermodel.AutoShape;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFPicture;
import org.apache.poi.xssf.usermodel.XSSFPictureData;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTPicture;
public class EmbeddedReader {
public static final OleType OLE10_PACKAGE = new OleType("{0003000C-0000-0000-C000-000000000046}");
public static final OleType PPT_SHOW = new OleType("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
public static final OleType XLS_WORKBOOK = new OleType("{00020841-0000-0000-C000-000000000046}");
public static final OleType TXT_ONLY = new OleType("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
public static final OleType EXCEL97 = new OleType("{00020820-0000-0000-C000-000000000046}");
public static final OleType EXCEL95 = new OleType("{00020810-0000-0000-C000-000000000046}");
public static final OleType WORD97 = new OleType("{00020906-0000-0000-C000-000000000046}");
public static final OleType WORD95 = new OleType("{00020900-0000-0000-C000-000000000046}");
public static final OleType POWERPOINT97 = new OleType("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
public static final OleType POWERPOINT95 = new OleType("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
public static final OleType EQUATION30 = new OleType("{0002CE02-0000-0000-C000-000000000046}");
public static final OleType PdfClassID = new OleType("{B801CA65-A1FC-11D0-85AD-444553540000}");
private File excel_file;
private ImageReader image_reader;
/** Holds an OLE class id in its textual "{xxxxxxxx-xxxx-...}" form. */
static class OleType {
    final String classId;

    OleType(String classId) {
        this.classId = classId;
    }

    /**
     * Converts the textual id into a {@code ClassID} by writing the hex digit
     * pairs, in string order, into the byte array obtained from the new instance.
     */
    ClassID getClassID() {
        ClassID result = new ClassID();
        byte raw[] = result.getBytes();
        String hex = classId.replaceAll("[{}-]", "");
        for (int slot = 0, pos = 0; pos < hex.length(); slot++, pos += 2) {
            String pair = hex.substring(pos, pos + 2);
            raw[slot] = (byte) Integer.parseInt(pair, 16);
        }
        return result;
    }
}
/**
 * Demo entry point: extracts every embedding of the workbook
 * "D:\ole_ppt_in_xls.xls" into the "Desktop/sumit" folder below the user's
 * home directory, creating that folder when it does not exist yet.
 *
 * @param args ignored
 * @throws Exception if the workbook cannot be read or a file cannot be written
 */
public static void main(String[] args) throws Exception {
    File sample = new File("D:\\ole_ppt_in_xls.xls");
    File outDir = new File(System.getProperty("user.home") + "/Desktop" + "/sumit/");
    if (!outDir.isDirectory() && !outDir.mkdirs()) {
        throw new IOException("Cannot create output directory " + outDir);
    }
    ImageReader ir = new ImageReader(sample);
    try {
        for (EmbeddedData ed : ir.embeddings) {
            FileOutputStream fos = new FileOutputStream(new File(outDir, ed.filename));
            try {
                IOUtils.copy(ed.is, fos);
            } finally {
                // close the target file even when copying fails
                fos.close();
            }
        }
    } finally {
        // always release the workbook and the embedded streams
        ir.close();
    }
}
/**
 * Produces a small PNG image from a Swing icon looked up through the context
 * class loader.
 * <p>
 * That resource may not be resolvable on every runtime, so a blank 16x16
 * image is used when it cannot be located or decoded instead of failing with
 * an {@link IllegalArgumentException}.
 *
 * @return the PNG encoded image bytes
 * @throws IOException if the image cannot be read or no PNG writer is available
 */
static byte[] getSamplePng() throws IOException {
    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    URL imgUrl = (cl == null)
        ? null
        : cl.getResource("javax/swing/plaf/metal/icons/ocean/directory.gif");
    // ImageIO.read(URL) rejects null and returns null when no reader matches
    BufferedImage img = (imgUrl == null) ? null : ImageIO.read(imgUrl);
    if (img == null) {
        img = new BufferedImage(16, 16, BufferedImage.TYPE_INT_ARGB);
    }
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    if (!ImageIO.write(img, "PNG", bos)) {
        throw new IOException("No PNG image writer available");
    }
    return bos.toByteArray();
}
/**
 * Creates a reader for the workbook at the given path and immediately
 * collects its embeddings.
 *
 * @param excel_path path of the Excel file to read
 * @throws IOException declared by the underlying {@code ImageReader} constructor
 */
public EmbeddedReader(String excel_path) throws IOException {
    this.excel_file = new File(excel_path);
    this.image_reader = new ImageReader(this.excel_file);
}
/**
 * Lists the file names of all collected embeddings, in collection order.
 *
 * @return one entry per embedding; empty when nothing was found
 */
public String[] get_file_names() {
    List<EmbeddedData> found = image_reader.embeddings;
    String[] names = new String[found.size()];
    for (int i = 0; i < names.length; i++) {
        names[i] = found.get(i).filename;
    }
    return names;
}
/**
 * Looks up the data stream of the first embedding with the given file name.
 *
 * @param file_name the name to search for
 * @return the stream of the first match, or {@code null} when no embedding has that name
 */
public InputStream get_stream(String file_name) {
    for (EmbeddedData candidate : image_reader.embeddings) {
        if (file_name.equals(candidate.filename)) {
            return candidate.is;
        }
    }
    return null;
}
/**
 * Opens an Excel workbook and collects its pictures and embedded objects as
 * {@link EmbeddedData} entries in {@link #embeddings}.
 * <p>
 * For .xls workbooks the embedded OLE objects of the whole workbook are
 * collected. Pictures are read only from the sheet named "Receipt images";
 * a workbook without that sheet yields no pictures.
 */
static class ImageReader implements Closeable {
    /** Object names of the form ARGF1234.pdf, which addSheetPics replaces by the shape name. */
    private static final Pattern GENERATED_PDF_NAME = Pattern.compile("^[A-Z0-9]{8}\\.[pdfPDF]{3}$");
    /** Fixed charset for the PDF marker bytes so the search does not depend on the platform default. */
    private static final Charset ASCII = Charset.forName("US-ASCII");

    EmbeddedExtractor extractors[] = { new Ole10Extractor(),new PdfExtractor(), new WordExtractor(), new ExcelExtractor(),new FsExtractor() };
    List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
    Workbook wb;

    /**
     * Opens the workbook and collects its embeddings.
     *
     * @param excelfile the workbook file
     * @throws IOException if the workbook cannot be opened or an embedding cannot be read;
     *         the workbook is closed again before the exception is propagated
     */
    public ImageReader(File excelfile) throws IOException {
        try {
            wb = WorkbookFactory.create(excelfile);
        } catch (IOException e) {
            throw e;
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            // any other checked failure reported by the factory
            throw new IOException("Cannot open workbook " + excelfile, e);
        }
        boolean complete = false;
        try {
            Sheet receiptImages = wb.getSheet("Receipt images");
            if (wb instanceof XSSFWorkbook) {
                addSheetPicsAndEmbedds((XSSFSheet) receiptImages);
            } else {
                addAllEmbedds((HSSFWorkbook) wb);
                addSheetPics((HSSFSheet) receiptImages);
            }
            complete = true;
        } finally {
            if (!complete) {
                try {
                    releaseAll();
                } catch (IOException ignored) {
                    // the extraction failure is the one worth reporting
                }
            }
        }
    }

    /**
     * Adds every picture of the given .xlsx sheet to {@link #embeddings}.
     * Does nothing when the sheet is {@code null}.
     */
    protected void addSheetPicsAndEmbedds(XSSFSheet sheet)throws IOException {
        if (sheet == null) {
            return;
        }
        XSSFDrawing draw = sheet.createDrawingPatriarch();
        for (XSSFShape shape : draw.getShapes()) {
            if (!(shape instanceof XSSFPicture)) {
                continue;
            }
            XSSFPicture picture = (XSSFPicture) shape;
            XSSFPictureData pd = picture.getPictureData();
            PackagePart pp = pd.getPackagePart();
            CTPicture ctPic = picture.getCTPicture();
            String filename = null;
            try {
                filename = ctPic.getNvPicPr().getCNvPr().getName();
            } catch (Exception ignored) {
                // best effort: fall back to the package part name below
            }
            if (filename == null || "".equals(filename)) {
                filename = new File(pp.getPartName().toString()).getName();
            }
            EmbeddedData ed = new EmbeddedData();
            ed.filename = fileNameWithoutPath(filename);
            ed.is = pp.getInputStream();
            embeddings.add(ed);
        }
    }

    /**
     * Extracts all embedded objects of an .xls workbook using the first
     * extractor that accepts the object's directory. The shape name, when
     * available, replaces the extracted name.
     */
    protected void addAllEmbedds(HSSFWorkbook hwb) throws IOException {
        for (HSSFObjectData od : hwb.getAllEmbeddedObjects()) {
            String alternativeName = getAlternativeName(od);
            if (!od.hasDirectoryEntry()) {
                continue;
            }
            DirectoryNode src = (DirectoryNode) od.getDirectory();
            for (EmbeddedExtractor ee : extractors) {
                if (!ee.canExtract(src)) {
                    continue;
                }
                EmbeddedData ed = ee.extract(src);
                if (alternativeName != null) {
                    ed.filename = alternativeName;
                }
                ed.filename = fileNameWithoutPath(ed.filename);
                ed.source = "object";
                embeddings.add(ed);
                break;
            }
        }
    }

    /**
     * Reads the "groupshape.shapename" property of a shape.
     *
     * @return the property decoded as UTF-16LE, or {@code null} when the shape has no such property
     */
    protected String getAlternativeName(HSSFShape shape) {
        EscherOptRecord eor = reflectEscherOptRecord(shape);
        if (eor == null) {
            return null;
        }
        for (EscherProperty ep : eor.getEscherProperties()) {
            if ("groupshape.shapename".equals(ep.getName())
                    && ep.isComplex()) {
                return new String(
                    ((EscherComplexProperty) ep).getComplexData(),
                    Charset.forName("UTF-16LE"));
            }
        }
        return null;
    }

    /**
     * Adds the pictures of the given .xls sheet to {@link #embeddings}.
     * <p>
     * EMF pictures are only added when they carry a PDF (searched via the
     * "%PDF-" and "%%EOF" markers). An EMF picture without PDF data is
     * treated as the preview of an embedded object and may only rename that
     * object. Names already present in {@link #embeddings} are skipped.
     */
    protected void addSheetPics(HSSFSheet sheet) {
        if (sheet == null) {
            return;
        }
        HSSFPatriarch patriarch = sheet.getDrawingPatriarch();
        if (patriarch == null) {
            return;
        }
        int picIdx = 0;
        int emfIdx = 0;
        byte[] pdfStart = "%PDF-".getBytes(ASCII);
        byte[] pdfEnd = "%%EOF".getBytes(ASCII);
        // Loop through the objects
        for (HSSFShape shape : patriarch.getChildren()) {
            if (!(shape instanceof HSSFPicture)) {
                continue;
            }
            HSSFPicture picture = (HSSFPicture) shape;
            if (picture.getShapeType() != HSSFSimpleShape.OBJECT_TYPE_PICTURE) {
                continue;
            }
            HSSFPictureData pd = picture.getPictureData();
            if (pd == null) {
                continue;
            }
            byte pictureBytes[] = pd.getData();
            int pictureBytesOffset = 0;
            int pictureBytesLen = pictureBytes.length;
            String filename = picture.getFileName();
            // try to find an alternative name
            if (filename == null || "".equals(filename)) {
                filename = getAlternativeName(picture);
            }
            if (filename != null) {
                filename = filename.trim();
            }
            // default to dummy name
            if (filename == null || "".equals(filename)) {
                filename = "picture" + (picIdx++);
            }
            // check for emf+ embedded pdf (poor mans style :( )
            // Mac Excel 2011 embeds pdf files with this method.
            boolean validFile = true;
            if (pd.getFormat() == Workbook.PICTURE_TYPE_EMF) {
                validFile = false;
                int idxStart = indexOf(pictureBytes, 0, pdfStart);
                if (idxStart != -1) {
                    int idxEnd = indexOf(pictureBytes, idxStart, pdfEnd);
                    if (idxEnd != -1) {
                        pictureBytesOffset = idxStart;
                        // the 5 marker bytes plus one more byte; ByteArrayInputStream
                        // clamps the length to the end of the array
                        pictureBytesLen = idxEnd - idxStart + 6;
                        validFile = true;
                    }
                } else {
                    // This shape was not a Mac Excel 2011 embedded pdf file.
                    // So this is a shape related to a regular embedded object
                    // Lets update the object filename with the shapes filename
                    // if the object filename is of format ARGF1234.pdf
                    if (emfIdx < embeddings.size()) {
                        EmbeddedData ed_obj = embeddings.get(emfIdx);
                        if (ed_obj.filename != null
                                && GENERATED_PDF_NAME.matcher(ed_obj.filename).matches()) {
                            ed_obj.filename = filename;
                        }
                    }
                    emfIdx += 1;
                }
            }
            if (!validFile) {
                continue;
            }
            String name = fileNameWithoutPath(filename);
            if (!fileNotInEmbeddings(name)) {
                continue;
            }
            EmbeddedData ed = new EmbeddedData();
            ed.filename = name;
            ed.is = new ByteArrayInputStream(pictureBytes,
                pictureBytesOffset, pictureBytesLen);
            embeddings.add(ed);
        }
    }

    /**
     * Calls the non-public {@code getOptRecord} method of a shape via reflection.
     *
     * @return the record, or {@code null} when the reflective call fails
     */
    private static EscherOptRecord reflectEscherOptRecord(HSSFShape shape) {
        try {
            Method m = HSSFShape.class.getDeclaredMethod("getOptRecord");
            m.setAccessible(true);
            return (EscherOptRecord) m.invoke(shape);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Strips any leading directory part, accepting both '\' and '/' as separator. */
    private String fileNameWithoutPath(String filename) {
        if (filename == null) {
            return null;
        }
        int last_index = Math.max(filename.lastIndexOf('\\'), filename.lastIndexOf('/'));
        return filename.substring(last_index + 1);
    }

    /** Returns {@code true} when no collected embedding carries the given file name yet. */
    private boolean fileNotInEmbeddings(String filename) {
        for (EmbeddedData ed : embeddings) {
            boolean same = (filename == null) ? ed.filename == null : filename.equals(ed.filename);
            if (same) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes all embedded streams and the workbook. Every resource is
     * attempted even when an earlier one fails.
     *
     * @throws IOException the first failure encountered while closing
     */
    public void close() throws IOException {
        releaseAll();
    }

    /** Closes streams and workbook; remembers the first failure and throws it at the end. */
    private void releaseAll() throws IOException {
        IOException first = null;
        for (EmbeddedData ed : embeddings) {
            if (ed.is == null) {
                continue;
            }
            try {
                ed.is.close();
            } catch (IOException e) {
                if (first == null) {
                    first = e;
                }
            }
        }
        if (wb != null) {
            try {
                wb.close();
            } catch (IOException e) {
                if (first == null) {
                    first = e;
                }
            }
        }
        if (first != null) {
            throw first;
        }
    }
}
/** Plain holder describing one extracted picture or embedded object. */
static class EmbeddedData {
// file name of the embedding
String filename;
// stream delivering the bytes of the embedding
InputStream is;
// origin marker; set to "object" for entries created from embedded OLE objects
String source;
}
/**
 * Base class of the strategies that turn the directory of an embedded OLE
 * object into an {@link EmbeddedData}.
 */
static abstract class EmbeddedExtractor {
    /** Tells whether this extractor can handle the given directory. */
    abstract boolean canExtract(DirectoryNode dn);

    /** Extracts the embedded data from the given directory. */
    abstract EmbeddedData extract(DirectoryNode dn) throws IOException;

    /**
     * Copies the directory into a new, stand-alone OLE2 file system and
     * returns its serialized bytes under the given name.
     *
     * @param dn the directory to copy
     * @param filename the file name to assign to the result
     * @return the extracted data backed by an in-memory stream
     * @throws IOException if copying or serializing fails
     */
    protected EmbeddedData extractFS(DirectoryNode dn, String filename)
            throws IOException {
        assert (canExtract(dn));
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        POIFSFileSystem dest = new POIFSFileSystem();
        try {
            copyNodes(dn, dest.getRoot());
            dest.writeFilesystem(bos);
        } finally {
            // the temporary file system is the resource to release here,
            // not the in-memory output stream
            dest.close();
        }
        EmbeddedData ed = new EmbeddedData();
        ed.filename = filename;
        ed.is = new ByteArrayInputStream(bos.toByteArray());
        return ed;
    }
}
/** Extractor for directories whose storage class id is the OLE 1.0 package id. */
static class Ole10Extractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        // compare ClassID with ClassID: OleType does not override equals(),
        // so comparing the OleType itself never matches
        return OLE10_PACKAGE.getClassID().equals(clsId);
    }

    /**
     * Reads the OLE 1.0 native data; the result uses the last path element of
     * the stored file name and the raw data buffer.
     *
     * @throws IOException if the native data cannot be read or is malformed
     */
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        try {
            Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
            EmbeddedData ed = new EmbeddedData();
            ed.filename = new File(ole10.getFileName()).getName();
            ed.is = new ByteArrayInputStream(ole10.getDataBuffer());
            return ed;
        } catch (Ole10NativeException e) {
            // the cause travels with the rethrown exception
            throw new IOException(e);
        }
    }
}
/**
 * Extractor for embedded PDF documents, recognised by the PDF storage class
 * id or by the presence of a "CONTENTS" entry.
 */
static class PdfExtractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        // compare ClassID with ClassID: OleType does not override equals()
        return (PdfClassID.getClassID().equals(clsId) || dn.hasEntry("CONTENTS"));
    }

    /** Exposes the "CONTENTS" stream, named after the directory plus ".pdf". */
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        EmbeddedData ed = new EmbeddedData();
        ed.is = dn.createDocumentInputStream("CONTENTS");
        ed.filename = dn.getName() + ".pdf";
        return ed;
    }
}
/**
 * Extractor for embedded Word documents, recognised by the WORD95/WORD97
 * class ids or by a "WordDocument" entry.
 */
static class WordExtractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        // compare ClassID with ClassID: OleType does not override equals()
        return (WORD95.getClassID().equals(clsId)
            || WORD97.getClassID().equals(clsId)
            || dn.hasEntry("WordDocument"));
    }

    /** Copies the directory into a stand-alone file named after it plus ".doc". */
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        return extractFS(dn, dn.getName() + ".doc");
    }
}
/**
 * Extractor for embedded Excel workbooks, recognised by the EXCEL95/EXCEL97
 * class ids or by a "Workbook" entry.
 */
static class ExcelExtractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        // compare ClassID with ClassID: OleType does not override equals()
        return (EXCEL95.getClassID().equals(clsId)
            || EXCEL97.getClassID().equals(clsId)
            || dn.hasEntry("Workbook") /* ... */);
    }

    /** Copies the directory into a stand-alone file named after it plus ".xls". */
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        return extractFS(dn, dn.getName() + ".xls");
    }
}
static class FsExtractor extends EmbeddedExtractor {
public boolean canExtract(DirectoryNode dn) {
return true;
}
public EmbeddedData extract(DirectoryNode dn) throws IOException {
return extractFS(dn, dn.getName() + ".dat");
}
}
/**
 * Recursively copies all entries of {@code src} into {@code dest}.
 * Sub-directories keep their storage class id; documents are copied by content.
 *
 * @param src the directory to read from
 * @param dest the directory to create the copies in
 * @throws IOException if an entry cannot be read or created
 */
private static void copyNodes(DirectoryNode src, DirectoryNode dest)
        throws IOException {
    for (Entry e : src) {
        if (e instanceof DirectoryNode) {
            DirectoryNode srcDir = (DirectoryNode) e;
            DirectoryNode destDir = (DirectoryNode) dest
                .createDirectory(srcDir.getName());
            destDir.setStorageClsid(srcDir.getStorageClsid());
            copyNodes(srcDir, destDir);
        } else {
            InputStream is = src.createDocumentInputStream(e);
            try {
                dest.createDocument(e.getName(), is);
            } finally {
                // close the source stream even when creating the document fails
                is.close();
            }
        }
    }
}
/**
 * Knuth-Morris-Pratt Algorithm for Pattern Matching.
 * Finds the first occurrence of the pattern in the data, starting the search
 * at the given offset.
 *
 * @param data the bytes to search in
 * @param offset index to start searching at; negative values are treated as 0
 * @param pattern the bytes to search for
 * @return the index of the first match at or after {@code offset}, or -1 when
 *         there is no match or when {@code data} or {@code pattern} is
 *         {@code null} or empty
 */
private static int indexOf(byte[] data, int offset, byte[] pattern) {
    if (data == null || pattern == null || data.length == 0 || pattern.length == 0) {
        // an empty pattern would otherwise index past the end of the pattern array
        return -1;
    }
    int[] failure = computeFailure(pattern);
    int j = 0;
    for (int i = Math.max(offset, 0); i < data.length; i++) {
        while (j > 0 && pattern[j] != data[i]) {
            j = failure[j - 1];
        }
        if (pattern[j] == data[i]) {
            j++;
        }
        if (j == pattern.length) {
            return i - pattern.length + 1;
        }
    }
    return -1;
}
/**
 * Builds the KMP failure table of a pattern by matching the pattern against
 * itself: entry {@code i} is the length of the longest proper prefix of
 * {@code pattern[0..i]} that is also a suffix of it.
 *
 * @param pattern the pattern bytes
 * @return a table with one entry per pattern byte (empty for an empty pattern)
 */
private static int[] computeFailure(byte[] pattern) {
    final int len = pattern.length;
    final int[] table = new int[len];
    int matched = 0;
    int pos = 1;
    while (pos < len) {
        final byte current = pattern[pos];
        // fall back to shorter prefixes until one can be extended
        while (matched > 0 && pattern[matched] != current) {
            matched = table[matched - 1];
        }
        if (pattern[matched] == current) {
            matched++;
        }
        table[pos] = matched;
        pos++;
    }
    return table;
}
}
答案 3 :(得分:1)
为了简化嵌入数据的处理,我在POI中添加了一个提取器类,它将在POI 3.16-beta2或nightly版本中提供。
以下将提取.xls / x文件的对象 - 剩下的就是在某处写入嵌入的字节。只需扩展EmbeddedExtractor并提供自己的iterator()
方法,就可以扩展提取器类。
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.poi.ss.extractor.EmbeddedData;
import org.apache.poi.ss.extractor.EmbeddedExtractor;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
/**
 * Prints file name, content type and size of every object embedded in the
 * sheets of "bla.xlsx", using the extractor shipped with POI.
 */
public class BlaExtract {
    /**
     * @param args ignored
     * @throws Exception if the workbook cannot be read or an object cannot be extracted
     */
    public static void main(String[] args) throws Exception {
        Workbook wb;
        InputStream fis = new FileInputStream("bla.xlsx");
        try {
            wb = WorkbookFactory.create(fis);
        } finally {
            // the stream is no longer needed once the workbook is created (or creation failed)
            fis.close();
        }
        try {
            EmbeddedExtractor ee = new EmbeddedExtractor();
            for (Sheet s : wb) {
                for (EmbeddedData ed : ee.extractAll(s)) {
                    System.out.println(ed.getFilename()+" ("+ed.getContentType()+") - "+ed.getEmbeddedData().length+" bytes");
                }
            }
        } finally {
            // release the workbook even when extraction fails
            wb.close();
        }
    }
}