我有一个要求,我必须解析DOCX文件并提取所有文本和图像。我正在使用OpenxmlSDK 2.5实现此目的。我能够解析图像和文本,但是DOCX也有一组Shapes,我试图解析它们并将其转换为Drawing图像,这给了我错误的结果。
Here是我要解析的示例docx文件。
我提到了这个Stack overflow discussion,并尝试了相同的方法,但是没有运气。
我使用以下代码创建的resulting DOCX没有解析的图像。
using System.Collections.Generic;
using System.Linq;
using System.IO;
using System.Drawing;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using DocumentFormat.OpenXml.Vml;
using DocumentFormat.OpenXml;
namespace ReadGroupShape
{
class Program
{
static List<Bitmap> images = new List<Bitmap>();
static void Main(string[] args)
{
MainDocumentPart mainPart = null;
Body content = null;
WordprocessingDocument newDoc = WordprocessingDocument.Create("NewDocx.docx", WordprocessingDocumentType.Document);
MainDocumentPart newMainPart = newDoc.AddMainDocumentPart();
newMainPart.Document = new Document();
Body newbody = newMainPart.Document.AppendChild(new Body());
byte[] docBytes = File.ReadAllBytes("SampleDoc.docx");
using (MemoryStream ms = new MemoryStream())
{
ms.Write(docBytes, 0, docBytes.Length);
using (WordprocessingDocument wpDoc = WordprocessingDocument.Open(ms, true))
{
mainPart = wpDoc.MainDocumentPart;
content = mainPart.Document.Body;
foreach (Paragraph par in content.Descendants<Paragraph>())
{
Paragraph npar = newbody.AppendChild(new Paragraph());
foreach (Run run in par.Descendants<Run>())
{
Run nrun = npar.AppendChild(new Run());
DocumentFormat.OpenXml.Drawing.Blip pic = run.Descendants<DocumentFormat.OpenXml.Drawing.Blip>().FirstOrDefault();
ImageData imageData = run.Descendants<ImageData>().FirstOrDefault();
if (pic == null && imageData == null)
{
nrun.InsertAfterSelf(run.CloneNode(true));
}
else
{
if (pic != null)
{
nrun.InsertAfterSelf(CreateImageFromBlip(wpDoc, run, newMainPart, pic));
}
else if (imageData != null)
{
nrun.InsertAfterSelf(CreateImageFromShape(wpDoc, run, newMainPart, imageData));
}
}
}
}
mainPart.Document.Save();
}
}
newMainPart.Document.Save();
newDoc.Close();
}
private static Run CreateImageFromShape(WordprocessingDocument sourceDoc, Run sourceRun, MainDocumentPart mainpart, ImageData imageData)
{
ImagePart p = sourceDoc.MainDocumentPart.GetPartById(imageData.RelationshipId) as ImagePart;
return CreateImageRun(sourceDoc, sourceRun, mainpart, p);
}
private static Run CreateImageFromBlip(WordprocessingDocument sourceDoc, Run sourceRun, MainDocumentPart mainpart, DocumentFormat.OpenXml.Drawing.Blip blip)
{
ImagePart newPart = mainpart.AddImagePart(ImagePartType.Png);
ImagePart p = sourceDoc.MainDocumentPart.GetPartById(blip.Embed.Value) as ImagePart;
Bitmap image = new Bitmap(p.GetStream());
using (Stream s = p.GetStream())
{
s.Position = 0;
newPart.FeedData(s);
}
string partId = mainpart.GetIdOfPart(newPart);
Drawing newImage = CreateImage(partId);
return new Run(newImage);
}
private static Run CreateImageRun(WordprocessingDocument sourceDoc, Run sourceRun, MainDocumentPart mainpart, ImagePart p)
{
ImagePart newPart = mainpart.AddImagePart(ImagePartType.Png);
using (Stream s = p.GetStream())
{
s.Position = 0;
newPart.FeedData(s);
}
string partId = mainpart.GetIdOfPart(newPart);
Drawing newImage = CreateImage(partId);
return new Run(newImage);
}
private static Drawing CreateImage(string relationshipId)
{
// Define the reference of the image.
return new Drawing(
new DocumentFormat.OpenXml.Drawing.Wordprocessing.Inline(
new DocumentFormat.OpenXml.Drawing.Wordprocessing.Extent() { Cx = 990000L, Cy = 792000L },
new DocumentFormat.OpenXml.Drawing.Wordprocessing.EffectExtent()
{
LeftEdge = 0L,
TopEdge = 0L,
RightEdge = 0L,
BottomEdge = 0L
},
new DocumentFormat.OpenXml.Drawing.Wordprocessing.DocProperties()
{
Id = (UInt32Value)1U,
Name = "Picture 1"
},
new DocumentFormat.OpenXml.Drawing.Wordprocessing.NonVisualGraphicFrameDrawingProperties(
new DocumentFormat.OpenXml.Drawing.GraphicFrameLocks() { NoChangeAspect = true }),
new DocumentFormat.OpenXml.Drawing.Graphic(
new DocumentFormat.OpenXml.Drawing.GraphicData(
new DocumentFormat.OpenXml.Drawing.Picture(
new DocumentFormat.OpenXml.Drawing.NonVisualPictureProperties(
new DocumentFormat.OpenXml.Drawing.NonVisualDrawingProperties()
{
Id = (UInt32Value)0U,
Name = "New Bitmap Image.jpg"
},
new DocumentFormat.OpenXml.Drawing.NonVisualPictureDrawingProperties()),
new DocumentFormat.OpenXml.Drawing.BlipFill(
new DocumentFormat.OpenXml.Drawing.Blip(
new DocumentFormat.OpenXml.Drawing.BlipExtensionList(
new DocumentFormat.OpenXml.Drawing.BlipExtension()
{
Uri =
"{28A0092B-C50C-407E-A947-70E740481C1C}"
})
)
{
Embed = relationshipId,
CompressionState =
DocumentFormat.OpenXml.Drawing.BlipCompressionValues.Print
},
new DocumentFormat.OpenXml.Drawing.Stretch(
new DocumentFormat.OpenXml.Drawing.FillRectangle())),
new DocumentFormat.OpenXml.Drawing.ShapeProperties(
new DocumentFormat.OpenXml.Drawing.Transform2D(
new DocumentFormat.OpenXml.Drawing.Offset() { X = 0L, Y = 0L },
new DocumentFormat.OpenXml.Drawing.Extents() { Cx = 990000L, Cy = 792000L }),
new DocumentFormat.OpenXml.Drawing.PresetGeometry(
new DocumentFormat.OpenXml.Drawing.AdjustValueList()
)
{ Preset = DocumentFormat.OpenXml.Drawing.ShapeTypeValues.Rectangle }))
)
{ Uri = "http://schemas.openxmlformats.org/drawingml/2006/picture" })
)
{
DistanceFromTop = (UInt32Value)0U,
DistanceFromBottom = (UInt32Value)0U,
DistanceFromLeft = (UInt32Value)0U,
DistanceFromRight = (UInt32Value)0U,
EditId = "50D07946"
});
}
}
}
我想念的是什么?有人可以帮我解析形状和图像吗?
谢谢。