我有一个使用Office 2010(和2007)构建的PowerPoint幻灯片,我需要以编程方式从中提取文本。我猜想Office会创建一个可能包含我需要的所有文本的xml文件。
有没有办法做到这一点?如果有,我该怎么做呢?
我手头可用的工具有 VS2010 和 SharePoint Designer 2007。
谢谢,
Risho
答案 0 :(得分:0)
是的,可以做到:既有比较复杂的方法,也有使用 Linq-to-XML 的更简单的方法。请注意,我没有使用 Open XML SDK——我只用了带 XML Literals 和 System.IO.Packaging 的 VB.NET。当然,您也可以使用 SDK、C# 等以更复杂的方式来实现,这取决于您的环境和偏好。
以下是第 2 种方法(简单方法)的实现:
Imports System.IO
Imports System.IO.Packaging 'Add reference to WindowsBase for this
Imports <xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
Imports <xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
Imports <xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
''' <summary>
''' Console sample that prints the text runs (a:t elements) of every slide in a
''' .pptx package, using System.IO.Packaging and LINQ to XML (no Open XML SDK).
''' </summary>
Module Module1

    ''' <summary>
    ''' Relationship type that links the package root to its main document part.
    ''' </summary>
    Public Const documentRelationshipType As String = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"

    ''' <summary>
    ''' Opens the presentation, walks the slide id list in order and writes each
    ''' slide's text runs to the console, then waits for Enter.
    ''' </summary>
    Sub Main()
        Dim filePath As String = "C:\Users\Todd\Desktop\yourpresentation.pptx"

        ' The package is only read, so request read-only access: this also works
        ' for read-only files and avoids taking a write lock on the presentation.
        Using pptPackage As Package = Package.Open(filePath, FileMode.Open, FileAccess.Read)
            Dim documentRelationship As PackageRelationship =
                pptPackage.GetRelationshipsByType(documentRelationshipType).FirstOrDefault()

            If documentRelationship Is Nothing Then
                ' FirstOrDefault yields Nothing when the package has no main
                ' document relationship; report it instead of dereferencing Nothing.
                Console.WriteLine("No main document part found in: " & filePath)
            Else
                Dim documentUri As Uri = PackUriHelper.ResolvePartUri(New Uri("/", UriKind.Relative), documentRelationship.TargetUri)
                Dim documentPart As PackagePart = pptPackage.GetPart(documentUri)
                Dim document As XElement = LoadPartXml(documentPart)

                ' Enumerate the slide ids once; indexing a LINQ query with
                ' Count/(i) would re-run the query on every iteration.
                For Each slideId In document.<p:sldIdLst>.<p:sldId>
                    Dim slideReference As String = slideId.@r:id

                    ' Skip entries without a usable relationship id rather than crashing.
                    If String.IsNullOrEmpty(slideReference) OrElse
                       Not documentPart.RelationshipExists(slideReference) Then
                        Continue For
                    End If

                    Dim slideUri As Uri = PackUriHelper.ResolvePartUri(documentPart.Uri, documentPart.GetRelationship(slideReference).TargetUri)
                    Dim slide As XElement = LoadPartXml(pptPackage.GetPart(slideUri))

                    ' Every descendant a:t element holds one run of text.
                    For Each t In slide...<a:t>
                        Console.WriteLine(t.Value)
                    Next
                Next
            End If
        End Using

        Console.ReadLine()
    End Sub

    ''' <summary>
    ''' Loads the XML content of a package part and releases the part stream.
    ''' </summary>
    ''' <param name="part">The package part to read.</param>
    ''' <returns>The root element of the part's XML.</returns>
    Private Function LoadPartXml(ByVal part As PackagePart) As XElement
        ' Disposing the reader also closes the underlying part stream.
        Using reader As New StreamReader(part.GetStream(FileMode.Open, FileAccess.Read))
            Return XElement.Load(reader)
        End Using
    End Function

End Module