从XPS文档中提取文本

时间:2012-09-04 11:04:20

标签: c# text extraction xps

我需要从XPS文档中提取特定页面的文本,并将提取出的文本保存到一个字符串中,以便使用 Microsoft 的 SpeechLib 将其朗读出来。请仅提供 C# 示例。

谢谢

4 个答案:

答案 0 :(得分:10)

添加对 ReachFramework 和 WindowsBase 程序集的引用,以及以下 using 声明:

using System.Windows.Xps.Packaging;

然后使用此代码:

// Extracts the text of a single page of an XPS document into _fullPageText.
// The text is read from the UnicodeString attribute of the <Glyphs> elements
// found in the fixed page's XML markup.
XpsDocument _xpsDocument = new XpsDocument("/path", System.IO.FileAccess.Read);
StringBuilder _currentText = new StringBuilder();
try
{
    IXpsFixedDocumentSequenceReader fixedDocSeqReader
        = _xpsDocument.FixedDocumentSequenceReader;
    IXpsFixedDocumentReader _document = fixedDocSeqReader.FixedDocuments[0];
    // NOTE(review): FixedPages is indexed from zero; confirm that MasterPageNumber
    // is zero-based as well, otherwise this selects the page after the displayed one.
    IXpsFixedPageReader _page
        = _document.FixedPages[documentViewerElement.MasterPageNumber];
    System.Xml.XmlReader _pageContentReader = _page.XmlReader;
    if (_pageContentReader != null)
    {
        while (_pageContentReader.Read())
        {
            if (_pageContentReader.Name != "Glyphs" || !_pageContentReader.HasAttributes)
            {
                continue;
            }

            // Read the attribute once instead of once for the null check and once for the append.
            string unicodeString = _pageContentReader.GetAttribute("UnicodeString");
            if (unicodeString != null)
            {
                _currentText.Append(unicodeString);
            }
        }
    }
}
finally
{
    // Always release the underlying package, even when reading throws.
    _xpsDocument.Close();
}
string _fullPageText = _currentText.ToString();

文本保存在 Glyphs 元素的 UnicodeString 属性中。您必须使用固定页面(FixedPage)的 XmlReader 来读取它。

答案 1 :(得分:1)

    /// <summary>
    ///   Reads the text of every page of an XPS file and returns it as one string.
    ///   The text is taken from the UnicodeString attribute of each Glyphs element,
    ///   concatenated in page order with no separator between runs or pages.
    /// </summary>
    /// <param name="fileName">Path of the XPS file to read.</param>
    /// <returns>
    ///   The concatenated text, or an empty string when the package exposes
    ///   no fixed document sequence.
    /// </returns>
    private string ReadXpsFile(string fileName)
    {
        XpsDocument _xpsDocument = new XpsDocument(fileName, System.IO.FileAccess.Read);
        try
        {
            IXpsFixedDocumentSequenceReader fixedDocSeqReader
                = _xpsDocument.FixedDocumentSequenceReader;
            if (fixedDocSeqReader == null)
            {
                return string.Empty;
            }

            // One builder for the whole file avoids repeated string concatenation.
            StringBuilder _fullPageText = new StringBuilder();

            // Walk the pages exposed by the readers themselves. Bounding the loop by the
            // sequence-wide page count while indexing only the first document's pages
            // runs out of range when the sequence holds more than one document.
            foreach (IXpsFixedDocumentReader _document in fixedDocSeqReader.FixedDocuments)
            {
                foreach (IXpsFixedPageReader _page in _document.FixedPages)
                {
                    System.Xml.XmlReader _pageContentReader = _page.XmlReader;
                    if (_pageContentReader == null)
                    {
                        continue;
                    }

                    while (_pageContentReader.Read())
                    {
                        if (_pageContentReader.Name != "Glyphs" || !_pageContentReader.HasAttributes)
                        {
                            continue;
                        }

                        string unicodeString = _pageContentReader.GetAttribute("UnicodeString");
                        if (unicodeString != null)
                        {
                            _fullPageText.Append(unicodeString);
                        }
                    }
                }
            }

            return _fullPageText.ToString();
        }
        finally
        {
            // Always release the underlying package, even when reading throws.
            _xpsDocument.Close();
        }
    }

答案 2 :(得分:1)

返回所有页面文本的方法(在 Amir 的代码基础上修改,希望没问题):

/// <summary>
///   Get all text strings from an XPS file.
///   Returns a list of lists (one for each page) containing the text strings.
///   Each string is the UnicodeString attribute of one Glyphs element; pages whose
///   XML reader is unavailable are skipped and contribute no entry.
/// </summary>
/// <param name="xpsFilePath">Path of the XPS file to read.</param>
/// <returns>
///   One list of strings per page, in document order, or null when the package
///   exposes no fixed document sequence.
/// </returns>
private static List<List<string>> ExtractTextFromXps(string xpsFilePath)
{
   const string UnicodeString = "UnicodeString";
   const string GlyphsString = "Glyphs";

   var xpsDocument = new XpsDocument(xpsFilePath, FileAccess.Read);
   try
   {
      var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
      if (fixedDocSeqReader == null)
      {
         // Null is kept for existing callers; the finally block still closes the document.
         return null;
      }

      var textLists = new List<List<string>>();
      foreach (IXpsFixedDocumentReader fixedDocumentReader in fixedDocSeqReader.FixedDocuments)
      {
         foreach (IXpsFixedPageReader pageReader in fixedDocumentReader.FixedPages)
         {
            var pageContentReader = pageReader.XmlReader;
            if (pageContentReader == null)
            {
               continue;
            }

            var texts = new List<string>();
            while (pageContentReader.Read())
            {
               if (pageContentReader.Name != GlyphsString || !pageContentReader.HasAttributes)
               {
                  continue;
               }

               // Read the attribute once instead of once for the check and once for the add.
               string text = pageContentReader.GetAttribute(UnicodeString);
               if (text != null)
               {
                  texts.Add(text);
               }
            }
            textLists.Add(texts);
         }
      }
      return textLists;
   }
   finally
   {
      // Runs on the early return and on exceptions too, so the package is never left open.
      xpsDocument.Close();
   }
}

用法:

// Print every text run of the document, grouped under a header per page.
var txtLists = ExtractTextFromXps(@"C:\myfile.xps");

int pageIdx = 0;
// ExtractTextFromXps returns null when the file has no fixed document sequence;
// enumerating null would throw, so guard before looping.
if (txtLists != null)
{
   foreach (List<string> txtList in txtLists)
   {
      pageIdx++;
      Console.WriteLine("== Page {0} ==", pageIdx);
      foreach (string txt in txtList)
      {
         Console.WriteLine(" "+txt);
      }
      Console.WriteLine();
   }
}

答案 3 :(得分:0)

完整的类代码:

using System.Collections.Generic;
using System.Drawing;
using System.Windows.Forms;
using System.Windows.Xps.Packaging;

namespace XPS_Data_Transfer
{
    /// <summary>
    ///   Reads the text of a single page of an XPS file.
    /// </summary>
    internal static class XpsDataReader
    {
        /// <summary>
        ///   Returns the text strings of one page of an XPS file. Each entry is the
        ///   UnicodeString attribute of one Glyphs element, passed through
        ///   Dashboard.CleanReversedPersianText.
        /// </summary>
        /// <param name="address">Path of the XPS file to read.</param>
        /// <param name="pageNumber">
        ///   One-based page number, counted across all fixed documents of the file
        ///   in sequence order.
        /// </param>
        /// <returns>
        ///   The page's text strings, or null when the package exposes no fixed
        ///   document sequence or the page has no XML reader.
        /// </returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        ///   <paramref name="pageNumber"/> is less than 1 or greater than the number of pages.
        /// </exception>
        public static List<string> ReadXps(string address, int pageNumber)
        {
            const string uniStr = "UnicodeString";
            const string glyphs = "Glyphs";

            if (pageNumber < 1)
            {
                throw new System.ArgumentOutOfRangeException("pageNumber");
            }

            var xpsDocument = new XpsDocument(address, System.IO.FileAccess.Read);
            try
            {
                var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
                if (fixedDocSeqReader == null) return null;

                var page = FindPage(fixedDocSeqReader, pageNumber);
                if (page == null)
                {
                    throw new System.ArgumentOutOfRangeException("pageNumber");
                }

                var pageContentReader = page.XmlReader;
                if (pageContentReader == null) return null;

                var currentText = new List<string>();
                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != glyphs) continue;
                    if (!pageContentReader.HasAttributes) continue;

                    // Read the attribute once instead of once for the check and once for the add.
                    var text = pageContentReader.GetAttribute(uniStr);
                    if (text != null)
                    {
                        currentText.Add(Dashboard.CleanReversedPersianText(text));
                    }
                }
                return currentText;
            }
            finally
            {
                // Runs on every return path and on exceptions, so the package is never left open.
                xpsDocument.Close();
            }
        }

        // Returns the pageNumber-th page (one-based) counting across all fixed documents,
        // or null when the file has fewer pages. Counting across documents, rather than
        // using the page number as a document index, makes later pages of a multi-page
        // document reachable.
        private static IXpsFixedPageReader FindPage(
            IXpsFixedDocumentSequenceReader sequenceReader, int pageNumber)
        {
            var remaining = pageNumber;
            foreach (var document in sequenceReader.FixedDocuments)
            {
                foreach (var page in document.FixedPages)
                {
                    remaining--;
                    if (remaining == 0)
                    {
                        return page;
                    }
                }
            }
            return null;
        }
    }
}

返回指定文件中指定页面的字符串数据列表。