如何从pdf文件中复制突出显示的文本

时间:2018-10-30 05:33:56

标签: c#

我正在使用 iTextSharp 库开发一个 C# 应用程序,用于将两个不同 PDF 文件中的所有注释合并到另一个 PDF 文件中。我已经尝试了下面的代码:它能够找到突出显示的文本,但提取出的文本格式不正确。请帮帮我,先谢谢了。

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace PdfFileApp
{
/// <summary>
/// Console helper that prints the page text found inside the rectangle of
/// every highlight annotation in a PDF file.
/// </summary>
public class pdftotext
{
    /// <summary>Path of the PDF that is read when the caller does not supply one.</summary>
    private const string DefaultPdfPath = "D:\\DEMO_Supp_First Proof.pdf";

    /// <summary>
    /// Walks every page of the PDF, and for each annotation whose /Subtype is
    /// /Highlight extracts the page text filtered by the annotation's /Rect,
    /// writes it to the console, and then waits for a line of console input.
    /// </summary>
    /// <param name="pdfPath">
    /// Path of the PDF file to read. Defaults to the path that was previously
    /// hard-coded, so existing parameterless calls behave as before.
    /// </param>
    /// <remarks>
    /// Exceptions are not propagated to the caller; they are reported on the
    /// standard error stream instead of being silently discarded.
    /// </remarks>
    public static void ReadAnnotation(string pdfPath = DefaultPdfPath)
    {
        try
        {
            using (PdfReader reader = new PdfReader(pdfPath))
            {
                for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
                {
                    PdfArray annots = reader.GetPageN(pageNumber).GetAsArray(PdfName.ANNOTS);
                    if (annots == null)
                    {
                        // Page carries no annotation array at all.
                        continue;
                    }

                    foreach (PdfObject annot in annots.ArrayList)
                    {
                        // Resolve the array entry itself. The previous code also looked the
                        // annotation up with annots.GetAsDict(pageNumber), i.e. it used the
                        // page number as an index into the annotation array.
                        PdfDictionary annotation = PdfReader.GetPdfObject(annot) as PdfDictionary;
                        if (annotation == null)
                        {
                            continue;
                        }

                        PdfName subType = annotation.GetAsName(PdfName.SUBTYPE);
                        if (subType == null || !subType.Equals(PdfName.HIGHLIGHT))
                        {
                            continue;
                        }

                        iTextSharp.text.Rectangle region = ReadRect(annotation);
                        if (region == null)
                        {
                            // Missing or malformed /Rect: nothing to filter the text by.
                            continue;
                        }

                        // NOTE(review): /Rect is a single rectangle per annotation; for a highlight
                        // spanning several lines it presumably also covers text that is not
                        // highlighted. /QuadPoints may give tighter regions - confirm against the
                        // PDF specification before relying on this output.
                        RenderFilter[] filters = { new RegionTextRenderFilter(region) };
                        ITextExtractionStrategy strategy =
                            new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);

                        string highlighted = PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);

                        // Same console output as before: the text followed by one empty line.
                        Console.WriteLine(highlighted);
                        Console.WriteLine();

                        // Blocks until a line is read from the console before moving on.
                        Console.ReadLine();
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Keep the original "do not throw" behaviour, but make the failure visible.
            Console.Error.WriteLine("Failed to read annotations from '" + pdfPath + "': " + ex.Message);
        }
    }

    /// <summary>
    /// Builds a rectangle from the first four numbers of the annotation's /Rect entry.
    /// </summary>
    /// <param name="annotation">Annotation dictionary to read /Rect from.</param>
    /// <returns>
    /// The rectangle, or <c>null</c> when /Rect is absent, has fewer than four
    /// entries, or one of the first four entries is not a number.
    /// </returns>
    private static iTextSharp.text.Rectangle ReadRect(PdfDictionary annotation)
    {
        PdfArray rect = annotation.GetAsArray(PdfName.RECT);
        if (rect == null || rect.Size < 4)
        {
            return null;
        }

        PdfNumber llx = rect.GetAsNumber(0);
        PdfNumber lly = rect.GetAsNumber(1);
        PdfNumber urx = rect.GetAsNumber(2);
        PdfNumber ury = rect.GetAsNumber(3);
        if (llx == null || lly == null || urx == null || ury == null)
        {
            return null;
        }

        // Read the numeric values directly instead of round-tripping through ToString()/float.Parse.
        return new iTextSharp.text.Rectangle(llx.FloatValue, lly.FloatValue, urx.FloatValue, ury.FloatValue);
    }
}
}

0 个答案:

没有答案