找到注释纯文本

时间:2016-07-11 14:29:17

标签: c# pdf itext

我给PDF中的一部分文本添加了注释(如下所示)。我已经成功找到了这个注释,但如何取回它所对应的纯文本?该注释包含以下字段:

-       Keys    Count = 12  Dictionary<PdfName,PdfObject>.KeyCollection
+       [0] {/C}    PdfName
+       [1] {/F}    PdfName
+       [2] {/M}    PdfName
+       [3] {/P}    PdfName
+       [4] {/T}    PdfName
+       [5] {/AP}   PdfName
+       [6] {/NM}   PdfName
+       [7] {/Rect} PdfName
+       [8] {/Subj} PdfName
+       [9] {/Subtype}  PdfName
+       [10]    {/QuadPoints}   PdfName
+       [11]    {/CreationDate} PdfName

我尝试查找 '/NM' 的值与 'reader.GetNamedDestinationFromNames()' 和 'reader.GetNamedDestinationFromStrings()' 返回的字典之间的对应关系,但这两个字典都是空的。

1 个答案:

答案 0 :(得分:0)

感谢大家的帮助;)
下面是我的答案(实现比较笨重,但可以工作)。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;


using iTextSharp.text.pdf;

// PdfTextExtractor
using iTextSharp.text.pdf.parser;

namespace PdfParsingiTextSharp {

然后是标记集合的代码示例......

/// <summary>
/// Marker element used to build a sortable collection of document markers.
/// A marker is either a "signet" (presumably a bookmark/outline entry — the
/// code that creates signets is not shown here) or an annotation.
/// </summary>
public class cMark : IComparable {

    /// <summary>Kind of marker.</summary>
    public enum TypeMarker {
        TypeSignet,
        TypeAnnotation
    }

    /// <summary>Text-markup flavour of an annotation marker.</summary>
    public enum TypeAnnotationSubType {
        TypeAnnotation_NONE,
        TypeAnnotation_UNDERLINE,
        TypeAnnotation_HIGHLIGHT,
        TypeAnnotation_STRIKEOUT,
        TypeAnnotation_SQUIGGLY
    }

    public TypeMarker eType;
    public TypeAnnotationSubType eAnnotationSubType;

    // level of signet (-1 until set)
    public int signetLevel;
    // page in document (-1 until set)
    public int pageNum;
    // indirect reference of page (-1 until set)
    public int pageRef;
    // text of signet or annotation
    public String title;
    // area rectangle of annotation (null when the marker has no rectangle)
    public iTextSharp.text.Rectangle annotRect;

    /// <summary>
    /// Creates a marker of the given kind with every other field reset to its
    /// "unset" value (-1 for numbers, empty title, no rectangle).
    /// </summary>
    /// <param name="p_eType">Kind of marker.</param>
    /// <param name="p_TypeAnnotationSubType">Markup sub type of the marker.</param>
    public cMark( TypeMarker p_eType, TypeAnnotationSubType p_TypeAnnotationSubType) {
        eType = p_eType;
        eAnnotationSubType = p_TypeAnnotationSubType;
        signetLevel = -1;
        pageNum = -1;
        pageRef = -1;
        title = "";
        annotRect = null;
    }

    /// <summary>
    /// Compares first on page, then on row (rectangle Top), and finally on
    /// column (rectangle Left).
    /// </summary>
    /// <remarks>
    /// Each key is evaluated as <c>other.CompareTo(this)</c>, so a default sort
    /// puts larger page numbers, larger Top and larger Left values first.
    /// NOTE(review): descending Top matches top-to-bottom reading order in PDF
    /// coordinates; confirm that descending page/Left order is also intended.
    /// </remarks>
    /// <param name="obj">The other marker; must be a <see cref="cMark"/> or null.</param>
    /// <returns>
    /// A negative, zero or positive value as required by <see cref="IComparable"/>.
    /// Markers on the same page compare equal when either one has no rectangle.
    /// </returns>
    /// <exception cref="InvalidCastException">
    /// <paramref name="obj"/> is not a <see cref="cMark"/>.
    /// </exception>
    public int CompareTo( object obj ) {
        // IComparable convention: every instance sorts after null.
        if (obj == null) {
            return 1;
        }

        cMark other = (cMark)obj;

        int pageTest = other.pageNum.CompareTo(this.pageNum);
        if (pageTest != 0) {
            return pageTest;
        }

        // Without both rectangles there is no position to compare on.
        if (this.annotRect == null || other.annotRect == null) {
            return 0;
        }

        int rowTest = other.annotRect.Top.CompareTo(this.annotRect.Top);
        if (rowTest != 0) {
            return rowTest;
        }

        return other.annotRect.Left.CompareTo(this.annotRect.Left);
    }
}

然后解析注释。

// parsing annotation in document
    /// <summary>
    /// Collects annotation markers (notes and text-markup annotations) from a PDF.
    /// </summary>
    public static class Demo {

        /// <summary>
        /// Parses the annotations of every page of a PDF file.
        /// </summary>
        /// <remarks>
        /// Text and FreeText annotations that carry /Contents become markers whose
        /// title is that content. Underline, Highlight, StrikeOut and Squiggly
        /// annotations that carry /QuadPoints become markers whose title is the
        /// page text found inside the bounding box of the first quadrilateral;
        /// further quadrilaterals (e.g. multi-line highlights) are not read.
        /// </remarks>
        /// <param name="reader">Open reader on the document to scan.</param>
        /// <param name="markers">Target list; it is cleared, then filled.</param>
        static void parseAnnotations( PdfReader reader, List<cMark> markers) {

            markers.Clear();

            // PDF page numbers are 1-based.
            for (int pg = 1; pg <= reader.NumberOfPages; pg++) {

                PdfDictionary pagedic = reader.GetPageN( pg );
                PdfArray annotarray = pagedic.GetAsArray( PdfName.ANNOTS );
                if (annotarray == null || annotarray.Size == 0) {
                    continue;
                }

                // Entries may be indirect references or direct dictionaries;
                // GetPdfObject resolves both.
                foreach (PdfObject annotObj in annotarray.ArrayList) {

                    PdfDictionary annotationDic = PdfReader.GetPdfObject( annotObj ) as PdfDictionary;
                    if (annotationDic == null) {
                        continue;
                    }

                    PdfName subType = annotationDic.Get( PdfName.SUBTYPE ) as PdfName;
                    if (subType == null) {
                        continue;
                    }

                    // simple text: the annotation itself holds the string
                    PdfString contents = annotationDic.GetAsString( PdfName.CONTENTS );
                    if (contents != null &&
                        (subType.Equals( PdfName.TEXT ) || subType.Equals( PdfName.FREETEXT ))) {

                        cMark textMark = new cMark(cMark.TypeMarker.TypeAnnotation, cMark.TypeAnnotationSubType.TypeAnnotation_NONE);
                        textMark.pageNum = pg;
                        // ToUnicodeString also decodes UTF-16 encoded text strings.
                        textMark.title = contents.ToUnicodeString();

                        PdfRectangle noteRect = readRect( annotationDic.GetAsArray( PdfName.RECT ) );
                        if (noteRect != null) {
                            textMark.annotRect = noteRect.Rectangle;
                        }

                        markers.Add( textMark );
                    }

                    // decorated text: the string has to be read from the page content
                    cMark.TypeAnnotationSubType markupType = toMarkupType( subType );
                    if (markupType == cMark.TypeAnnotationSubType.TypeAnnotation_NONE) {
                        continue;
                    }

                    PdfRectangle quadRect = quadBounds( annotationDic.GetAsArray( PdfName.QUADPOINTS ) );
                    if (quadRect == null) {
                        continue;
                    }

                    cMark markupMark = new cMark(cMark.TypeMarker.TypeAnnotation, markupType);
                    markupMark.pageNum = pg;

                    // Extract only from the page that owns the annotation; the same
                    // rectangle on other pages covers unrelated text.
                    RenderFilter[] filter = { new RegionTextRenderFilter( quadRect.Rectangle ) };
                    ITextExtractionStrategy strategy =
                        new FilteredTextRenderListener( new LocationTextExtractionStrategy(), filter );
                    markupMark.title = PdfTextExtractor.GetTextFromPage( reader, pg, strategy );
                    markupMark.annotRect = quadRect.Rectangle;

                    markers.Add( markupMark );
                }
            }
        }

        // Maps a text-markup annotation subtype to its marker enum value;
        // returns TypeAnnotation_NONE for every other subtype.
        private static cMark.TypeAnnotationSubType toMarkupType( PdfName subType ) {
            if (subType.Equals( PdfName.UNDERLINE )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_UNDERLINE;
            }
            if (subType.Equals( PdfName.HIGHLIGHT )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_HIGHLIGHT;
            }
            if (subType.Equals( PdfName.STRIKEOUT )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_STRIKEOUT;
            }
            if (subType.Equals( PdfName.SQUIGGLY )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_SQUIGGLY;
            }
            return cMark.TypeAnnotationSubType.TypeAnnotation_NONE;
        }

        // Builds a rectangle from the first four numbers of a /Rect array;
        // returns null when the array is missing or too short.
        private static PdfRectangle readRect( PdfArray coord ) {
            if (coord == null || coord.Size < 4) {
                return null;
            }
            return new PdfRectangle(
                ((PdfNumber)coord[0]).FloatValue,
                ((PdfNumber)coord[1]).FloatValue,
                ((PdfNumber)coord[2]).FloatValue,
                ((PdfNumber)coord[3]).FloatValue);
        }

        // Bounding box of the first quadrilateral (first eight numbers, as four
        // x,y pairs) of a /QuadPoints array; returns null when the array is
        // missing or holds fewer than eight entries.
        private static PdfRectangle quadBounds( PdfArray quad ) {
            if (quad == null || quad.Size < 8) {
                return null;
            }

            float lowX = float.MaxValue;
            float lowY = float.MaxValue;
            float upX = float.MinValue;
            float upY = float.MinValue;

            for (int i = 0; i < 8; i += 2) {
                float x = ((PdfNumber)quad[i]).FloatValue;
                float y = ((PdfNumber)quad[i + 1]).FloatValue;
                lowX = Math.Min( lowX, x );
                upX = Math.Max( upX, x );
                lowY = Math.Min( lowY, y );
                upY = Math.Max( upY, y );
            }

            // float llx, float lly, float urx, float ury
            return new PdfRectangle( lowX, lowY, upX, upY );
        }
    }