我在PDF中对一部分文本做了注释(标注),如下所示。我已经成功找到了该注释,但如何才能取得它所对应的纯文本?该注释包含以下字段:
- Keys Count = 12 Dictionary<PdfName,PdfObject>.KeyCollection
+ [0] {/C} PdfName
+ [1] {/F} PdfName
+ [2] {/M} PdfName
+ [3] {/P} PdfName
+ [4] {/T} PdfName
+ [5] {/AP} PdfName
+ [6] {/NM} PdfName
+ [7] {/Rect} PdfName
+ [8] {/Subj} PdfName
+ [9] {/Subtype} PdfName
+ [10] {/QuadPoints} PdfName
+ [11] {/CreationDate} PdfName
我尝试搜索'/NM'
值与'reader.GetNamedDestinationFromNames()'
或'reader.GetNamedDestinationFromStrings()'
之间的对应关系,但这两个字典都是空的。
答案 0 :(得分:0)
感谢大家的帮助;)
这是(略显繁琐但有效的)答案。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using iTextSharp.text.pdf;
// PdfTextExtractor
using iTextSharp.text.pdf.parser;
namespace PdfParsingiTextSharp {
然后是标记集合的代码示例......
/*
* marker element, in order to build a collection
*/
public class cMark : IComparable {
public enum TypeMarker{
TypeSignet,
TypeAnnotation
};
public enum TypeAnnotationSubType{
TypeAnnotation_NONE,
TypeAnnotation_UNDERLINE,
TypeAnnotation_HIGHLIGHT,
TypeAnnotation_STRIKEOUT,
TypeAnnotation_SQUIGGLY
};
public TypeMarker eType;
public TypeAnnotationSubType eAnnotationSubType;
// level of signet
public int signetLevel;
// page in document
public int pageNum;
// indirect reference of page
public int pageRef;
// text of signet or annotation
public String title;
// area rectangle of annotation
public iTextSharp.text.Rectangle annotRect;
public cMark( TypeMarker p_eType, TypeAnnotationSubType p_TypeAnnotationSubType) {
eType = p_eType;
eAnnotationSubType = p_TypeAnnotationSubType;
signetLevel = -1;
pageNum = -1;
pageRef = -1;
title = "";
annotRect = null;
}
/**
* compare first on page, then on row, and finaly on column
*/
public int CompareTo( object obj ) {
cMarker compareObj = (cMarker)obj;
int pageTest = compareObj.pageNum.CompareTo(this.pageNum);
if (pageTest != 0) {
return pageTest;
}
else {
if (annotRect == null) {
return 0;
}
else {
int rowTest = compareObj.annotRect.Top.CompareTo( this.annotRect.Top);
if (rowTest != 0) {
return rowTest;
}
else {
return compareObj.annotRect.Left.CompareTo(this.annotRect.Left);
}
}
}
}
}
然后解析注释。
// parsing annotation in document
public static class Demo {
/* Parse PDf file annotations
*/
static void parseAnnotations( PdfReader reader, List<cMark> markers) {
markers.Clear();
// on each page
for(int pg = 1; pg < reader.NumberOfPages+1; pg++) {
PdfDictionary pagedic = reader.GetPageN( pg );
// get annotations array
PdfArray annotarray = (PdfArray)PdfReader.GetPdfObject( pagedic.Get( PdfName.ANNOTS ) );
// if no annotation ...
if (annotarray == null || annotarray.Size == 0) {
continue;
}
// on each annotation reference...
foreach(PdfIndirectReference annot in annotarray.ArrayList) {
PdfDictionary annotationDic = (PdfDictionary)PdfReader.GetPdfObject( annot );
PdfName subType = (PdfName)annotationDic.Get( PdfName.SUBTYPE );
PdfString contents = annotationDic.GetAsString( PdfName.CONTENTS );
// if simple text...
if ( (contents != null) &&
( (subType.Equals( PdfName.TEXT )) ||
(subType.Equals( PdfName.FREETEXT ))
)
) {
String value = contents.ToString();
// single marker element
cMark mrk = new cMark(cMark.TypeMarker.TypeAnnotation, cMark.TypeAnnotationSubType.TypeAnnotation_NONE);
mrk.pageNum = pg;
mrk.title = value;
if (annotationDic.Get( PdfName.RECT ) != null) {
PdfArray coord = annotationDic.GetAsArray( PdfName.RECT );
PdfRectangle textRect = new PdfRectangle(
((PdfNumber)coord[0]).FloatValue,
((PdfNumber)coord[1]).FloatValue,
((PdfNumber)coord[2]).FloatValue,
((PdfNumber)coord[3]).FloatValue);
mrk.annotRect = textRect.Rectangle;
}
markers.Add( mrk);
}
// if decorated text...
if ( (subType.Equals( PdfName.UNDERLINE )) ||
(subType.Equals( PdfName.HIGHLIGHT )) ||
(subType.Equals( PdfName.STRIKEOUT )) ||
(subType.Equals( PdfName.SQUIGGLY )) ) {
cMark mrk = new cMark(cMark.TypeMarker.TypeAnnotation, cMark.TypeAnnotationSubType.TypeAnnotation_NONE);
mrk.pageNum = pg;
if (subType.Equals( PdfName.UNDERLINE )) {
mrk.eAnnotationSubType = cMark.TypeAnnotationSubType.TypeAnnotation_UNDERLINE;
}
else if (subType.Equals( PdfName.HIGHLIGHT )) {
mrk.eAnnotationSubType = cMark.TypeAnnotationSubType.TypeAnnotation_HIGHLIGHT;
}
else if (subType.Equals( PdfName.STRIKEOUT )) {
mrk.eAnnotationSubType = cMark.TypeAnnotationSubType.TypeAnnotation_STRIKEOUT;
}
else if (subType.Equals( PdfName.SQUIGGLY )) {
mrk.eAnnotationSubType = cMark.TypeAnnotationSubType.TypeAnnotation_SQUIGGLY;
}
PdfObject pdfObjectQuad = annotationDic.Get( PdfName.QUADPOINTS );
if (pdfObjectQuad != null) {
PdfArray rect = annotationDic.GetAsArray( PdfName.QUADPOINTS );
// float llx, float lly, float urx, float ury
float lowX = Math.Min( ((PdfNumber)rect[0]).FloatValue, ((PdfNumber)rect[2]).FloatValue);
lowX = Math.Min( lowX, ((PdfNumber)rect[4]).FloatValue);
lowX = Math.Min( lowX, ((PdfNumber)rect[6]).FloatValue);
float lowY = Math.Min( ((PdfNumber)rect[1]).FloatValue, ((PdfNumber)rect[3]).FloatValue);
lowY = Math.Min( lowY, ((PdfNumber)rect[5]).FloatValue);
lowY = Math.Min( lowY, ((PdfNumber)rect[7]).FloatValue);
float upX = Math.Max( ((PdfNumber)rect[0]).FloatValue, ((PdfNumber)rect[2]).FloatValue);
upX = Math.Max( upX, ((PdfNumber)rect[4]).FloatValue);
upX = Math.Max( upX, ((PdfNumber)rect[6]).FloatValue);
float upY = Math.Max( ((PdfNumber)rect[1]).FloatValue, ((PdfNumber)rect[3]).FloatValue);
upY = Math.Max( upY, ((PdfNumber)rect[5]).FloatValue);
upY = Math.Max( upY, ((PdfNumber)rect[7]).FloatValue);
PdfRectangle textRect = new PdfRectangle( lowX, lowY, upX, upY);
RenderFilter[] filter = { new RegionTextRenderFilter(textRect.Rectangle) };
ITextExtractionStrategy strategy;
StringBuilder sb = new StringBuilder();
for (int i = 1; i <= reader.NumberOfPages; i++) {
strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
sb.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i, strategy));
}
String result = sb.ToString();
mrk.title = result;
mrk.annotRect = textRect.Rectangle;
markers.Add( mrk);
}
}
}
}
}
}