我在尝试清理PDF的一部分(文本可搜索)时遇到异常。它只发生在一些PDF上。我注意到差异是文本可能有点偏斜。
我创建了一些PDF注释,将其保存到文件,然后执行清理过程。它在大多数PDF上都运行良好。
(itextsharp:5.5.13.0)
提前致谢。
at System.Drawing.Image.FromStream(Stream stream, Boolean useEmbeddedColorManagement, Boolean validateImageData)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpRenderListener.ProcessImage(Byte[] imageBytes, IList`1 areasToBeCleaned)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpRenderListener.RenderImage(ImageRenderInfo renderInfo)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.ImageXObjectDoHandler.HandleXObject(PdfContentStreamProcessor processor, PdfStream xobjectStream, PdfIndirectReference refi, ICollection markedContentInfoStack)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.DisplayXObject(PdfName xobjectName)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.Do.Invoke(PdfContentStreamProcessor processor, PdfLiteral oper, List`1 operands)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpContentOperator.Invoke(PdfContentStreamProcessor pdfContentStreamProcessor, PdfLiteral oper, List`1 operands)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.InvokeOperator(PdfLiteral oper, List`1 operands)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.ProcessContent(Byte[] contentBytes, PdfDictionary resources)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpProcessor.CleanUpPage(Int32 pageNum, IList`1 cleanUpLocations)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpProcessor.CleanUp()
at AU.PDF.Redaction.PdfProcessor.Redact(String inputFilePath, String outputFilePath, IEnumerable`1 options) in C:\Users\Depen585\Documents\POC\Redaction\AU.PDF.Redaction\PdfProcessor.cs:line 123
at Poc.Program.Main(String[] args) in C:\Users\Depen585\Documents\POC\Redaction\Poc\Program.cs:line 91
更新
布鲁诺(Bruno),使用你的方法后,我现在得到下面这个错误(对某些PDF有效,对另一些PDF则失败):
at iText.PdfCleanup.PdfCleanUpFilter.FilterImage(ImageRenderInfo image, IList`1 imageAreasToBeCleaned)
at iText.PdfCleanup.PdfCleanUpFilter.FilterImage(FilteredImageKey imageKey)
at iText.PdfCleanup.PdfCleanUpProcessor.CheckIfImageAndClean(IList`1 operands)
at iText.PdfCleanup.PdfCleanUpProcessor.FilterContent(String operator, IList`1 operands)
at iText.PdfCleanup.PdfCleanUpProcessor.InvokeOperator(PdfLiteral operator, IList`1 operands)
at iText.Kernel.Pdf.Canvas.Parser.PdfCanvasProcessor.ProcessContent(Byte[] contentBytes, PdfResources resources)
at iText.PdfCleanup.PdfCleanUpProcessor.ProcessContent(Byte[] contentBytes, PdfResources resources)
at iText.Kernel.Pdf.Canvas.Parser.PdfCanvasProcessor.ProcessPageContent(PdfPage page)
at iText.PdfCleanup.PdfCleanUpProcessor.ProcessPageContent(PdfPage page)
at iText.PdfCleanup.PdfCleanUpTool.CleanUpPage(Int32 pageNumber, IList`1 cleanUpLocations)
at iText.PdfCleanup.PdfCleanUpTool.CleanUp()
at AU.PDF.PdfProcessor.Redact(String inputFilePath, String outputFilePath, String textToRedact) in C:\Users\Depen585\Documents\POC\Redaction\AU.PDF\PdfProcessor.cs:line 38
at Poc.Program.Main(String[] args) in C:\Users\Depen585\Documents\POC\Redaction\Poc\Program.cs:line 91
以下是代码:
/// <summary>
/// Thin wrapper around the iText clean-up tool that redacts one hard-coded
/// rectangle of a PDF and writes the result to a new file.
/// </summary>
public class PdfProcessor
{
    /// <summary>
    /// Loads the iText license key file.
    /// </summary>
    /// <param name="licenseKeyFilePath">Path of the license key file.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="licenseKeyFilePath"/> is null or empty.
    /// </exception>
    public void Initialize(string licenseKeyFilePath)
    {
        if (string.IsNullOrEmpty(licenseKeyFilePath))
        {
            throw new ArgumentNullException(nameof(licenseKeyFilePath));
        }

        LicenseKey.LoadLicenseFile(licenseKeyFilePath);
    }

    /// <summary>
    /// Opens <paramref name="inputFilePath"/>, runs the clean-up tool over a single
    /// fixed location and writes the document to <paramref name="outputFilePath"/>.
    /// </summary>
    /// <param name="inputFilePath">Path of the PDF to read.</param>
    /// <param name="outputFilePath">Path of the PDF to write.</param>
    /// <param name="textToRedact">
    /// Validated for null/empty but not otherwise used by the active code; it is
    /// only referenced by the disabled auto-sweep variant kept in comments below.
    /// </param>
    /// <exception cref="ArgumentNullException">Any argument is null or empty.</exception>
    public void Redact(string inputFilePath, string outputFilePath, string textToRedact)
    {
        if (string.IsNullOrEmpty(inputFilePath))
        {
            throw new ArgumentNullException(nameof(inputFilePath));
        }

        if (string.IsNullOrEmpty(outputFilePath))
        {
            throw new ArgumentNullException(nameof(outputFilePath));
        }

        if (string.IsNullOrEmpty(textToRedact))
        {
            throw new ArgumentNullException(nameof(textToRedact));
        }

        // Disabled variant: pattern-based location extraction fed into PdfAutoSweep.
        //CompositeLocationExtractionStrategy strategy = new CompositeLocationExtractionStrategy();
        //strategy.add(new PatternLocationExtractionStrategy(textToRedact).setRedactionColor(Color.PINK));
        //PdfAutoSweep autoSweep = new PdfAutoSweep(composite);
        //autoSweep.CleanUp(pdf);
        using (var reader = new PdfReader(inputFilePath))
        {
            using (var writer = new PdfWriter(outputFilePath))
            {
                using (var pdf = new PdfDocument(reader, writer))
                {
                    // Single hard-coded clean-up location, painted black.
                    var fixedLocation = new PdfCleanUpLocation(
                        1,
                        new Rectangle(97, 405, 383, 40),
                        iText.Kernel.Colors.ColorConstants.BLACK);

                    var cleanUpLocations = new List<PdfCleanUpLocation>();
                    cleanUpLocations.Add(fixedLocation);

                    var cleanUpTool = new PdfCleanUpTool(pdf, cleanUpLocations);
                    cleanUpTool.CleanUp();
                }
            }
        }
    }
}
更新2:
即使使用自定义策略(RegexBasedLocationExtractionStrategy),它也会在相同的PDF上失败。见下文......
/// <summary>
/// Regex-based location extraction strategy that can also be used as a clean-up
/// strategy; every match is redacted in black.
/// </summary>
public class RegexLocationExtractionStrategy : RegexBasedLocationExtractionStrategy, ICleanupStrategy
{
    // Kept so that Reset() can build a fresh strategy for the same expression.
    private readonly string _regex;

    /// <summary>
    /// Creates a strategy that locates text matching <paramref name="regex"/>.
    /// </summary>
    /// <param name="regex">Regular expression to search for.</param>
    /// <exception cref="ArgumentNullException"><paramref name="regex"/> is null.</exception>
    public RegexLocationExtractionStrategy(string regex)
        // The base constructor runs before this constructor's body, so the null
        // check has to happen here to take effect before the base class sees the value.
        : base(regex ?? throw new ArgumentNullException(nameof(regex)))
    {
        _regex = regex;
    }

    /// <summary>
    /// Returns the color used to paint over a located text area.
    /// </summary>
    /// <param name="location">The located text; not inspected, the color is fixed.</param>
    /// <returns>Always <c>ColorConstants.BLACK</c>.</returns>
    public Color GetRedactionColor(IPdfTextLocation location)
    {
        return ColorConstants.BLACK;
    }

    /// <summary>
    /// Produces a new, independent strategy instance for the same regular expression.
    /// </summary>
    /// <returns>A fresh <see cref="RegexLocationExtractionStrategy"/>.</returns>
    public ICleanupStrategy Reset()
    {
        return new RegexLocationExtractionStrategy(_regex);
    }
}
/// <summary>
/// Redacts every match of a regular expression in a PDF by running
/// <c>PdfAutoSweep</c> with a <c>RegexLocationExtractionStrategy</c>.
/// </summary>
public class PdfProcessor
{
    /// <summary>
    /// Loads the iText license key file.
    /// </summary>
    /// <param name="licenseKeyFilePath">Path of the license key file.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="licenseKeyFilePath"/> is null or empty.
    /// </exception>
    public void Initialize(string licenseKeyFilePath)
    {
        if (string.IsNullOrEmpty(licenseKeyFilePath))
        {
            throw new ArgumentNullException(nameof(licenseKeyFilePath));
        }

        LicenseKey.LoadLicenseFile(licenseKeyFilePath);
    }

    /// <summary>
    /// Opens <paramref name="inputFilePath"/>, runs an auto-sweep clean-up driven by
    /// <paramref name="regex"/> and writes the document to <paramref name="outputFilePath"/>.
    /// </summary>
    /// <param name="inputFilePath">Path of the PDF to read.</param>
    /// <param name="outputFilePath">Path of the PDF to write.</param>
    /// <param name="regex">Regular expression handed to the extraction strategy.</param>
    /// <exception cref="ArgumentNullException">Any argument is null or empty.</exception>
    public void Redact(string inputFilePath, string outputFilePath, string regex)
    {
        if (string.IsNullOrEmpty(inputFilePath))
        {
            throw new ArgumentNullException(nameof(inputFilePath));
        }

        if (string.IsNullOrEmpty(outputFilePath))
        {
            throw new ArgumentNullException(nameof(outputFilePath));
        }

        if (string.IsNullOrEmpty(regex))
        {
            throw new ArgumentNullException(nameof(regex));
        }

        using (var reader = new PdfReader(inputFilePath))
        {
            using (var writer = new PdfWriter(outputFilePath))
            {
                using (var pdf = new PdfDocument(reader, writer))
                {
                    var sweeper = new PdfAutoSweep(new RegexLocationExtractionStrategy(regex));
                    sweeper.CleanUp(pdf);
                }
            }
        }
    }
}
更新3:
当我使用暂定清理(TentativeCleanUp)时,我可以看到编辑(redaction)注释。然后我将清理位置传递给编辑(Redact)例程,结果全部失败。见下文。
/// <summary>
/// Opens <paramref name="inputFilePath"/>, runs the auto-sweep's tentative clean-up
/// for <paramref name="regex"/>, and returns the clean-up locations the sweep
/// reports for the document. The document is written to <paramref name="outputFilePath"/>.
/// </summary>
/// <param name="inputFilePath">Path of the PDF to read.</param>
/// <param name="outputFilePath">Path of the PDF to write.</param>
/// <param name="regex">Regular expression handed to the extraction strategy.</param>
/// <returns>The locations obtained from <c>GetPdfCleanUpLocations</c>.</returns>
/// <exception cref="ArgumentNullException">Any argument is null or empty.</exception>
public IEnumerable<PdfCleanUpLocation> Annotate(string inputFilePath, string outputFilePath, string regex)
{
    if (string.IsNullOrEmpty(inputFilePath))
    {
        throw new ArgumentNullException(nameof(inputFilePath));
    }

    if (string.IsNullOrEmpty(outputFilePath))
    {
        throw new ArgumentNullException(nameof(outputFilePath));
    }

    if (string.IsNullOrEmpty(regex))
    {
        throw new ArgumentNullException(nameof(regex));
    }

    using (PdfReader reader = new PdfReader(inputFilePath))
    {
        using (PdfWriter writer = new PdfWriter(outputFilePath))
        {
            using (PdfDocument pdf = new PdfDocument(reader, writer))
            {
                PdfAutoSweep sweeper = new PdfAutoSweep(new RegexLocationExtractionStrategy(regex));
                sweeper.TentativeCleanUp(pdf);

                // Locations are collected while the document is still open.
                return sweeper.GetPdfCleanUpLocations(pdf);
            }
        }
    }
}
/// <summary>
/// Opens <paramref name="inputFilePath"/>, registers every supplied clean-up
/// location with a <c>PdfCleanUpTool</c>, runs the clean-up and writes the
/// document to <paramref name="outputFilePath"/>.
/// </summary>
/// <param name="inputFilePath">Path of the PDF to read.</param>
/// <param name="outputFilePath">Path of the PDF to write.</param>
/// <param name="locations">Clean-up locations to apply.</param>
/// <exception cref="ArgumentNullException">
/// A path is null or empty, or <paramref name="locations"/> is null.
/// </exception>
public void Redact(string inputFilePath, string outputFilePath, IEnumerable<PdfCleanUpLocation> locations)
{
    if (string.IsNullOrEmpty(inputFilePath))
    {
        throw new ArgumentNullException(nameof(inputFilePath));
    }

    if (string.IsNullOrEmpty(outputFilePath))
    {
        throw new ArgumentNullException(nameof(outputFilePath));
    }

    if (locations == null)
    {
        throw new ArgumentNullException(nameof(locations));
    }

    using (var reader = new PdfReader(inputFilePath))
    {
        using (var writer = new PdfWriter(outputFilePath))
        {
            using (var pdf = new PdfDocument(reader, writer))
            {
                var tool = new PdfCleanUpTool(pdf);

                // Snapshot the sequence first so it is fully enumerated before
                // any location is registered with the tool.
                List<PdfCleanUpLocation> snapshot = locations.ToList();
                foreach (PdfCleanUpLocation entry in snapshot)
                {
                    tool.AddCleanupLocation(entry);
                }

                tool.CleanUp();
            }
        }
    }
}