Question

我在尝试对PDF（文本可搜索）中的某个区域执行清理（redaction）时遇到异常。该异常只在部分PDF上出现。我注意到出问题的PDF的区别在于：其中的文本可能略有倾斜。

我先创建一些PDF注释，将其保存到文件，然后执行清理过程。对于大多数PDF，这种做法都能完美运行。

（iTextSharp 版本：5.5.13.0）

提前致谢。

   at System.Drawing.Image.FromStream(Stream stream, Boolean useEmbeddedColorManagement, Boolean validateImageData)
   at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpRenderListener.ProcessImage(Byte[] imageBytes, IList`1 areasToBeCleaned)
   at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpRenderListener.RenderImage(ImageRenderInfo renderInfo)
   at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.ImageXObjectDoHandler.HandleXObject(PdfContentStreamProcessor processor, PdfStream xobjectStream, PdfIndirectReference refi, ICollection markedContentInfoStack)
   at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.DisplayXObject(PdfName xobjectName)
   at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.Do.Invoke(PdfContentStreamProcessor processor, PdfLiteral oper, List`1 operands)
   at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpContentOperator.Invoke(PdfContentStreamProcessor pdfContentStreamProcessor, PdfLiteral oper, List`1 operands)
   at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.InvokeOperator(PdfLiteral oper, List`1 operands)
   at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.ProcessContent(Byte[] contentBytes, PdfDictionary resources)
   at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpProcessor.CleanUpPage(Int32 pageNum, IList`1 cleanUpLocations)
   at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpProcessor.CleanUp()
   at AU.PDF.Redaction.PdfProcessor.Redact(String inputFilePath, String outputFilePath, IEnumerable`1 options) in C:\Users\Depen585\Documents\POC\Redaction\AU.PDF.Redaction\PdfProcessor.cs:line 123
   at Poc.Program.Main(String[] args) in C:\Users\Depen585\Documents\POC\Redaction\Poc\Program.cs:line 91

更新

Bruno，按照你的方法，我现在遇到了下面这个错误（对某些PDF有效，对另一些PDF则失败）：

   at iText.PdfCleanup.PdfCleanUpFilter.FilterImage(ImageRenderInfo image, IList`1 imageAreasToBeCleaned)
   at iText.PdfCleanup.PdfCleanUpFilter.FilterImage(FilteredImageKey imageKey)
   at iText.PdfCleanup.PdfCleanUpProcessor.CheckIfImageAndClean(IList`1 operands)
   at iText.PdfCleanup.PdfCleanUpProcessor.FilterContent(String operator, IList`1 operands)
   at iText.PdfCleanup.PdfCleanUpProcessor.InvokeOperator(PdfLiteral operator, IList`1 operands)
   at iText.Kernel.Pdf.Canvas.Parser.PdfCanvasProcessor.ProcessContent(Byte[] contentBytes, PdfResources resources)
   at iText.PdfCleanup.PdfCleanUpProcessor.ProcessContent(Byte[] contentBytes, PdfResources resources)
   at iText.Kernel.Pdf.Canvas.Parser.PdfCanvasProcessor.ProcessPageContent(PdfPage page)
   at iText.PdfCleanup.PdfCleanUpProcessor.ProcessPageContent(PdfPage page)
   at iText.PdfCleanup.PdfCleanUpTool.CleanUpPage(Int32 pageNumber, IList`1 cleanUpLocations)
   at iText.PdfCleanup.PdfCleanUpTool.CleanUp()
   at AU.PDF.PdfProcessor.Redact(String inputFilePath, String outputFilePath, String textToRedact) in C:\Users\Depen585\Documents\POC\Redaction\AU.PDF\PdfProcessor.cs:line 38
   at Poc.Program.Main(String[] args) in C:\Users\Depen585\Documents\POC\Redaction\Poc\Program.cs:line 91

以下是代码：

/// <summary>
/// Proof-of-concept redaction helper built on <c>PdfCleanUpTool</c>.
/// </summary>
public class PdfProcessor
{
    /// <summary>
    /// Loads the license file found at <paramref name="licenseKeyFilePath"/>.
    /// </summary>
    /// <param name="licenseKeyFilePath">Path of the license key file.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="licenseKeyFilePath"/> is null or empty.
    /// </exception>
    public void Initialize(string licenseKeyFilePath)
    {
        RequireValue(licenseKeyFilePath, nameof(licenseKeyFilePath));

        LicenseKey.LoadLicenseFile(licenseKeyFilePath);
    }

    /// <summary>
    /// Opens <paramref name="inputFilePath"/>, cleans one hard-coded rectangle and
    /// writes the result to <paramref name="outputFilePath"/>.
    /// </summary>
    /// <param name="inputFilePath">Path of the PDF to read.</param>
    /// <param name="outputFilePath">Path the processed PDF is written to.</param>
    /// <param name="textToRedact">
    /// Validated but not otherwise used: the cleaned area is a fixed rectangle,
    /// not something derived from this text.
    /// </param>
    /// <exception cref="ArgumentNullException">Any argument is null or empty.</exception>
    public void Redact(string inputFilePath, string outputFilePath, string textToRedact)
    {
        RequireValue(inputFilePath, nameof(inputFilePath));
        RequireValue(outputFilePath, nameof(outputFilePath));
        RequireValue(textToRedact, nameof(textToRedact));

        using (var reader = new PdfReader(inputFilePath))
        using (var writer = new PdfWriter(outputFilePath))
        using (var pdf = new PdfDocument(reader, writer))
        {
            // Fixed clean-up target, filled with black.
            // NOTE(review): first argument is presumably the page number - confirm against the pdfSweep API.
            var area = new Rectangle(97, 405, 383, 40);
            var target = new PdfCleanUpLocation(1, area, iText.Kernel.Colors.ColorConstants.BLACK);
            var cleanUpLocations = new List<PdfCleanUpLocation> { target };

            var cleaner = new PdfCleanUpTool(pdf, cleanUpLocations);
            cleaner.CleanUp();
        }
    }

    // Throws ArgumentNullException naming the parameter when the value is null or empty.
    private static void RequireValue(string value, string parameterName)
    {
        if (string.IsNullOrEmpty(value))
        {
            throw new ArgumentNullException(parameterName);
        }
    }
}

更新2：

即使使用自定义策略（继承自 RegexBasedLocationExtractionStrategy），它也会在同样的那些PDF上失败。见下文：

/// <summary>
/// Regex-based location extraction strategy that also implements
/// <see cref="ICleanupStrategy"/>, always reporting black as the redaction colour.
/// </summary>
public class RegexLocationExtractionStrategy : RegexBasedLocationExtractionStrategy, ICleanupStrategy
{
    // Pattern text retained so Reset() can build a fresh strategy from the same regex.
    private readonly string _pattern;

    /// <summary>
    /// Creates a strategy for the given regular expression.
    /// </summary>
    /// <param name="regex">Regular expression passed on to the base strategy.</param>
    /// <exception cref="ArgumentNullException"><paramref name="regex"/> is null.</exception>
    public RegexLocationExtractionStrategy(string regex) : base(regex)
    {
        if (regex == null)
        {
            throw new ArgumentNullException(nameof(regex));
        }

        _pattern = regex;
    }

    /// <summary>
    /// Returns the fill colour for a matched location; the location itself is ignored.
    /// </summary>
    public Color GetRedactionColor(IPdfTextLocation location) => ColorConstants.BLACK;

    /// <summary>
    /// Returns a new strategy instance constructed from the same regex string.
    /// </summary>
    public ICleanupStrategy Reset() => new RegexLocationExtractionStrategy(_pattern);
}

/// <summary>
/// Proof-of-concept redaction helper that delegates to <c>PdfAutoSweep</c>
/// with a regex-based strategy.
/// </summary>
public class PdfProcessor
{
    /// <summary>
    /// Loads the license file found at <paramref name="licenseKeyFilePath"/>.
    /// </summary>
    /// <param name="licenseKeyFilePath">Path of the license key file.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="licenseKeyFilePath"/> is null or empty.
    /// </exception>
    public void Initialize(string licenseKeyFilePath)
    {
        RequireValue(licenseKeyFilePath, nameof(licenseKeyFilePath));

        LicenseKey.LoadLicenseFile(licenseKeyFilePath);
    }

    /// <summary>
    /// Opens <paramref name="inputFilePath"/>, runs an auto-sweep clean-up driven by
    /// <paramref name="regex"/>, and writes the result to <paramref name="outputFilePath"/>.
    /// </summary>
    /// <param name="inputFilePath">Path of the PDF to read.</param>
    /// <param name="outputFilePath">Path the processed PDF is written to.</param>
    /// <param name="regex">Regular expression handed to the extraction strategy.</param>
    /// <exception cref="ArgumentNullException">Any argument is null or empty.</exception>
    public void Redact(string inputFilePath, string outputFilePath, string regex)
    {
        RequireValue(inputFilePath, nameof(inputFilePath));
        RequireValue(outputFilePath, nameof(outputFilePath));
        RequireValue(regex, nameof(regex));

        using (var reader = new PdfReader(inputFilePath))
        using (var writer = new PdfWriter(outputFilePath))
        using (var pdf = new PdfDocument(reader, writer))
        {
            var sweeper = new PdfAutoSweep(new RegexLocationExtractionStrategy(regex));
            sweeper.CleanUp(pdf);
        }
    }

    // Throws ArgumentNullException naming the parameter when the value is null or empty.
    private static void RequireValue(string value, string parameterName)
    {
        if (string.IsNullOrEmpty(value))
        {
            throw new ArgumentNullException(parameterName);
        }
    }
}

更新3：

当我使用暂定清理（TentativeCleanUp）时，可以看到生成的密文（redaction）注释。然后我把得到的清理位置传给执行密文处理的方法（Redact），结果同样失败。见下文。

/// <summary>
/// Opens <paramref name="inputFilePath"/>, runs a tentative clean-up for
/// <paramref name="regex"/> while writing to <paramref name="outputFilePath"/>,
/// and returns the clean-up locations reported by the sweeper.
/// </summary>
/// <param name="inputFilePath">Path of the PDF to read.</param>
/// <param name="outputFilePath">Path the processed PDF is written to.</param>
/// <param name="regex">Regular expression handed to the extraction strategy.</param>
/// <returns>The locations returned by <c>GetPdfCleanUpLocations</c> for the document.</returns>
/// <exception cref="ArgumentNullException">Any argument is null or empty.</exception>
public IEnumerable<PdfCleanUpLocation> Annotate(string inputFilePath, string outputFilePath, string regex)
{
    if (string.IsNullOrEmpty(inputFilePath))
    {
        throw new ArgumentNullException(nameof(inputFilePath));
    }

    if (string.IsNullOrEmpty(outputFilePath))
    {
        throw new ArgumentNullException(nameof(outputFilePath));
    }

    if (string.IsNullOrEmpty(regex))
    {
        throw new ArgumentNullException(nameof(regex));
    }

    using (var reader = new PdfReader(inputFilePath))
    using (var writer = new PdfWriter(outputFilePath))
    using (var pdf = new PdfDocument(reader, writer))
    {
        var sweeper = new PdfAutoSweep(new RegexLocationExtractionStrategy(regex));

        // NOTE(review): presumably this only marks the matches (redaction annotations)
        // without removing content - confirm against the pdfSweep documentation.
        sweeper.TentativeCleanUp(pdf);

        var discovered = sweeper.GetPdfCleanUpLocations(pdf);
        return discovered;
    }
}


/// <summary>
/// Opens <paramref name="inputFilePath"/>, registers every entry of
/// <paramref name="locations"/> with a <c>PdfCleanUpTool</c>, runs the clean-up,
/// and writes the result to <paramref name="outputFilePath"/>.
/// </summary>
/// <param name="inputFilePath">Path of the PDF to read.</param>
/// <param name="outputFilePath">Path the processed PDF is written to.</param>
/// <param name="locations">Clean-up locations to apply.</param>
/// <exception cref="ArgumentNullException">
/// A path is null or empty, or <paramref name="locations"/> is null.
/// </exception>
public void Redact(string inputFilePath, string outputFilePath, IEnumerable<PdfCleanUpLocation> locations)
{
    if (string.IsNullOrEmpty(inputFilePath))
    {
        throw new ArgumentNullException(nameof(inputFilePath));
    }

    if (string.IsNullOrEmpty(outputFilePath))
    {
        throw new ArgumentNullException(nameof(outputFilePath));
    }

    if (locations == null)
    {
        throw new ArgumentNullException(nameof(locations));
    }

    using (var reader = new PdfReader(inputFilePath))
    using (var writer = new PdfWriter(outputFilePath))
    using (var pdf = new PdfDocument(reader, writer))
    {
        var cleaner = new PdfCleanUpTool(pdf);

        // Materialise the sequence first, then register each location in order.
        List<PdfCleanUpLocation> snapshot = locations.ToList();
        foreach (PdfCleanUpLocation location in snapshot)
        {
            cleaner.AddCleanupLocation(location);
        }

        cleaner.CleanUp();
    }
}

清理过程抛出异常（iTextSharp）

0 个答案: