Question

我需要从PDF文档中删除文本。我正在使用Aspose.PDF，目前使用的是TextFragmentAbsorber。

仅供参考，我不能使用任何其他第三方库。

以下是我正在使用的代码：

/// <summary>
/// Blanks out every text block delimited by "#START#" and "#END#" in a PDF and writes the
/// result to a new file in the temp directory.
/// </summary>
/// <param name="inputFilePath">
/// Path of the PDF to process. The file is deleted when processing finishes, whether or not
/// it succeeded.
/// </param>
/// <returns>Path of the newly written PDF.</returns>
/// <remarks>
/// Known limitation: a block that starts on one page and ends on a later page is not matched
/// by this search, so it is left untouched.
/// </remarks>
private string DeleteMachineReadableCode(string inputFilePath)
{
    var outputFilePath = Path.Combine(Path.GetTempPath(), string.Format(@"{0}.pdf", Guid.NewGuid()));

    try
    {
        // Dispose the document before the finally block runs, so that whatever it holds on
        // the input file is released before that file is deleted.
        using (Document pdfDocument = new Document(inputFilePath))
        {
            // Non-greedy match of everything between the two markers, line breaks included.
            TextFragmentAbsorber textFragmentAbsorber = new TextFragmentAbsorber("#START#((.|\r\n)*?)#END#");

            // 'true' tells the absorber to treat the phrase as a regular expression.
            textFragmentAbsorber.TextSearchOptions = new TextSearchOptions(true);

            // Run the search over all pages.
            pdfDocument.Pages.Accept(textFragmentAbsorber);

            foreach (TextFragment textFragment in textFragmentAbsorber.TextFragments)
            {
                // Remove the matched text.
                textFragment.Text = string.Empty;

                // Shrink the fragment and paint it white on white
                // (presumably so any leftover rendering is invisible; verify if still needed).
                textFragment.TextState.Font = FontRepository.FindFont("Verdana");
                textFragment.TextState.FontSize = 1;
                textFragment.TextState.ForegroundColor = Aspose.Pdf.Color.FromRgb(System.Drawing.Color.White);
                textFragment.TextState.BackgroundColor = Aspose.Pdf.Color.FromRgb(System.Drawing.Color.White);
            }

            pdfDocument.Save(outputFilePath);
        }
    }
    finally
    {
        // The input is treated as a temporary file: remove it on success and on failure.
        if (File.Exists(inputFilePath))
        {
            File.Delete(inputFilePath);
        }
    }

    return outputFilePath;
}

如果要删除的内容在一个页面上，我可以替换内容。我的问题是，如果文本跨越多个页面，TextFragmentAbsorber将无法识别带有上述正则表达式模式的文本（“#START#((.|\r\n)*?)#END#”）。

请建议是否可以在正则表达式上执行任何操作，或者Aspose中的某些设置可以解决我的问题。

Answer 1

如前所述，由于体系结构的限制，我们无法承诺您之前报告的问题的解决方案。但是，我们修改了代码段以满足您的要求。

我们的思路是：先在文档的某一页上找到以“#START#”开头的文本，然后在后续页面上找到以“#END#”结尾的文本；如果这两页之间还有其他页面，则同时处理这些页面上的所有文本片段。

    /// <summary>
    /// Blanks out the text block delimited by "#START#" and "#END#" in a PDF, including the
    /// case where the block starts on one page and ends on a later page, and writes the
    /// result to a new file in the temp directory.
    /// </summary>
    /// <param name="inputFilePath">
    /// Path of the PDF to process. The file is deleted when processing finishes, whether or not
    /// it succeeded.
    /// </param>
    /// <returns>Path of the newly written PDF.</returns>
    /// <remarks>
    /// The page-by-page fallback handles a single block only. If the start marker is found but
    /// no end marker follows, nothing is removed.
    /// </remarks>
    private string DeleteMachineReadableCodeUpdated(string inputFilePath)
    {
        string outputFilePath = Path.Combine(Path.GetTempPath(), string.Format(@"{0}.pdf", Guid.NewGuid()));

        try
        {
            // Dispose the document before the finally block runs, so that whatever it holds on
            // the input file is released before that file is deleted.
            using (Document pdfDocument = new Document(inputFilePath))
            {
                // Non-greedy match of everything between the two markers, line breaks included.
                TextFragmentAbsorber absorber = new TextFragmentAbsorber("#START#((.|\r\n)*?)#END#");

                // 'true' tells the absorber to treat the phrase as a regular expression.
                absorber.TextSearchOptions = new TextSearchOptions(true);

                // First attempt: look for the complete pattern across all pages.
                pdfDocument.Pages.Accept(absorber);
                TextFragmentCollection textFragmentCollection = absorber.TextFragments;

                if (textFragmentCollection.Count > 0)
                {
                    RemoveTextFromFragmentCollection(textFragmentCollection);
                }
                else
                {
                    // Nothing matched as a whole: search page by page for the two halves.
                    // startingPattern: from the start marker to the end of the searched text.
                    // endingPattern: from the beginning of the searched text to the end marker.
                    string startingPattern = "#START#((.|\r\n)*?)\\z";
                    string endingPattern = "\\A((.|\r\n)*?)#END#";
                    bool isStartingPatternFound = false;
                    ArrayList fragmentsToRemove = new ArrayList();

                    foreach (Page page in pdfDocument.Pages)
                    {
                        // Look for the start marker until it is found, then for the end marker.
                        absorber.Phrase = !isStartingPatternFound ? startingPattern : endingPattern;
                        page.Accept(absorber);

                        if (absorber.TextFragments.Count > 0)
                        {
                            fragmentsToRemove.AddRange(absorber.TextFragments);

                            if (isStartingPatternFound)
                            {
                                // Both halves collected: blank everything gathered so far.
                                // Later pages need no processing, so stop here.
                                RemoveTextFromFragmentCollection(fragmentsToRemove);
                                break;
                            }

                            // Only the start marker has been found so far; keep scanning.
                            isStartingPatternFound = true;
                        }
                        else if (isStartingPatternFound)
                        {
                            // Intermediate page between start and end: search with an empty
                            // phrase and collect whatever fragments that returns for this page.
                            absorber.Phrase = String.Empty;
                            page.Accept(absorber);
                            fragmentsToRemove.AddRange(absorber.TextFragments);
                        }
                        // Otherwise the start marker has not been seen yet: skip the page.
                    }
                }

                pdfDocument.Save(outputFilePath);
            }
        }
        finally
        {
            // The input is treated as a temporary file: remove it on success and on failure.
            if (File.Exists(inputFilePath))
            {
                File.Delete(inputFilePath);
            }
        }

        return outputFilePath;
    }

/// <summary>
/// Blanks out every text fragment in the given collection by assigning an empty string to
/// its <c>Text</c> property.
/// </summary>
/// <param name="fragmentCollection">
/// Collection to process; each item is cast to <c>TextFragment</c>.
/// </param>
private void RemoveTextFromFragmentCollection(ICollection fragmentCollection)
{
    foreach (object item in fragmentCollection)
    {
        // The collection is non-generic, so each item is cast explicitly.
        TextFragment fragment = (TextFragment)item;
        fragment.Text = string.Empty;
    }
}

注意：

此代码假定文档中只有一个以“#START#”开头并以“#END#”结尾的文本块。不过，可以很容易地修改上述代码以处理多个文本块。
您也可以不处理中间页面上的文本，而是先记录这些页面的页码，然后在保存文档之前使用pdfDocument.Pages.Delete(pageNumber)将其删除。如果不希望出现“空白”页面，这样做可以避免。

使用Aspose.PDF库从PDF文档中删除文本？

1 个答案: