我需要从PDF文档中删除文本。我正在使用Aspose，目前通过TextFragmentAbsorber来实现。
仅供参考,我不能使用任何其他第三方库。
以下是我正在使用的代码:
/// <summary>
/// Blanks out every text run matching "#START#...#END#" in a PDF and saves the
/// result as a new PDF in the temp directory.
/// </summary>
/// <param name="inputFilePath">
/// Path of the PDF to process. The file is deleted when this method exits,
/// whether processing succeeded or threw.
/// </param>
/// <returns>Path of the newly saved PDF (a GUID-named file under the temp path).</returns>
private string DeleteMachineReadableCode(string inputFilePath)
{
    var outputFilePath = Path.Combine(Path.GetTempPath(), string.Format(@"{0}.pdf", Guid.NewGuid()));
    try
    {
        // Dispose the document before the finally block runs, so its resources
        // (and any handle it may hold on the input file) are released before
        // the input file is deleted.
        using (Document pdfDocument = new Document(inputFilePath))
        {
            // Lazy match from "#START#" to the nearest "#END#", line breaks included.
            TextFragmentAbsorber textFragmentAbsorber = new TextFragmentAbsorber("#START#((.|\r\n)*?)#END#");
            // 'true' switches the search to regular-expression mode.
            textFragmentAbsorber.TextSearchOptions = new TextSearchOptions(true);

            // Run the search over all pages.
            pdfDocument.Pages.Accept(textFragmentAbsorber);

            foreach (TextFragment textFragment in textFragmentAbsorber.TextFragments)
            {
                // Clear the matched text, then restyle the fragment as
                // 1pt white-on-white Verdana.
                textFragment.Text = string.Empty;
                textFragment.TextState.Font = FontRepository.FindFont("Verdana");
                textFragment.TextState.FontSize = 1;
                textFragment.TextState.ForegroundColor = Aspose.Pdf.Color.FromRgb(System.Drawing.Color.White);
                textFragment.TextState.BackgroundColor = Aspose.Pdf.Color.FromRgb(System.Drawing.Color.White);
            }

            pdfDocument.Save(outputFilePath);
        }
    }
    finally
    {
        // The input file is always removed, even if processing failed.
        if (File.Exists(inputFilePath))
        {
            File.Delete(inputFilePath);
        }
    }
    return outputFilePath;
}
如果要删除的内容在一个页面上,我可以替换内容。
我的问题是：如果文本跨越多个页面，TextFragmentAbsorber 将无法用上述正则表达式模式（"#START#((.|\r\n)*?)#END#"）识别出该文本。
请建议是否可以在正则表达式上执行任何操作,或者Aspose中的某些设置可以解决我的问题。
答案 0 :(得分:1)
如前所述,由于体系结构的限制,我们无法承诺您之前报告的问题的解决方案。但是,我们修改了代码段以满足您的要求。
我们的思路是：先在文档的某一页上找到以“#START#”开头的文本，然后在后续页面上找到以“#END#”结尾的文本，同时处理位于这两页之间的各页上的所有文本片段（如果存在）。
/// <summary>
/// Blanks out text delimited by "#START#" and "#END#" in a PDF, including the case
/// where the delimited run starts on one page and ends on a later one, and saves
/// the result as a new PDF in the temp directory.
/// </summary>
/// <remarks>
/// The whole-document search is tried first. Only when it finds nothing does the
/// method fall back to a page-by-page search: text from "#START#" to the end of a
/// page, all text on any pages in between, and text from the start of a page up to
/// "#END#". Nothing is cleared in the fallback unless the "#END#" part is found.
/// </remarks>
/// <param name="inputFilePath">
/// Path of the PDF to process. The file is deleted when this method exits,
/// whether processing succeeded or threw.
/// </param>
/// <returns>Path of the newly saved PDF (a GUID-named file under the temp path).</returns>
private string DeleteMachineReadableCodeUpdated(string inputFilePath)
{
    string outputFilePath = Path.Combine(Path.GetTempPath(), string.Format(@"{0}.pdf", Guid.NewGuid()));
    try
    {
        // Dispose the document before the finally block runs, so its resources
        // (and any handle it may hold on the input file) are released before
        // the input file is deleted.
        using (Document pdfDocument = new Document(inputFilePath))
        {
            // Lazy match from "#START#" to the nearest "#END#", line breaks included.
            TextFragmentAbsorber absorber = new TextFragmentAbsorber("#START#((.|\r\n)*?)#END#");
            // 'true' switches the search to regular-expression mode.
            absorber.TextSearchOptions = new TextSearchOptions(true);

            // Run the search over all pages.
            pdfDocument.Pages.Accept(absorber);
            TextFragmentCollection textFragmentCollection = absorber.TextFragments;

            if (textFragmentCollection.Count > 0)
            {
                // The complete pattern was matched somewhere: clear those matches.
                RemoveTextFromFragmentCollection(textFragmentCollection);
            }
            else
            {
                // Nothing matched as a whole: look for the two halves on separate pages.
                string startingPattern = "#START#((.|\r\n)*?)\\z";
                string endingPattern = "\\A((.|\r\n)*?)#END#";
                bool isStartingPatternFound = false;
                ArrayList fragmentsToRemove = new ArrayList();

                foreach (Page page in pdfDocument.Pages)
                {
                    // Search for the start until it has been seen, then for the end.
                    absorber.Phrase = isStartingPatternFound ? endingPattern : startingPattern;
                    page.Accept(absorber);

                    if (absorber.TextFragments.Count > 0)
                    {
                        fragmentsToRemove.AddRange(absorber.TextFragments);
                        if (isStartingPatternFound)
                        {
                            // Both halves located: clear everything collected and stop;
                            // later pages are left untouched.
                            RemoveTextFromFragmentCollection(fragmentsToRemove);
                            break;
                        }

                        // Only the start has been found so far; keep scanning.
                        isStartingPatternFound = true;
                    }
                    else if (isStartingPatternFound)
                    {
                        // A page between start and end with neither marker:
                        // collect all of its text by searching with an empty phrase.
                        absorber.Phrase = String.Empty;
                        page.Accept(absorber);
                        fragmentsToRemove.AddRange(absorber.TextFragments);
                    }
                    // Pages before the start marker are skipped.
                }
            }

            pdfDocument.Save(outputFilePath);
        }
    }
    finally
    {
        // The input file is always removed, even if processing failed.
        if (File.Exists(inputFilePath))
        {
            File.Delete(inputFilePath);
        }
    }
    return outputFilePath;
}
/// <summary>
/// Sets the text of every fragment in the given collection to an empty string.
/// </summary>
/// <param name="fragmentCollection">
/// Collection whose elements are each cast to <c>TextFragment</c>.
/// </param>
private void RemoveTextFromFragmentCollection(ICollection fragmentCollection)
{
    foreach (object item in fragmentCollection)
    {
        // Each element is expected to be a TextFragment; the cast throws otherwise.
        TextFragment fragment = (TextFragment)item;
        fragment.Text = string.Empty;
    }
}
注意: