我对PDF有一个棘手的要求
我需要在pdf中搜索特定的字符串 - 属性编号:
每次找到此内容时,我都需要在上面添加分页符
我可以访问IText和Spire.PDF,我首先看IText
根据此处的其他帖子,我已确定需要使用PdfStamper
下面的逻辑可以成功添加一个新页面
但是,就我而言,我只需要分页符,而不是空白页
// Copies the source PDF to the destination file while inserting a blank A4 page
// in front of every existing page and one more after the last page.
var newFile = @"c:\temp\full.pdf";
var dest = @"c:\temp\dest.pdf";
var reader = new PdfReader(newFile);
try
{
    // FileMode.CreateNew fails on an existing file, so remove a stale result first.
    if (File.Exists(dest))
    {
        File.Delete(dest);
    }
    // The using block releases the file handle even when creating the stamper fails.
    using (var output = new FileStream(dest, FileMode.CreateNew))
    {
        var stamper = new PdfStamper(reader, output);
        try
        {
            // Start one past the last page (appends a page at the end) and walk
            // backwards, so an insertion never shifts a position that is still
            // to be processed.
            var total = reader.NumberOfPages + 1;
            for (var pageNumber = total; pageNumber > 0; pageNumber--)
            {
                stamper.InsertPage(pageNumber, PageSize.A4);
            }
        }
        finally
        {
            stamper.Close();
        }
    }
}
finally
{
    reader.Close();
}
下图显示了一个示例,因此这实际上是3页,即现有页面,在第一次出现的属性编号上方插入了新的分页符:
第二次出现时需要另一个分页符
答案 0 :(得分:1)
本回答分享了使用iText和Java在PDF中查找所有特定文本并在上面插入分页符的概念验证。将它移植到iTextSharp和C#应该不会太困难。
此外,对于生产用途,必须添加一些额外的代码,因为当前代码做出一些假设,例如假定非旋转页面。此外,它根本不处理注释。
该任务实际上是两个任务的组合:查找文本和插入分页符,因此我们需要一个提取自定义文本位置的策略,以及一个在给定位置拆分页面的工具。
为了提取自定义文本的位置,我们扩展iText的LocationTextExtractionStrategy,
使其还能提取自定义文本字符串(实际上是正则表达式的匹配项)的位置:
/**
 * A {@link LocationTextExtractionStrategy} that additionally reports where on the page the
 * matches of a regular expression are drawn.
 *
 * <p>Text is collected character by character (see {@link #renderText(TextRenderInfo)}) and every
 * character keeps its ascent and descent line, so the bounding box of a match can be assembled
 * from the boxes of its characters. Matching is done line by line; a match never spans lines.
 *
 * <p>Several members of the base class that are needed here are not accessible, so they are
 * reached via reflection.
 */
public class SearchTextLocationExtractionStrategy extends LocationTextExtractionStrategy {
    /**
     * Creates a strategy that locates the matches of the given pattern.
     *
     * @param pattern the regular expression whose matches are to be located
     */
    public SearchTextLocationExtractionStrategy(Pattern pattern) {
        super(new TextChunkLocationStrategy() {
            public TextChunkLocation createLocation(TextRenderInfo renderInfo, LineSegment baseline) {
                // The baseline handed in here may have been adjusted by the base class, but the
                // ascent and descent lines are taken unmodified from the render info: we want
                // the actual positions.
                return new AscentDescentTextChunkLocation(baseline, renderInfo.getAscentLine(),
                        renderInfo.getDescentLine(), renderInfo.getSingleSpaceWidth());
            }
        });
        this.pattern = pattern;
    }

    // Reflection handles for inaccessible members of the base class and of TextChunk.
    static Field locationalResultField = null;
    static Method filterTextChunksMethod = null;
    static Method startsWithSpaceMethod = null;
    static Method endsWithSpaceMethod = null;
    static Field textChunkTextField = null;
    static Method textChunkSameLineMethod = null;

    static {
        try {
            locationalResultField = LocationTextExtractionStrategy.class.getDeclaredField("locationalResult");
            locationalResultField.setAccessible(true);
            filterTextChunksMethod = LocationTextExtractionStrategy.class.getDeclaredMethod("filterTextChunks",
                    List.class, TextChunkFilter.class);
            filterTextChunksMethod.setAccessible(true);
            startsWithSpaceMethod = LocationTextExtractionStrategy.class.getDeclaredMethod("startsWithSpace",
                    String.class);
            startsWithSpaceMethod.setAccessible(true);
            endsWithSpaceMethod = LocationTextExtractionStrategy.class.getDeclaredMethod("endsWithSpace",
                    String.class);
            endsWithSpaceMethod.setAccessible(true);
            textChunkTextField = TextChunk.class.getDeclaredField("text");
            textChunkTextField.setAccessible(true);
            textChunkSameLineMethod = TextChunk.class.getDeclaredMethod("sameLine", TextChunk.class);
            textChunkSameLineMethod.setAccessible(true);
        } catch (NoSuchFieldException | SecurityException | NoSuchMethodException e) {
            // Without these handles the class cannot work at all; fail at class loading with the
            // real cause instead of leaving null handles behind.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Returns the areas covered by all matches of the pattern in the text collected so far.
     *
     * @param chunkFilter filter passed on to the base class's chunk filtering; the caller in this
     *                    file passes {@code null}
     * @return one rectangle per match, in the order the matches were found
     * @throws IllegalStateException if a reflective access to the base class fails
     */
    public Collection<TextRectangle> getLocations(TextChunkFilter chunkFilter) {
        Collection<TextRectangle> result = new ArrayList<>();
        try {
            @SuppressWarnings("unchecked") // filterTextChunks is invoked with a List<TextChunk>
            List<TextChunk> filteredTextChunks = (List<TextChunk>) filterTextChunksMethod.invoke(this,
                    locationalResultField.get(this), chunkFilter);
            Collections.sort(filteredTextChunks);

            // Text of the current line and, index by index, the location of each of its characters.
            StringBuilder sb = new StringBuilder();
            List<AscentDescentTextChunkLocation> locations = new ArrayList<>();
            TextChunk lastChunk = null;
            String lastChunkText = null;
            for (TextChunk chunk : filteredTextChunks) {
                String chunkText = (String) textChunkTextField.get(chunk);
                if (lastChunk == null) {
                    // First chunk: there is no previous chunk to compare with.
                } else if ((boolean) textChunkSameLineMethod.invoke(chunk, lastChunk)) {
                    // Insert a blank only if the previous string did not end with a space and
                    // the current string does not start with one.
                    if (isChunkAtWordBoundary(chunk, lastChunk)
                            && !((boolean) startsWithSpaceMethod.invoke(this, chunkText))
                            && !((boolean) endsWithSpaceMethod.invoke(this, lastChunkText))) {
                        sb.append(' ');
                        LineSegment spaceBaseLine = new LineSegment(lastChunk.getEndLocation(),
                                chunk.getStartLocation());
                        locations.add(new AscentDescentTextChunkLocation(spaceBaseLine, spaceBaseLine,
                                spaceBaseLine, chunk.getCharSpaceWidth()));
                    }
                } else {
                    // A new line starts: search the line that has just been completed.
                    collectMatches(sb, locations, result);
                    sb.setLength(0);
                    locations.clear();
                }
                sb.append(chunkText);
                // One location entry per character keeps matcher indices and locations aligned
                // even when a chunk's text is empty or longer than one character.
                locations.addAll(Collections.nCopies(chunkText.length(),
                        (AscentDescentTextChunkLocation) chunk.getLocation()));
                lastChunk = chunk;
                lastChunkText = chunkText;
            }
            // The final line is not followed by a line change, so search it explicitly.
            collectMatches(sb, locations, result);
        } catch (IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
            throw new IllegalStateException("Reflective access to the text extraction strategy failed", e);
        }
        return result;
    }

    /**
     * Adds one rectangle per pattern match found in a single line of text.
     *
     * @param lineText  the text of the line
     * @param locations the location of every character of {@code lineText}, index by index
     * @param result    the collection the rectangles are added to
     */
    private void collectMatches(CharSequence lineText, List<AscentDescentTextChunkLocation> locations,
            Collection<TextRectangle> result) {
        assert lineText.length() == locations.size();
        Matcher matcher = pattern.matcher(lineText);
        while (matcher.find()) {
            int i = matcher.start();
            if (i >= locations.size()) {
                // An empty match at the very end of the line has no character to take a position from.
                continue;
            }
            Vector baseStart = locations.get(i).getStartLocation();
            TextRectangle textRectangle = new TextRectangle(matcher.group(), baseStart.get(Vector.I1),
                    baseStart.get(Vector.I2));
            // Grow the rectangle to cover ascent and descent of every matched character.
            for (; i < matcher.end(); i++) {
                AscentDescentTextChunkLocation location = locations.get(i);
                textRectangle.add(location.getAscentLine().getBoundingRectange());
                textRectangle.add(location.getDescentLine().getBoundingRectange());
            }
            result.add(textRectangle);
        }
    }

    /**
     * Feeds the text to the base class one character at a time so that every character gets a
     * location of its own.
     */
    @Override
    public void renderText(TextRenderInfo renderInfo) {
        for (TextRenderInfo info : renderInfo.getCharacterRenderInfos()) {
            super.renderText(info);
        }
    }

    /** A text chunk location that also remembers the ascent and descent line of the text. */
    public static class AscentDescentTextChunkLocation extends TextChunkLocationDefaultImp {
        public AscentDescentTextChunkLocation(LineSegment baseLine, LineSegment ascentLine,
                LineSegment descentLine, float charSpaceWidth) {
            super(baseLine.getStartPoint(), baseLine.getEndPoint(), charSpaceWidth);
            this.ascentLine = ascentLine;
            this.descentLine = descentLine;
        }

        public LineSegment getAscentLine() {
            return ascentLine;
        }

        public LineSegment getDescentLine() {
            return descentLine;
        }

        final LineSegment ascentLine;
        final LineSegment descentLine;
    }

    /** The area covered by one match, together with the matched text. */
    public class TextRectangle extends Rectangle2D.Float {
        /**
         * Creates an empty rectangle at the given start point.
         *
         * @param text   the matched text
         * @param xStart x coordinate of the start point
         * @param yStart y coordinate of the start point
         */
        public TextRectangle(final String text, final float xStart, final float yStart) {
            super(xStart, yStart, 0, 0);
            this.text = text;
        }

        public String getText() {
            return text;
        }

        final String text;
    }

    final Pattern pattern;
}
(SearchTextLocationExtractionStrategy.java)
由于基类的某些必要成员是私有的或包私有的,我们必须使用反射来提取它们。
此工具的页面拆分功能提取自this answer中的PdfVeryDenseMergeTool
。此外,为了允许自定义分页位置,该类被设计为抽象类。
/**
 * Copies the pages of PDF documents into a new document, cutting every source page into
 * horizontal slices and putting each slice onto a target page of its own.
 *
 * <p>Where a page is cut is decided by {@link #determineSplitPositions(PdfReader, int)}.
 * The clip area and the placement assume that the source page starts at x = 0.
 */
public abstract class AbstractPdfPageSplittingTool {
    /**
     * @param size the size of the pages of the target document
     * @param top  the top margin of the target document, also used to position every slice
     *             that follows a page break inserted by this tool
     */
    public AbstractPdfPageSplittingTool(Rectangle size, float top) {
        this.pageSize = size;
        this.topMargin = top;
    }

    /**
     * Splits all pages of the given documents and writes the result to the output stream.
     *
     * @param outputStream the stream the target document is written to
     * @param inputs       the source documents, processed in the given order
     * @throws DocumentException if the target document cannot be set up
     * @throws IOException       if a source page cannot be processed
     */
    public void split(OutputStream outputStream, PdfReader... inputs) throws DocumentException, IOException {
        try {
            openDocument(outputStream);
            for (PdfReader reader : inputs) {
                split(reader);
            }
        } finally {
            closeDocument();
        }
    }

    /** Creates and opens the target document and remembers it together with its writer. */
    void openDocument(OutputStream outputStream) throws DocumentException {
        final Document document = new Document(pageSize, 36, 36, topMargin, 36);
        final PdfWriter writer = PdfWriter.getInstance(document, outputStream);
        document.open();
        this.document = document;
        this.writer = writer;
        newPage();
    }

    /** Closes the target document, if one was opened, and resets the state of this tool. */
    void closeDocument() {
        try {
            // document is still null when openDocument failed; closing unconditionally would
            // replace the original exception by a NullPointerException.
            if (document != null) {
                document.close();
            }
        } finally {
            this.document = null;
            this.writer = null;
            this.yPosition = 0;
        }
    }

    /** Requests a new target page and moves the insertion position below the top margin. */
    void newPage() {
        document.newPage();
        yPosition = pageSize.getTop(topMargin);
    }

    /** Splits every page of the given source document. */
    void split(PdfReader reader) throws IOException {
        for (int page = 1; page <= reader.getNumberOfPages(); page++) {
            split(reader, page);
        }
    }

    /**
     * Splits a single source page. Each slice between two consecutive split positions is
     * drawn, clipped to its height, and followed by a new page request. Source pages with
     * fewer than two positions are skipped entirely.
     *
     * @param reader the source document
     * @param page   the 1-based number of the page to split
     */
    void split(PdfReader reader, int page) throws IOException {
        PdfImportedPage importedPage = writer.getImportedPage(reader, page);
        PdfContentByte directContent = writer.getDirectContent();
        // The first slice of a source page starts at the very top of the target page;
        // later slices start at the position set by newPage().
        yPosition = pageSize.getTop();

        Rectangle pageSizeToImport = reader.getPageSize(page);
        float[] borderPositions = determineSplitPositions(reader, page);
        if (borderPositions == null || borderPositions.length < 2) {
            return;
        }

        for (int borderIndex = 0; borderIndex + 1 < borderPositions.length; borderIndex++) {
            float height = borderPositions[borderIndex] - borderPositions[borderIndex + 1];
            if (height <= 0) {
                // Duplicate or unordered positions: nothing to draw for this pair.
                continue;
            }

            directContent.saveState();
            // Show only the slice itself, not the rest of the imported page.
            directContent.rectangle(0, yPosition - height, pageSizeToImport.getWidth(), height);
            directContent.clip();
            directContent.newPath();

            // Shift the imported page so that the upper border of the slice lands on yPosition.
            directContent.addTemplate(importedPage, 0,
                    yPosition - (borderPositions[borderIndex] - pageSizeToImport.getBottom()));

            directContent.restoreState();
            newPage();
        }
    }

    /**
     * Determines the y coordinates at which the given page is to be cut.
     *
     * @param reader the source document
     * @param page   the 1-based page number
     * @return the positions in descending order; each pair of consecutive positions delimits
     *         one slice, so the array is expected to include the outer borders of the area
     *         to copy. {@code null} or fewer than two entries makes the tool skip the page.
     */
    protected abstract float[] determineSplitPositions(PdfReader reader, int page);

    Document document = null;
    PdfWriter writer = null;
    float yPosition = 0;

    final Rectangle pageSize;
    final float topMargin;
}
(AbstractPdfPageSplittingTool.java)
实现OP的任务:
我需要在pdf中搜索特定的字符串 - 属性编号:
每次找到此内容时,我都需要在上面添加分页符
可以像下面这样使用上面的类:
// Splits every page of SOURCE above each occurrence of "Property Number" and writes the
// result to RESULT.
AbstractPdfPageSplittingTool tool = new AbstractPdfPageSplittingTool(PageSize.A4, 36) {
    // Compiled once for the whole run instead of once per page.
    private final Pattern searchPattern = Pattern.compile("Property Number");

    /**
     * Returns the top and bottom of the page plus the upper edge of every match of the
     * search pattern, sorted from top to bottom.
     */
    @Override
    protected float[] determineSplitPositions(PdfReader reader, int page) {
        Collection<TextRectangle> locations;
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            SearchTextLocationExtractionStrategy strategy =
                    new SearchTextLocationExtractionStrategy(searchPattern);
            // Only the chunk locations are needed, so the resultant text is not built.
            parser.processContent(page, strategy, Collections.emptyMap());
            locations = strategy.getLocations(null);
        } catch (IOException e) {
            // Continuing would silently emit the page unsplit; report the failure instead.
            throw new java.io.UncheckedIOException("Failed to parse the content of page " + page, e);
        }

        List<Float> borders = new ArrayList<>();
        for (TextRectangle rectangle : locations) {
            // The upper edge of the match: the page break goes above the text.
            borders.add((float) rectangle.getMaxY());
        }

        Rectangle pageSize = reader.getPageSize(page);
        borders.add(pageSize.getTop());
        borders.add(pageSize.getBottom());
        Collections.sort(borders, Collections.reverseOrder());

        float[] result = new float[borders.size()];
        for (int i = 0; i < result.length; i++) {
            result[i] = borders.get(i);
        }
        return result;
    }
};

// Open the source first so that an unreadable source does not leave an empty result file
// behind, and release both the reader and the output file in every case.
PdfReader sourceReader = new PdfReader(SOURCE);
try (FileOutputStream resultStream = new FileOutputStream(RESULT)) {
    tool.split(resultStream, sourceReader);
} finally {
    sourceReader.close();
}
(SplitPages.java的测试方法testSplitDocumentAboveAngestellter)