我有几个PDF文件,使用Windows应用程序(C#),我需要找出PDF文件是否有重叠文本。我怎么能这样做,是否有任何免费的第三方DLL来实现这一目标?
我现在拥有的是第三方DLL,它可以从PDF获取文本/图像。
我的PDF文件和图片都很丰富。这里,一行文本打印在另一行的顶部,或者一些文本打印在一些图像的顶部。需要找到这种重叠。
正如您在图像中看到的那样,由于边界框重叠以及字形轮廓重叠,可能会发生重叠。因此需要找到PDF中的这两种情况。我的PDF不包含任何注释。因此,重叠仅发生在pdf的内容中。我们不会将穷人的大胆技术用于更胖的字形,如果发生这种情况,那么它应该被视为重叠。
PDF中不会有任何透明图像,只有我们可能拥有的图像是页面底部的徽标或数字签名,任何与此重叠的文本都应视为重叠。
不是从图像(扫描)创建PDF。从一些文本编辑器创建它。
答案 0 :(得分:1)
它可能就像上面的示例一样简单,或者您必须为此实现自己的阅读器。
如果您无法完全控制PDF文件,则无法解决问题。稍后可以转换定义的框。所以你必须解析整个文件,也要跟踪盒子的位置和形式。另外,一些框可能位于其他框的顶部,但在像素级别上没有任何碰撞渲染。
比你遇到下一个问题。每个PDF实现都有不同的错误。因此,您的系统可能会完美呈现文本,而不是客户的打印机。
欢迎来到地狱;)
每个支持人员会告诉您他们遵守标准。其他人必须实现他们的PDF库错误。因为您的客户数据会充满信心,所以您无法证明他们的错误。您可能会发现测试数据存在一些错误,但客户文档中的错误从未发生过相同的错误。
只要您还没有成为公司的PDF专家,就可以运行和隐藏。
这是一个肮脏的"一般"方法:在位图中不使用文本渲染文本。使用另一个位图中的文本呈现页面,将该区域与文本进行比较。但这需要单色背景。但负荷会非常高。但是这个文件看起来像一个表格。创建表单并填写表单框。所以你没有问题,你甚至会得到正确的结果,用另一个程序填写表格
答案 1 :(得分:1)
OP在评论中澄清:
由于边界框重叠以及字形轮廓重叠,可能会发生重叠。因此,需要找到PDF中的这两种情况。
每当字形轮廓本身重叠时,它们的边界框也会重叠。
因此,足以检查重叠的边界框。
我们可能只有图片底部的徽标或数字签名,任何与此重叠的文字都应视为重叠。
因此,对于文本重叠图像,我们无需检查图像中的空白区域是否重叠。
我的PDF文件没有任何注释。
因此,我们只需要检查页面内容(包括从页面内容引用的表单xobjects的内容,允许递归)。
此外,OP仅提及文字和图像。因此,我们可以忽略矢量图形。
由于我更喜欢Java,我首先在Java中创建了一个概念验证,然后将其移植到.Net。
对于Java和.Net,行动方式都是相同的:
事件监听器可能如下所示:
class OverlappingTextSearchingStrategy : IEventListener
{
static List<Vector> UNIT_SQUARE_CORNERS = new List<Vector> { new Vector(0, 0, 1), new Vector(1, 0, 1), new Vector(1, 1, 1), new Vector(0, 1, 1) };
ICollection<Rectangle> imageRectangles = new HashSet<Rectangle>();
ICollection<Rectangle> textRectangles = new HashSet<Rectangle>();
public void EventOccurred(IEventData data, EventType type)
{
if (data is ImageRenderInfo) {
ImageRenderInfo imageData = (ImageRenderInfo)data;
Matrix ctm = imageData.GetImageCtm();
List<Rectangle> cornerRectangles = new List<Rectangle>(UNIT_SQUARE_CORNERS.Count);
foreach (Vector unitCorner in UNIT_SQUARE_CORNERS)
{
Vector corner = unitCorner.Cross(ctm);
cornerRectangles.Add(new Rectangle(corner.Get(Vector.I1), corner.Get(Vector.I2), 0, 0));
}
Rectangle boundingBox = Rectangle.GetCommonRectangle(cornerRectangles.ToArray());
Console.WriteLine("Adding image bounding rectangle {0}.", boundingBox);
imageRectangles.Add(boundingBox);
} else if (data is TextRenderInfo) {
TextRenderInfo textData = (TextRenderInfo)data;
Rectangle ascentRectangle = textData.GetAscentLine().GetBoundingRectangle();
Rectangle descentRectangle = textData.GetDescentLine().GetBoundingRectangle();
Rectangle boundingBox = Rectangle.GetCommonRectangle(ascentRectangle, descentRectangle);
if (boundingBox.GetHeight() == 0 || boundingBox.GetWidth() == 0)
Console.WriteLine("Ignoring empty text bounding rectangle {0} for \"{1}\".", boundingBox, textData.GetText());
else
{
Console.WriteLine("Adding text bounding rectangle {0} for \"{1}\" with 0.5 margins.", boundingBox, textData.GetText());
textRectangles.Add(boundingBox.ApplyMargins<Rectangle>(0.5f, 0.5f, 0.5f, 0.5f, false));
}
} else if (data is PathRenderInfo) {
// TODO
} else if (data != null)
{
Console.WriteLine("Ignored {0} event, class {1}.", type, data.GetType().Name);
}
else
{
Console.WriteLine("Ignored {0} event with null data.", type);
}
}
public ICollection<EventType> GetSupportedEvents()
{
// Support all events
return null;
}
public bool foundOverlappingText()
{
bool result = false;
List<Rectangle> textRectangleList = new List<Rectangle>(textRectangles);
while (textRectangleList.Count > 0)
{
Rectangle testRectangle = textRectangleList[textRectangleList.Count - 1];
textRectangleList.RemoveAt(textRectangleList.Count - 1);
foreach (Rectangle rectangle in textRectangleList)
{
if (intersect(testRectangle, rectangle))
{
Console.WriteLine("Found text intersecting text with bounding boxes {0} at {1},{2} and {3} at {4},{5}.",
testRectangle, testRectangle.GetX(), testRectangle.GetY(), rectangle, rectangle.GetX(), rectangle.GetY());
result = true;// if only the fact counts, do instead: return true
}
}
foreach (Rectangle rectangle in imageRectangles)
{
if (intersect(testRectangle, rectangle))
{
Console.WriteLine("Found text intersecting image with bounding boxes {0} at {1},{2} and {3} at {4},{5}.",
testRectangle, testRectangle.GetX(), testRectangle.GetY(), rectangle, rectangle.GetX(), rectangle.GetY());
result = true;// if only the fact counts, do instead: return true
}
}
}
return result;
}
bool intersect(Rectangle a, Rectangle b)
{
return intersect(a.GetLeft(), a.GetRight(), b.GetLeft(), b.GetRight()) &&
intersect(a.GetBottom(), a.GetTop(), b.GetBottom(), b.GetTop());
}
bool intersect(float start1, float end1, float start2, float end2)
{
if (start1 < start2)
return start2 <= end1;
else
return start1 <= end2;
}
}
此事件侦听器可以像这样使用:
PdfReader reader = new PdfReader(pdf);
PdfDocument document = new PdfDocument(reader);
PdfDocumentContentParser contentParser = new PdfDocumentContentParser(document);
OverlappingTextSearchingStrategy strategy = contentParser.ProcessContent(page, new OverlappingTextSearchingStrategy());
bool foundOverlaps = strategy.foundOverlappingText();
事件监听器可能如下所示:
public class OverlappingTextSearchingStrategy implements IEventListener {
static List<Vector> UNIT_SQUARE_CORNERS = Arrays.asList(new Vector(0,0,1), new Vector(1,0,1), new Vector(1,1,1), new Vector(0,1,1));
Set<Rectangle> imageRectangles = new HashSet<>();
Set<Rectangle> textRectangles = new HashSet<>();
@Override
public void eventOccurred(IEventData data, EventType type) {
if (data instanceof ImageRenderInfo) {
ImageRenderInfo imageData = (ImageRenderInfo) data;
Matrix ctm = imageData.getImageCtm();
List<Rectangle> cornerRectangles = new ArrayList<>(UNIT_SQUARE_CORNERS.size());
for (Vector unitCorner : UNIT_SQUARE_CORNERS) {
Vector corner = unitCorner.cross(ctm);
cornerRectangles.add(new Rectangle(corner.get(Vector.I1), corner.get(Vector.I2), 0, 0));
}
Rectangle boundingBox = Rectangle.getCommonRectangle(cornerRectangles.toArray(new Rectangle[cornerRectangles.size()]));
logger.info(String.format("Adding image bounding rectangle %s.", boundingBox));
imageRectangles.add(boundingBox);
} else if (data instanceof TextRenderInfo) {
TextRenderInfo textData = (TextRenderInfo) data;
Rectangle ascentRectangle = textData.getAscentLine().getBoundingRectangle();
Rectangle descentRectangle = textData.getDescentLine().getBoundingRectangle();
Rectangle boundingBox = Rectangle.getCommonRectangle(ascentRectangle, descentRectangle);
if (boundingBox.getHeight() == 0 || boundingBox.getWidth() == 0)
logger.info(String.format("Ignoring empty text bounding rectangle %s for '%s'.", boundingBox, textData.getText()));
else {
logger.info(String.format("Adding text bounding rectangle %s for '%s' with 0.5 margins.", boundingBox, textData.getText()));
textRectangles.add(boundingBox.applyMargins(0.5f, 0.5f, 0.5f, 0.5f, false));
}
} else if (data instanceof PathRenderInfo) {
// TODO: vector graphics
} else if (data != null) {
logger.fine(String.format("Ignored %s event, class %s.", type, data.getClass().getSimpleName()));
} else {
logger.fine(String.format("Ignored %s event with null data.", type));
}
}
@Override
public Set<EventType> getSupportedEvents() {
// Support all events
return null;
}
public boolean foundOverlappingText() {
boolean result = false;
List<Rectangle> textRectangleList = new ArrayList<>(textRectangles);
while (!textRectangleList.isEmpty())
{
Rectangle testRectangle = textRectangleList.remove(textRectangleList.size() - 1);
for (Rectangle rectangle : textRectangleList) {
if (intersect(testRectangle, rectangle)) {
logger.info(String.format("Found text intersecting text with bounding boxes %s at %s,%s and %s at %s,%s.",
testRectangle, testRectangle.getX(), testRectangle.getY(), rectangle, rectangle.getX(), rectangle.getY()));
result = true;// if only the fact counts, do instead: return true
}
}
for (Rectangle rectangle : imageRectangles) {
if (intersect(testRectangle, rectangle)) {
logger.info(String.format("Found text intersecting image with bounding boxes %s at %s,%s and %s at %s,%s.",
testRectangle, testRectangle.getX(), testRectangle.getY(), rectangle, rectangle.getX(), rectangle.getY()));
result = true;// if only the fact counts, do instead: return true
}
}
}
return result;
}
boolean intersect(Rectangle a, Rectangle b) {
return intersect(a.getLeft(), a.getRight(), b.getLeft(), b.getRight()) &&
intersect(a.getBottom(), a.getTop(), b.getBottom(), b.getTop());
}
boolean intersect(float start1, float end1, float start2, float end2) {
if (start1 < start2)
return start2 <= end1;
else
return start1 <= end2;
}
Logger logger = Logger.getLogger(OverlappingTextSearchingStrategy.class.getName());
}
此事件侦听器可以像这样使用:
PdfReader reader = new PdfReader(pdf);
PdfDocument document = new PdfDocument(reader);
PdfDocumentContentParser contentParser = new PdfDocumentContentParser(document);
OverlappingTextSearchingStrategy strategy = contentParser.processContent(pageNumber, new OverlappingTextSearchingStrategy());
boolean foundOverlaps = strategy.foundOverlappingText();
正如您所看到的,我不会按原样存储文本边界框,而是
boundingBox.applyMargins(0.5f, 0.5f, 0.5f, 0.5f, false),
即。略小的盒子。这样做是为了防止误报,否则可能会出现紧密设置的文本或应用了字距调整的文本。您可能需要在此处微调边距值。
答案 2 :(得分:0)
您好我有一个不使用免费库的代码示例,但我认为其他库应该具有类似的功能,因此您可以将其用作想法: 在使用以下代码示例之前,请确保使用最新版本的Apitron PDF Kit。
using System;
using System.Collections.Generic;
using System.IO;
using Apitron.PDF.Kit.FixedLayout;
using Apitron.PDF.Kit.FixedLayout.Content;
using Apitron.PDF.Kit.FixedLayout.PageProperties;
using FixedLayout.Resources;
using FixedLayout.ContentElements;
/// <summary>
/// Gets all text boundaries.
/// </summary>
/// <param name="elements">The elements.</param>
/// <param name="boundaries">The boundaries.</param>
public void GetAllTextBoundaries(IContentElementsEnumerator elements, IList<Boundary> boundaries, Boundary offset)
{
// We dont count drawings and images here - only text;
if(elements == null)
{
return;
}
foreach (IContentElement element in elements)
{
TextContentElement text = element as TextContentElement;
if (text != null)
{
foreach (TextSegment segment in text.Segments)
{
Boundary currentBoundary = segment.Boundary;
if (offset != null)
{
currentBoundary = new Boundary(currentBoundary.Left + offset.Left, currentBoundary.Bottom + offset.Bottom, currentBoundary.Right + offset.Left, currentBoundary.Top + offset.Bottom);
}
boundaries.Add(currentBoundary);
}
}
else if (element is FormContentElement)
{
Boundary currentBoundary = (element as FormContentElement).Boundary;
if (offset != null)
{
currentBoundary = new Boundary(currentBoundary.Left + offset.Left, currentBoundary.Bottom + offset.Bottom, currentBoundary.Right + offset.Left, currentBoundary.Top + offset.Bottom);
}
this.GetAllTextBoundaries((element as FormContentElement).FormXObject.Elements, boundaries, currentBoundary);
}
}
}
/// <summary>
/// Checks if text is overlapped.
/// </summary>
/// <returns></returns>
public bool CheckIfTextIsOverlapped(string fileName)
{
const double overlapMax = 5;
using (System.IO.Stream stream = new FileStream(fileName, FileMode.Open, FileAccess.ReadWrite))
{
using (FixedDocument document = new FixedDocument(stream))
{
foreach (Page page in document.Pages)
{
IList<Boundary> boundaries = new List<Boundary>();
foreach (Annotation annotation in page.Annotations)
{
// Actually we need only Normal state, but will check all - to be sure.
if(annotation.Appearance.Normal != null)
{
this.GetAllTextBoundaries(annotation.Appearance.Normal.Elements, boundaries, annotation.Boundary);
}
}
IContentElementsEnumerator elements = page.Elements;
this.GetAllTextBoundaries(elements, boundaries, null);
for (int i = 0; i < boundaries.Count; i++)
{
for (int j = i + 1; j < boundaries.Count; j++)
{
Boundary b1 = boundaries[i];
Boundary b2 = boundaries[j];
double x1 = Math.Max(b1.Left, b2.Left);
double y1 = Math.Max(b1.Bottom, b2.Bottom);
double x2 = Math.Min(b1.Right, b2.Right);
double y2 = Math.Min(b1.Top, b2.Top);
// So we have intersection
if (x1 < x2 && y1 < y2)
{
if (x1 - x2 >= overlapMax || y1 - y2 >= overlapMax)
{
return true;
}
}
}
}
}
}
}
return false;
}