我正在运行PDFBox提供的示例,以获取每个TextPosition的宽度/高度。当我传入单页pdf时,它给出的结果是准确的。但如果我使用多页pdf,得到的高度就不正确。
这是我做的实验,我拿了5页pdf并作为参数传入(每个TextPosition的高度错误)。接下来,我使用MacOSX Preview将相同的pdf分成5个单页pdf,并逐个传递每个页面(我得到正确的高度)。
package printtextlocations;
import java.io.*;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;
import java.util.List;
/**
 * Prints the location and size of every {@link TextPosition} found on each
 * page of a PDF document.
 *
 * <p>The document path is taken from the first command-line argument; when no
 * argument is given the built-in default path is used.
 *
 * <p>NOTE(review): the reported height comes from {@code getHeightDir()}.
 * It appears to be a font-level estimate rather than a per-glyph measurement,
 * so fonts with an oversized bounding box may yield inflated heights — confirm
 * against the PDFBox version in use.
 */
public class PrintTextLocations extends PDFTextStripper {

    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH = "C:\\path\\to\\PDF.pdf";

    /**
     * Creates a stripper that sorts text by position.
     *
     * @throws IOException if the parent stripper cannot be initialised
     */
    public PrintTextLocations() throws IOException {
        super.setSortByPosition(true);
    }

    /**
     * Loads the PDF, decrypts it with an empty password if necessary, and
     * prints the text positions of every page.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to process
     * @throws Exception if the document cannot be loaded, decrypted or parsed
     */
    public static void main(String[] args) throws Exception {
        String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        boolean passwordProtected = false;
        PDDocument document = null;
        try {
            document = PDDocument.load(new File(pdfPath));
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    // Do not exit here: System.exit() would skip the finally
                    // block below and leave the document unclosed.
                    passwordProtected = true;
                }
            }
            if (!passwordProtected) {
                printAllPages(document);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
        if (passwordProtected) {
            // Same exit status as before, but only after the document is closed.
            System.exit(1);
        }
    }

    /**
     * Runs a fresh {@code PrintTextLocations} over every page of the document.
     *
     * @param document the opened (and, if needed, decrypted) document
     * @throws IOException if a page content stream cannot be processed
     */
    private static void printAllPages(PDDocument document) throws IOException {
        PrintTextLocations printer = new PrintTextLocations();
        List<?> allPages = document.getDocumentCatalog().getAllPages();
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            System.out.println("Processing page: " + i);
            PDStream contents = page.getContents();
            if (contents != null) {
                // Reuse the stream that was just null-checked instead of
                // fetching the page contents a second time.
                printer.processStream(page, page.findResources(), contents.getStream());
            }
        }
    }

    /**
     * Prints the coordinates, height, space width, width, vertical scale and
     * character of a single text position to standard output.
     *
     * @param text The text to be processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        System.out.println(" String [x: " + text.getXDirAdj() + ", y: "
                + text.getY() + ", height:" + text.getHeightDir()
                + ", space: " + text.getWidthOfSpace() + ", width: "
                + text.getWidthDirAdj() + ", yScale: " + text.getYScale() + "]"
                + text.getCharacter());
    }
}
输出摘要 - 5页pdf
String [x:58.500004,y:692.2,height:33.480003,space:2.64,width:6.635998,yScale:12.0] 6
String [x:58.6,y:741.2,height:33.480003,space:2.64,width:6.6360016,yScale:12.0] 1
String [x:58.6,y:753.4,height:33.480003,space:2.64,width:6.6360016,yScale:12.0] 2
输出片段 - 1页pdfs
String [x:58.5,y:692.2,height:5.55,space:2.64,width:6.6480026,yScale:12.0] 6
String [x:58.6,y:741.2,height:5.55,space:2.64,width:6.6480026,yScale:12.0] 1
String [x:58.6,y:753.4,height:5.55,space:2.64,width:6.6480026,yScale:12.0] 2
有谁知道为什么在这种情况下会得到不一致的结果?是不是我漏掉了什么设置?
感谢您的帮助。
这是另一个测试文件 wrong height pdf - 3 pages 在这里我得到的输出
String [x:90.0,y:83.28003,height:33.480003,space:5.8497605,width:7.248001,yScale:12.0] V
String [x:97.242,y:83.28003,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] e
String [x:103.095604,y:83.28003,height:33.480003,space:5.8497605,width:4.9680023,yScale:12.0] r
String [x:108.0588,y:83.28003,height:33.480003,space:5.8497605,width:6.0479965,yScale:12.0] y
String [x:116.748,y:83.28003,height:33.480003,space:5.8497605,width:5.9520035,yScale:12.0] S
String [x:122.7012,y:83.28003,height:33.480003,space:5.8497605,width:3.3359985,yScale:12.0] i
String [x:126.034805,y:83.28003,height:33.480003,space:5.8497605,width:9.983994,yScale:12.0] m
String [x:136.01881,y:83.28003,height:33.480003,space:5.8497605,width:6.671997,yScale:12.0] p
String [x:142.6932,y:83.28003,height:33.480003,space:5.8497605,width:3.251999,yScale:12.0] l
String [x:145.9512,y:83.28003,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] e
String [x:154.4472,y:83.28003,height:33.480003,space:5.8497605,width:7.9440002,yScale:12.0] D
String [x:162.38641,y:83.28003,height:33.480003,space:5.8497605,width:6.371994,yScale:12.0] o
String [x:168.75601,y:83.28003,height:33.480003,space:5.8497605,width:5.2920074,yScale:12.0] c
String [x:174.0468,y:83.28003,height:33.480003,space:5.8497605,width:6.624008,yScale:12.0] u
String [x:180.6732,y:83.28003,height:33.480003,space:5.8497605,width:9.983994,yScale:12.0] m
String [x:190.6572,y:83.28003,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] e
String [x:196.5108,y:83.28003,height:33.480003,space:5.8497605,width:6.695999,yScale:12.0] n
String [x:203.20801,y:83.28003,height:33.480003,space:5.8497605,width:4.0559998,yScale:12.0] t
完成处理第0页
完成添加第0页
String [x:90.0,y:139.44,height:33.480003,space:5.8497605,width:6.816002,yScale:12.0] P
String [x:96.8148,y:139.44,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] a
String [x:102.6696,y:139.44,height:33.480003,space:5.8497605,width:5.9280014,yScale:12.0] g
String [x:108.5964,y:139.44,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] e
String [x:117.090004,y:139.44,height:33.480003,space:5.8497605,width:6.6480026,yScale:12.0] 2
String [x:126.375595,y:139.44,height:33.480003,space:5.8497605,width:6.371994,yScale:12.0] o
String [x:132.7464,y:139.44,height:33.480003,space:5.8497605,width:3.6360016,yScale:12.0] f
String [x:139.0312,y:139.44,height:33.480003,space:5.8497605,width:9.983994,yScale:12.0] m
String [x:149.0152,y:139.44,height:33.480003,space:5.8497605,width:3.3359985,yScale:12.0] i
String [x:152.3488,y:139.44,height:33.480003,space:5.8497605,width:6.695999,yScale:12.0] n
String [x:159.046,y:139.44,height:33.480003,space:5.8497605,width:3.3359985,yScale:12.0] i
String [x:162.37961,y:139.44,height:33.480003,space:5.8497605,width:9.983994,yScale:12.0] m
String [x:172.3636,y:139.44,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] a
String [x:178.2232,y:139.44,height:33.480003,space:5.8497605,width:3.251999,yScale:12.0] l
String [x:181.4812,y:139.44,height:33.480003,space:5.8497605,width:3.3359985,yScale:12.0] i
String [x:184.8148,y:139.44,height:33.480003,space:5.8497605,width:5.1600037,yScale:12.0] s
String [x:189.9712,y:139.44,height:33.480003,space:5.8497605,width:9.983994,yScale:12.0] m
完成处理第1页
完成添加第1页
String [x:90.0,y:266.15997,height:33.480003,space:5.8497605,width:6.816002,yScale:12.0] P
String [x:96.8148,y:266.15997,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] a
String [x:102.6696,y:266.15997,height:33.480003,space:5.8497605,width:5.9280014,yScale:12.0] g
String [x:108.5964,y:266.15997,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] e
String [x:117.090004,y:266.15997,height:33.480003,space:5.8497605,width:6.6480026,yScale:12.0] 3
String [x:126.375595,y:266.15997,height:33.480003,space:5.8497605,width:6.371994,yScale:12.0] o
String [x:132.7464,y:266.15997,height:33.480003,space:5.8497605,width:7.548004,yScale:12.0] K
String [x:140.3052,y:266.15997,height:33.480003,space:5.8497605,width:5.856003,yScale:12.0] a
String [x:146.16,y:266.15997,height:33.480003,space:5.8497605,width:6.048004,yScale:12.0] y
String [x:152.2068,y:266.15997,height:33.480003,space:5.8497605,width:5.0639954,yScale:12.0] ?
完成处理第2页
完成添加第2页
答案 0 :(得分:3)
在确定解析出的字形的高度时(使用相关字体对象的 getFontHeight 方法),PDFBox首先检查它是否拥有当前字形的字体度量信息。它只认识Type 1字体的AFM度量;而您的字体是TrueType字体,因此PDFBox没有这样的度量信息。
在这种情况下,它继续尝试从字体描述符中检索常规字体度量。文档中字体的字体描述符如下所示:
21 0 obj <<
/Type /FontDescriptor
/FontName /GLDXOZ+Cambria
/Flags 4
/FontBBox [-1475 -2463 2867 3117]
/ItalicAngle 0
/Ascent 950
/Descent -222
/CapHeight 667
/StemV 0
/XHeight 467
/AvgWidth 615
/MaxWidth 2919
/FontFile2 24 0 R
>>
endobj
它检查的第一个描述符条目是字体边界框(/FontBBox 条目),如果该条目存在,则取其高度的一半作为平均字体高度。
在你的情况下,与字体中的字形相比,字体边界框非常大;垂直从-2463到3117 !!
另一方面,大写字母高度(/CapHeight 条目,即大写字母顶部相对于基线的垂直坐标)仅为667,上升高度(/Ascent,即此字体中字形在基线以上所达到的最大高度,不包括带重音字符的字形高度)仅为950。这真的让我想知道为什么该字体会有这样的字体边界框……
如果没有字体边界框,PDFBox接下来会尝试使用大写字母高度,然后是上升高度,最后是 /XHeight - /Descent。这些中的每一个都会产生合理的值,但是由于存在边界框,PDFBox采用了这个过大的值。
有问题的代码被注释为
// the following values are all more or less accurate
// at least all are average values. Maybe we'll find
// another way to get those value for every single glyph
// in the future if needed
虽然我不知道为什么PDFBox更倾向于根据边界框来猜测平均高度,而不是使用上升高度(Ascent)之类的值,但它并不是唯一认为您字体中的文字非常巨大的软件。例如,如果您使用Adobe Acrobat的文本修饰(TouchUp Text)工具,则会看到:
垂直条是光标!所以Acrobat也认为字体很大。
遗憾的是,您没有提供用MacOSX Preview从示例文档拆分出来的单页pdf。因此,我不知道为什么拆分之后您会得到更接近实际的值。但显然,Preview以某种方式更改了字体信息,因为高度值过大的原因与文档是多页还是单页无关。