我是 Java 和 PDFBox 的新手。我想在 PDF 文档中查找一个单词,把它的字体属性(字体系列、XScale、字体大小等)保存到变量中,然后用这些已保存的值查找具有相同字体属性的其他单词。我该如何让 processTextPosition() 显示字符串 "References"?
这是我的代码:
package pdfinjava;
import java.io.*;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Text stripper that prints position and font details for every glyph whose
 * font descriptor reports the ForceBold flag.
 *
 * <p>{@link #main(String[])} first extracts the plain text of the document
 * with a stock {@code PDFTextStripper}, prints the first occurrence of the
 * word "References" if there is one, and then feeds every page's content
 * stream through an instance of this class so that
 * {@link #processTextPosition(TextPosition)} is invoked per glyph.
 *
 * <p>NOTE(review): only fonts whose descriptor has ForceBold set are reported.
 * If the heading being searched for is not printed, its font may simply not
 * set that flag; matching on the font name may be needed instead. Confirm
 * against the target PDF.
 */
class reference extends PDFTextStripper {
    /** Collected bold words; declared for callers in this package, not filled by this class. */
    static ArrayList<String> boldWordList = new ArrayList<String>();

    /** Concatenation of every ForceBold glyph seen so far, in processing order. */
    String boldWord = "";

    /** Scratch field kept for compatibility; not used by this class. */
    String c = "";

    /**
     * Creates the stripper with position-based sorting enabled.
     *
     * @throws IOException propagated from the {@code PDFTextStripper} constructor
     */
    public reference() throws IOException {
        super.setSortByPosition(true);
    }

    /**
     * Loads the hard-coded PDF, prints "References" if the extracted text
     * contains it, then processes each page's content stream with a
     * {@code reference} instance.
     *
     * @param args ignored
     * @throws Exception if the document cannot be loaded or processed
     */
    public static void main(String[] args) throws Exception {
        PDDocument document = null;
        try {
            File input = new File("C:\\abc.pdf");
            document = PDDocument.load(input);

            PDFTextStripper s = new PDFTextStripper();
            String content = s.getText(document);
            String keyword = "References";
            int n = content.indexOf(keyword);
            if (n >= 0) {
                String c = content.substring(n, n + keyword.length());
                System.out.println(c);
            } else {
                // indexOf returns -1 when the word is absent; substring(-1, ...) would throw.
                System.err.println("\"" + keyword + "\" not found in document text");
            }

            reference printer = new reference();
            List<?> allPages = document.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                PDStream contents = page.getContents();
                if (contents != null) {
                    // Reuse the stream that was just null-checked instead of fetching it again.
                    printer.processStream(page, page.findResources(), contents.getStream());
                }
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Appends the glyph to {@link #boldWord} and prints its position and font
     * metrics when its font descriptor has ForceBold set. Glyphs whose font
     * or font descriptor is missing are skipped.
     *
     * @param text The text to be processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        // Guard against fonts that carry no descriptor; dereferencing would throw an NPE.
        if (text.getFont() == null || text.getFont().getFontDescriptor() == null) {
            return;
        }
        if (!text.getFont().getFontDescriptor().isForceBold()) {
            return;
        }

        boldWord += text.getCharacter();
        System.out.println("String[" + text.getXDirAdj() + ","
                + text.getYDirAdj() + " Font Size=" + text.getFontSize()
                + " Font Family=" + text.getFont().getFontDescriptor().getFontName()
                + " xscale=" + text.getXScale() + " height=" + text.getHeightDir()
                + " space=" + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj()
                + "]" + text.getCharacter());
    }
}