在PDFBox中查找具有相同字体属性的PDF文档中的文本

时间:2015-03-21 17:14:21

标签: java fonts pdfbox

我是Java和PDFBox的新手。我想在PDF文档中查找某个单词,将其字体属性(字体系列、XScale、字体大小等)存储在变量中,然后利用这些已存储的值查找具有相同字体属性的其他单词。我该如何使用 processTextPosition() 来显示字符串 "References" 的字体属性?

这是我的代码:

package pdfinjava;

import java.io.*;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

class reference extends PDFTextStripper {
    static ArrayList<String> boldWordList=new ArrayList<String>();
    String boldWord="";
    String c="";
    public reference() throws IOException {
        super.setSortByPosition(true);
    }
    public static void main(String[] args) throws Exception {
        PDDocument document = null;
        try {
            File input = new File("C:\\abc.pdf");
            document = PDDocument.load(input);
            //  File output=new File("D:/acadmic/7/research/pdf/2.txt");
            PDFTextStripper s=new PDFTextStripper();
            String content = s.getText(document);
            int n= content.indexOf("References");
            String c = content.substring(n, n+10);
            System.out.println(c);

            reference printer = new reference();
            List allPages = document.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                //System.out.println("Processing page: " + i);

                PDStream contents = page.getContents();
                if (contents != null) {
                    printer.processStream(page, page.findResources(), page.getContents().getStream());
                }
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * @param text The text to be processed
     */
    @Override 

    protected void processTextPosition(TextPosition text) {    
        if(text.getFont().getFontDescriptor().isForceBold()){
            boldWord+=text;
            //System.out.println(text);

            System.out.println("String["+ text.getXDirAdj() + ","
            + text.getYDirAdj() + " Font Size=" + text.getFontSize()
            + " Font Family=" + text.getFont().getFontDescriptor().getFontName()
            + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() 
            + " space=" + text.getWidthOfSpace() + " width="+ text.getWidthDirAdj() 
            + "]" + text.getCharacter());
        }
    }
}

0 个答案:

没有答案