Question

我正在使用一个基于PDFBox的工具来索引PDF文件，并将其连同相应的数据一起发送到Solr。我想做的是让PDFBox获取PDF的页码，并将页码也索引到Solr中。这样，当我在Solr中搜索某个关键字时，搜索结果就能同时给出命中内容所在的页码。因此，我需要知道如何获取PDF的页码。

public static void main(String[] args) throws Exception {

        String sdir = null;
        String dir = null;
        long pollingInterval = 10000;

        PrintStream startupOut = System.out;
        CommandLine cl = getCommandLine(startupOut, args);
        if (cl == null) {
            System.out.println("System exit. Your arguments was invalid. Please try again.");
            return;
        }

        if (!cl.hasOption('s')) {
            System.out.println("System exit. Your arguments was invalid. Please try again.");
            return;
        }

        System.out.println("Welcome to use this program! This program is running as a service. To stop ctrl+c.");
        System.out.println("Monitoring is started");

        String dOption = cl.getOptionValue('d');
        if (cl.hasOption('d')) {
            dir = dOption;
        }

        if(dir != null) {

            if(dir.lastIndexOf("/") < dir.length() - 1)
                dir += "/";

            AnalysePdf.analysePdfFolder(dir);

            SolrClient solrClient = new HttpSolrClient.Builder(solrServerUrl).build();

            FileUtil.allFiles.clear();
            List<File> htmlfiles = FileUtil.getFiles(new File(dir));

            if (htmlfiles.size() > 0) {

                for (File file : htmlfiles) {

                    if (!FileUtil.getFileType(file).equalsIgnoreCase("htm"))
                        continue;

                    Document doc = Jsoup.parse(file, "UTF-8");
                    Elements headers = doc.select("p.header");

                    BufferedReader br1 = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
                    StringBuilder sb = new StringBuilder();

                    String line = null;
                    try {

                        while ((line = br1.readLine()) != null) {
                            sb.append(line.trim());
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    } finally {
                        try {
                            br1.close();
                        } catch (IOException e) {
                            // TODO Auto-generated catch block
                            e.printStackTrace();
                        }
                    }
                    String content = sb.toString();
                    String fileName = file.getName().substring(0, file.getName().lastIndexOf("."));
                    String year = fileName.substring(0, 4);
                    String month = fileName.substring(5, 7);
                    String version = "";

                    int len = fileName.length();
                    for (int i = 1; i < len; i++) {
                        if (fileName.substring(len - i, len - i + 1).equalsIgnoreCase("v")) {
                            version = fileName.substring(len - i);
                            break;
                        }
                    }

                    for (Element header : headers) {
                        String headerValue = header.text().trim();

                        if (!headerValue.equals("") && !headerValue.startsWith("http")) {

                            Element headerElement = header.nextElementSibling();
                            String text = headerElement.text();
                            if (headerElement.nextElementSibling() != null) {
                                if (headerElement.nextElementSibling().hasClass("text"))
                                    text += headerElement.nextElementSibling().text();
                            }

                            // Preparing the Solr document
                            SolrInputDocument solrDoc = new SolrInputDocument();

                            // Adding fields to the document
                            solrDoc.addField("content", content);
                            solrDoc.addField("paragraph_header", header.text());
                            solrDoc.addField("paragraph_txt", text);
                            solrDoc.addField("url", dir + file.getName().replaceAll("htm", "pdf"));
                            solrDoc.addField("year", year);
                            solrDoc.addField("month", month);
                            solrDoc.addField("version", version);

                            if(!version.equals(""))
                                solrDoc.addField("sorting", year + "_" + month + "_" + version);
                            else
                                solrDoc.addField("sorting", year + "_" + month);

                            solrDoc.addField("stream_size", file.length());
                            solrDoc.addField("resource_name", file.getName().replaceAll("htm", "pdf"));
                            solrDoc.addField("content_type", "pdf");

                            // Adding the document to Solr
                            solrClient.add(solrDoc);
                        }
                    }
                    if(file.exists())
                        file.delete();
                }
            }

            System.out.println("Commiting...");

            // Saving the changes
            solrClient.commit();

            System.out.println("Completed!...");

        } else {

            String sOption = cl.getOptionValue('s');
            if (cl.hasOption('s')) {
                sdir = String.valueOf(sOption);
                if(sdir.lastIndexOf("/") < sdir.length() - 1)
                    sdir += "/";
                rootDir = sdir;
            }

            if(sdir != null) {
                File parentDir = new File(sdir).getParentFile();
                if(parentDir.exists())
                {
                    tmpDir = new File(parentDir.getAbsolutePath() + File.separator + "tmp");
                    if(!tmpDir.exists())
                        tmpDir.mkdir();
                }
            }

            String tOption = cl.getOptionValue('t');
            if (cl.hasOption('t')) {
                pollingInterval = Long.valueOf(tOption);
            }

            File folder = new File(sdir);

            if (!folder.exists()) {
                // Test to see if monitored folder exists
                throw new RuntimeException("Directory not found: " + sdir);
            }

            Set<String> modifiedPaths = new HashSet<String>();
            List<File> modifiedFileList = new ArrayList<File>();
            Set<String> deletedPaths = new HashSet<String>();
            List<File> deletedFileList = new ArrayList<File>();

            FileAlterationObserver observer = new FileAlterationObserver(folder);
            FileAlterationMonitor monitor = new FileAlterationMonitor(pollingInterval);
            FileAlterationListenerAdaptor listener = new FileAlterationListenerAdaptor() {
                // Called when a file is created in the monitored folder: the file is
                // appended to modifiedFileList unless its canonical path is already
                // present in modifiedPaths.
                @Override
                public void onFileCreate(File file) {
                    try {
                        String canonicalPath = file.getCanonicalPath();
                        boolean notYetRecorded = modifiedPaths.add(canonicalPath);
                        if (notYetRecorded) {
                            modifiedFileList.add(file);
                        }
                    } catch (IOException e) {
                        // The canonical path could not be resolved; the file is not recorded.
                        e.printStackTrace();
                    }
                }

                // Called when a file in the monitored folder is changed: the file is
                // appended to modifiedFileList unless its canonical path is already
                // present in modifiedPaths (shared with onFileCreate, so a file that
                // was both created and changed is recorded once).
                @Override
                public void onFileChange(File file) {
                    try {
                        String canonicalPath = file.getCanonicalPath();
                        boolean notYetRecorded = modifiedPaths.add(canonicalPath);
                        if (notYetRecorded) {
                            modifiedFileList.add(file);
                        }
                    } catch (IOException e) {
                        // The canonical path could not be resolved; the file is not recorded.
                        e.printStackTrace();
                    }
                }

                // Called when a file is deleted from the monitored folder: the file is
                // appended to deletedFileList unless its canonical path is already
                // present in deletedPaths.
                @Override
                public void onFileDelete(File file) {
                    try {
                        String canonicalPath = file.getCanonicalPath();
                        boolean notYetRecorded = deletedPaths.add(canonicalPath);
                        if (notYetRecorded) {
                            deletedFileList.add(file);
                        }
                    } catch (IOException e) {
                        // The canonical path could not be resolved; the file is not recorded.
                        e.printStackTrace(System.err);
                    }
                }

AnalysePdf代码：

package org.solr.extract.pdf;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Converts PDF files into single {@code .htm} files.
 *
 * <p>Each PDF is processed in three steps that are delegated to {@code ReadPdf}:
 * the PDF is split into pages inside a per-PDF working directory, the split
 * pages are extracted, and the results are merged into one {@code .htm} file.
 * The working directory is deleted after a successful merge. A PDF whose
 * {@code .htm} file already exists is skipped.
 */
public class AnalysePdf {

    /** File name suffix that marks a file as a PDF to convert (case-sensitive). */
    private static final String PDF_EXTENSION = ".pdf";

    /** File name suffix of the generated HTML file. */
    private static final String HTM_EXTENSION = ".htm";

    /**
     * Converts every {@code .pdf} file found directly inside a folder.
     *
     * <p>The {@code .htm} output and the temporary page directory are created
     * next to the PDF, inside {@code pdffolder}.
     *
     * @param pdffolder path of the folder to scan; it is concatenated directly
     *                  with the file names, so it must end with a path separator
     * @throws IOException if the folder cannot be listed, or if splitting,
     *                     extracting, merging or cleaning up fails
     */
    public static void analysePdfFolder(String pdffolder) throws IOException {
        File dir = new File(pdffolder);

        if (!dir.isDirectory()) {
            System.out.println("Pdf Dir does not exists : " + pdffolder);
            return;
        }

        // File.list() returns null (not an empty array) when an I/O error occurs.
        String[] list = dir.list();
        if (list == null) {
            throw new IOException("Unable to list Pdf Dir : " + pdffolder);
        }

        if (list.length == 0) {
            System.out.println("Pdf Dir no files end with : " + pdffolder);
            return;
        }

        for (String pdffilename : list) {
            if (!pdffilename.endsWith(PDF_EXTENSION)) {
                continue;
            }

            String pdfsubname = stripExtension(pdffilename);

            // Build the output name from the base name instead of replacing
            // ".pdf" in the whole path: a replace would also rewrite any
            // ".pdf" occurring in the folder path or in the middle of the name.
            convertPdf(
                    pdffolder + pdffilename,
                    pdffolder + pdfsubname + File.separator,
                    pdffolder + pdfsubname + HTM_EXTENSION);
        }
    }

    /**
     * Converts the {@code .pdf} files of an explicit list, writing all output
     * into a separate directory.
     *
     * <p>For a file {@code name.pdf} the split pages go to
     * {@code tmpDir/name/} and the merged result to {@code tmpDir/name.htm}.
     * Files whose absolute path does not end with {@code .pdf} are ignored.
     *
     * @param list   files to convert; {@code null} or empty means nothing to do
     * @param tmpDir directory that receives the page directories and the
     *               {@code .htm} files
     * @throws IOException if splitting, extracting, merging or cleaning up fails
     */
    public static void analyseListPdf(List<File> list, String tmpDir) throws IOException {
        if (list == null || list.isEmpty()) {
            System.out.println("No files");
            return;
        }

        for (File file : list) {
            if (!file.getAbsolutePath().endsWith(PDF_EXTENSION)) {
                continue;
            }

            String baseName = stripExtension(file.getName());
            String outputPrefix = tmpDir + File.separator + baseName;

            convertPdf(
                    file.getAbsolutePath(),
                    outputPrefix + File.separator,
                    outputPrefix + HTM_EXTENSION);
        }
    }

    /**
     * Returns the file name without its last extension.
     *
     * @param fileName a file name that contains at least one {@code '.'}
     * @return the part of {@code fileName} before the last {@code '.'}
     */
    private static String stripExtension(String fileName) {
        return fileName.substring(0, fileName.lastIndexOf("."));
    }

    /**
     * Runs the split / extract / merge pipeline for one PDF, unless its
     * {@code .htm} output already exists.
     *
     * @param pdfPath  path of the PDF to convert
     * @param pagesDir working directory for the split pages, ending with a
     *                 path separator; deleted after a successful merge
     * @param htmPath  path of the {@code .htm} file to produce
     * @throws IOException if any pipeline step or the cleanup fails
     */
    private static void convertPdf(String pdfPath, String pagesDir, String htmPath)
            throws IOException {
        if (new File(htmPath).exists()) {
            return;
        }

        // Split pages of one pdf
        ReadPdf.Splitepages(pdfPath, pagesDir);

        // Extract subpages
        ReadPdf.AnalysisFolder(pagesDir);

        // Merge subpages to one htm
        ReadPdf.MergePages(htmPath, pagesDir);

        FileUtils.forceDelete(new File(pagesDir));
    }
}

如何使用PDFBox获取PDF页码？

0 个答案: