Solr和PDFBox:如何在索引PDF文件时保留页码,并得到类似 http://localhost/something/file.pdf#page=4 的URL链接

时间:2018-08-08 09:11:14

标签: pdf solr pdfbox

我正在建立一个索引PDF文件的系统和一个搜索应用程序,使用的是Solr 7.4.0,并以PDFBox作为索引工具。我遇到的问题是:索引PDF文件时需要保留页码信息。这是因为我想实现的是,当我搜索关键字并得到结果时,还能看到一个指向该关键字在PDF文件中所在位置的链接,类似 http://myserver/files/selectedfile.pdf#page=5 这样的形式。

这样,用户单击链接就能直接看到关键字所在的PDF页面。我不明白该如何实现这一点:我需要把关键字与它在PDF中的位置(页码)关联起来。

到目前为止,我基于PDFBox构建了一个工具,下面贴出其中的主类(Monitor)和AnalysePdf.java。如果需要更多信息来理解这个问题,我可以在此处补充,或者把该工具的完整代码发给您。

package org.solr.index.pdf.jakub;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.monitor.FileAlterationListenerAdaptor;
import org.apache.commons.io.monitor.FileAlterationMonitor;
import org.apache.commons.io.monitor.FileAlterationObserver;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrInputDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.solr.extract.pdf.AnalysePdf;

/**
 * Command-line entry point that pushes PDF content into a Solr core.
 *
 * <p>PDFs are converted to ".htm" files through {@link AnalysePdf}; every
 * non-empty {@code p.header} paragraph found in such a file becomes one Solr
 * document. Two modes are supported:
 * <ul>
 *   <li>{@code -d <dir>}: convert and index the directory once, then return;</li>
 *   <li>{@code -s <dir>} without {@code -d}: poll the directory every
 *       {@code -t} milliseconds (default 10000), index created/changed files
 *       and remove deleted ones from the index.</li>
 * </ul>
 * Note that {@code -s} is required in both modes.
 */
public class Monitor {

    private static String solrServerUrl = "http://localhost:8983/solr/docs"; /* URL to Solr + Core Name -> change docs to your core name if need it*/

    /** Scratch directory for generated .htm files in watch mode; a "tmp" folder next to the watched directory. */
    private static File tmpDir = null;

    /** Watched directory (ending with "/"); used as the URL prefix of documents indexed in watch mode. */
    private static String rootDir = null;

    public Monitor() {}

    /**
     * Prints the command-line help followed by the arguments that were received.
     *
     * @param out  stream that receives the echo of the arguments
     * @param args raw command-line arguments
     */
    private static void usage(PrintStream out, String[] args) {
        HelpFormatter hf = new HelpFormatter();
        hf.printHelp("IdxWatchService", options());
        out.println("Your arguments were: " + StringUtils.join(args, ' '));
    }

    /**
     * Builds the supported command-line options ({@code -s}, {@code -d}, {@code -t}).
     *
     * @return a fresh {@link Options} instance
     */
    private static Options options() {
        Options options = new Options();
        options.addOption("s", "service", true, "The Watching Directory which included pdf files.");
        options.addOption("d", "dir", true, "The Root Directory which included pdf files.");
        options.addOption("t", "time", true, "The interval time (ms).");
        return options;
    }

    /**
     * Parses the command line.
     *
     * @param out  stream used to print the usage text on failure
     * @param args raw command-line arguments
     * @return the parsed command line, or {@code null} when parsing fails or no option was given
     */
    private static CommandLine getCommandLine(PrintStream out, String[] args) {
        CommandLineParser clp = new GnuParser();
        CommandLine cl;
        try {
            cl = clp.parse(options(), args);
        } catch (ParseException e) {
            usage(out, args);
            return null;
        }

        if (cl.getOptions().length == 0) {
            usage(out, args);
            return null;
        }

        return cl;
    }

    /**
     * Runs the one-shot indexer ({@code -d}) or starts the directory watcher ({@code -s}).
     *
     * @param args command-line arguments, see {@link #options()}
     * @throws Exception if conversion, indexing or the watcher start-up fails
     */
    public static void main(String[] args) throws Exception {

        long pollingInterval = 10000;

        CommandLine cl = getCommandLine(System.out, args);
        if (cl == null || !cl.hasOption('s')) {
            System.out.println("System exit. Your arguments was invalid. Please try again.");
            return;
        }

        System.out.println("Welcome to use this program! This program is running as a service. To stop ctrl+c.");
        System.out.println("Monitoring is started");

        if (cl.hasOption('d')) {
            indexDirectoryOnce(withTrailingSlash(cl.getOptionValue('d')));
            return;
        }

        String sdir = withTrailingSlash(cl.getOptionValue('s'));
        rootDir = sdir;

        File folder = new File(sdir);
        if (!folder.exists()) {
            // Test to see if monitored folder exists
            throw new RuntimeException("Directory not found: " + sdir);
        }

        // Resolve against the absolute path: getParentFile() of a relative,
        // single-component path (e.g. "pdfs/") would be null.
        File parentDir = folder.getAbsoluteFile().getParentFile();
        if (parentDir == null) {
            throw new IllegalStateException("Cannot create a tmp directory next to " + sdir);
        }
        tmpDir = new File(parentDir, "tmp");
        if (!tmpDir.isDirectory() && !tmpDir.mkdirs()) {
            throw new IOException("Cannot create temporary directory: " + tmpDir.getAbsolutePath());
        }

        if (cl.hasOption('t')) {
            pollingInterval = Long.parseLong(cl.getOptionValue('t'));
        }

        FileAlterationObserver observer = new FileAlterationObserver(folder);
        FileAlterationMonitor monitor = new FileAlterationMonitor(pollingInterval);
        observer.addListener(new IndexingListener());
        monitor.addObserver(observer);
        monitor.start();
    }

    /**
     * Appends "/" to a non-empty path that does not already end with it.
     */
    private static String withTrailingSlash(String path) {
        if (path.isEmpty() || path.endsWith("/")) {
            return path;
        }
        return path + "/";
    }

    /**
     * Converts every PDF below {@code dir}, indexes the generated .htm files and commits.
     *
     * @param dir directory path ending with "/"; also used as URL prefix of the indexed documents
     * @throws Exception if conversion, parsing or the communication with Solr fails
     */
    private static void indexDirectoryOnce(String dir) throws Exception {
        AnalysePdf.analysePdfFolder(dir);

        // try-with-resources: the HTTP client must be released when we are done.
        try (SolrClient solrClient = new HttpSolrClient.Builder(solrServerUrl).build()) {
            indexHtmlFiles(solrClient, new File(dir), dir);

            System.out.println("Commiting...");

            // Saving the changes
            solrClient.commit();

            System.out.println("Completed!...");
        }
    }

    /**
     * Indexes all "htm" files returned by {@code FileUtil.getFiles(htmlDir)} and deletes
     * each of them afterwards. Nothing is committed here.
     *
     * @param solrClient client used to add the documents
     * @param htmlDir    directory that holds the generated .htm files
     * @param urlPrefix  prefix put in front of the PDF file name in the {@code url} field
     * @throws Exception if parsing or the communication with Solr fails
     */
    private static void indexHtmlFiles(SolrClient solrClient, File htmlDir, String urlPrefix) throws Exception {
        FileUtil.allFiles.clear();
        List<File> htmlFiles = FileUtil.getFiles(htmlDir);

        for (File file : htmlFiles) {
            if (!FileUtil.getFileType(file).equalsIgnoreCase("htm")) {
                continue;
            }
            indexHtmlFile(solrClient, file, urlPrefix);
            if (file.exists()) {
                file.delete();
            }
        }
    }

    /**
     * Adds one Solr document per non-empty, non-"http" {@code p.header} paragraph of {@code file}.
     *
     * <p>The file name (without extension) is expected to start with a four-character year,
     * one separator character and a two-character month; an optional version is the suffix
     * starting at the last 'v'/'V' that is not the first character.
     *
     * NOTE(review): documents previously indexed for the same PDF are not removed here, so
     * re-indexing a changed file may leave duplicates depending on the schema - confirm.
     *
     * @param solrClient client used to add the documents
     * @param file       generated .htm file
     * @param urlPrefix  prefix put in front of the PDF file name in the {@code url} field
     * @throws Exception if parsing or the communication with Solr fails
     */
    private static void indexHtmlFile(SolrClient solrClient, File file, String urlPrefix) throws Exception {
        Document doc = Jsoup.parse(file, "UTF-8");
        Elements headers = doc.select("p.header");

        String content = readTrimmedLines(file);

        String htmName = file.getName();
        int dot = htmName.lastIndexOf('.');
        String fileName = dot >= 0 ? htmName.substring(0, dot) : htmName;

        // Guard the fixed-position slices so a short/unexpected name does not abort the whole run.
        String year = fileName.length() >= 4 ? fileName.substring(0, 4) : "";
        String month = fileName.length() >= 7 ? fileName.substring(5, 7) : "";

        int vPos = Math.max(fileName.lastIndexOf('v'), fileName.lastIndexOf('V'));
        String version = vPos >= 1 ? fileName.substring(vPos) : "";

        // Literal inverse of the ".pdf" -> ".htm" renaming done by AnalysePdf. The previous
        // regex replaceAll("htm", "pdf") also mangled names that merely contain "htm" (e.g. "html").
        String pdfName = htmName.replace(".htm", ".pdf");

        for (Element header : headers) {
            String headerValue = header.text().trim();
            if (headerValue.isEmpty() || headerValue.startsWith("http")) {
                continue;
            }

            // A header may be the last element of its parent: index it with empty text
            // instead of failing with a NullPointerException.
            String text = "";
            Element body = header.nextElementSibling();
            if (body != null) {
                text = body.text();
                Element continuation = body.nextElementSibling();
                if (continuation != null && continuation.hasClass("text")) {
                    text += continuation.text();
                }
            }

            // Preparing the Solr document
            SolrInputDocument solrDoc = new SolrInputDocument();

            // Adding fields to the document
            solrDoc.addField("content", content);
            solrDoc.addField("paragraph_header", header.text());
            solrDoc.addField("paragraph_txt", text);
            solrDoc.addField("url", urlPrefix + pdfName);
            solrDoc.addField("year", year);
            solrDoc.addField("month", month);
            solrDoc.addField("version", version);

            if (!version.equals("")) {
                solrDoc.addField("sorting", year + "_" + month + "_" + version);
            } else {
                solrDoc.addField("sorting", year + "_" + month);
            }

            solrDoc.addField("stream_size", file.length());
            solrDoc.addField("resource_name", pdfName);
            solrDoc.addField("content_type", "pdf");

            // Adding the document to Solr
            solrClient.add(solrDoc);
        }
    }

    /**
     * Reads {@code file} as UTF-8 (the charset Jsoup is told to use for the same file) and
     * concatenates its trimmed lines. Read errors are reported and whatever was read so far
     * is returned, as before.
     */
    private static String readTrimmedLines(File file) {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line.trim());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Collects file events of one polling cycle and applies them to Solr in {@link #onStop}.
     */
    private static final class IndexingListener extends FileAlterationListenerAdaptor {

        private final Set<String> modifiedPaths = new HashSet<String>();
        private final List<File> modifiedFileList = new ArrayList<File>();
        private final Set<String> deletedPaths = new HashSet<String>();
        private final List<File> deletedFileList = new ArrayList<File>();

        // Is triggered when a file is created in the monitored folder
        @Override
        public void onFileCreate(File file) {
            remember(file, modifiedPaths, modifiedFileList);
        }

        // Is triggered when a file is changed in the monitored folder
        @Override
        public void onFileChange(File file) {
            remember(file, modifiedPaths, modifiedFileList);
        }

        // Is triggered when a file is deleted from the monitored folder
        @Override
        public void onFileDelete(File file) {
            remember(file, deletedPaths, deletedFileList);
        }

        /** Queues {@code file} unless its canonical path is already queued. */
        private static void remember(File file, Set<String> seen, List<File> queue) {
            try {
                if (seen.add(file.getCanonicalPath())) {
                    queue.add(file);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /** Escapes backslashes and double quotes so the value can sit inside a quoted query phrase. */
        private static String escapeQuoted(String value) {
            return value.replace("\\", "\\\\").replace("\"", "\\\"");
        }

        @Override
        public void onStop(FileAlterationObserver observer) {

            // Nothing happened during this cycle: do not even open a client.
            if (modifiedFileList.isEmpty() && deletedFileList.isEmpty()) {
                return;
            }

            try (SolrClient solrClient = new HttpSolrClient.Builder(solrServerUrl).build()) {

                if (!modifiedFileList.isEmpty()) {

                    AnalysePdf.analyseListPdf(modifiedFileList, tmpDir.getAbsolutePath());
                    indexHtmlFiles(solrClient, tmpDir, rootDir);

                    System.out.println("Committing...");

                    // Saving the changes
                    solrClient.commit();

                    System.out.println("Completed!...");
                    modifiedFileList.clear();
                    // Reset the de-duplication set as well, otherwise a later change of
                    // the same file would never be queued again.
                    modifiedPaths.clear();
                }

                if (!deletedFileList.isEmpty()) {

                    for (File f : deletedFileList) {
                        try {
                            UpdateRequest ur = new UpdateRequest();
                            ur.deleteByQuery("resource_name:\"" + escapeQuoted(f.getName()) + "\"");
                            solrClient.request(ur);

                            System.out.println(f.getAbsolutePath() + " has been deleted from index data.");
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }

                    System.out.println("Commiting...");

                    // Saving the changes
                    solrClient.commit();

                    System.out.println("Completed!...");
                    deletedFileList.clear();
                    deletedPaths.clear();
                }
            } catch (Exception e) {
                // Queues are left untouched so the work is retried on the next cycle.
                e.printStackTrace();
            }
        }
    }
}

AnalysePdf.java

package org.solr.extract.pdf;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Converts PDF files into ".htm" files by delegating to {@code ReadPdf}:
 * each PDF is split into pages in a per-PDF working directory, the pages are
 * analysed, merged into a single .htm file, and the working directory is removed.
 */
public class AnalysePdf {

    /**
     * Converts every "*.pdf" file directly inside {@code pdffolder}; the resulting
     * .htm file is written next to the PDF. PDFs whose .htm file already exists are skipped.
     *
     * @param pdffolder path of the folder that contains the PDF files (a trailing
     *                  separator is optional)
     * @throws IOException if the conversion or the clean-up of the working directory fails
     */
    public static void analysePdfFolder(String pdffolder) throws IOException {

        File dir = new File(pdffolder);

        if (!dir.isDirectory()) {
            System.out.println("Pdf Dir does not exists : " + pdffolder);
            return;
        }

        // list() returns null when the directory cannot be read.
        String[] list = dir.list();

        if (list == null || list.length == 0) {
            System.out.println("Pdf Dir no files end with : " + pdffolder);
            return;
        }

        // Paths below are built by concatenation, so make sure the prefix ends with a separator.
        String folderPrefix = pdffolder;
        if (!folderPrefix.endsWith("/") && !folderPrefix.endsWith(File.separator)) {
            folderPrefix += File.separator;
        }

        for (String pdffilename : list) {

            if (!pdffilename.endsWith(".pdf")) {
                continue;
            }

            String pdfsubname = pdffilename.substring(0, pdffilename.lastIndexOf("."));

            // Rename on the file name only: doing it on the full path would also rewrite
            // a ".pdf" that happens to be part of a directory name.
            String htmout = folderPrefix + pdffilename.replace(".pdf", ".htm");

            convertPdf(folderPrefix + pdffilename, folderPrefix + pdfsubname + File.separator, htmout);
        }
    }

    /**
     * Converts the "*.pdf" files of {@code list}; working directories and the resulting
     * .htm files are created inside {@code tmpDir}. PDFs whose .htm file already exists
     * there are skipped.
     *
     * @param list   files to convert; entries not ending in ".pdf" are ignored
     * @param tmpDir directory that receives the .htm files
     * @throws IOException if the conversion or the clean-up of the working directory fails
     */
    public static void analyseListPdf(List<File> list, String tmpDir) throws IOException {

        if (list.size() == 0) {
            System.out.println("No files");
            return;
        }

        for (File file : list) {

            if (!file.getAbsolutePath().endsWith(".pdf")) {
                continue;
            }

            String name = file.getName();

            // create splited pdfs in tmpDir\pdfname directory
            String pdfpagepath = tmpDir + File.separator + name.substring(0, name.lastIndexOf(".")) + File.separator;

            // create html files in tempDir directory
            String htmout = tmpDir + File.separator + name.replace(".pdf", ".htm");

            convertPdf(file.getAbsolutePath(), pdfpagepath, htmout);
        }
    }

    /**
     * Runs the split / analyse / merge pipeline for one PDF unless {@code htmOut} already
     * exists, and always removes the page working directory afterwards.
     *
     * @param pdfPath  path of the PDF to convert
     * @param pagesDir working directory for the split pages (ends with a separator)
     * @param htmOut   path of the merged .htm file to produce
     * @throws IOException if a pipeline step fails, or if the working directory cannot be
     *                     deleted after a successful conversion
     */
    private static void convertPdf(String pdfPath, String pagesDir, String htmOut) throws IOException {

        if (new File(htmOut).exists()) {
            return;
        }

        boolean converted = false;
        try {
            // Split pages of one pdf
            ReadPdf.Splitepages(pdfPath, pagesDir);

            // Extract subpages
            ReadPdf.AnalysisFolder(pagesDir);

            // Merge subpages to one htm
            ReadPdf.MergePages(htmOut, pagesDir);

            converted = true;
        } finally {
            File pages = new File(pagesDir);
            if (converted) {
                FileUtils.forceDelete(pages);
            } else {
                // Best-effort clean-up that must not mask the exception being propagated.
                FileUtils.deleteQuietly(pages);
            }
        }
    }
}

0 个答案:

没有答案