我正在建立一个索引PDF文件的系统和一个搜索应用程序。
我正在使用Solr 7.4.0和PDFBox作为索引工具。
我遇到的问题是:索引PDF文件时需要同时保留页码信息。这是因为我想要实现的是,当我搜索关键字并获取结果时,还能看到指向PDF文件中关键字所在位置的链接,类似 http://myserver/files/selectedfile.pdf#page=5
这样,用户单击链接就能直接看到PDF中包含该关键字的页面。我不明白如何实现这一目标:我需要把关键字与它在PDF中的位置(页码)关联起来。
到目前为止,我已经构建了一个基于PDFBox的工具,下面贴出其中的主类(Monitor)和AnalysePdf.java。如果需要更多信息才能理解该问题,我可以在此处补充,或者把包含完整代码的工具发给您。
package org.solr.index.pdf.jakub;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.monitor.FileAlterationListenerAdaptor;
import org.apache.commons.io.monitor.FileAlterationMonitor;
import org.apache.commons.io.monitor.FileAlterationObserver;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrInputDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.solr.extract.pdf.AnalysePdf;
/**
 * Command-line tool that pushes the content of PDF files into a Solr core.
 *
 * <p>Two modes are available (note that {@code -s} must always be present, even when
 * {@code -d} is used):
 * <ul>
 * <li>{@code -d <dir>}: convert the PDFs of {@code dir} once, index them and return;</li>
 * <li>{@code -s <dir>} [{@code -t <ms>}]: poll {@code dir}; created or changed PDFs are
 * indexed and deleted PDFs are removed from the index at the end of each polling cycle.</li>
 * </ul>
 *
 * <p>PDFs are first converted into intermediate {@code .htm} files by {@link AnalysePdf}.
 * Every {@code p.header} element of such a file becomes one Solr document; the
 * {@code .htm} file is deleted once it has been processed.
 */
public class Monitor {
    /* URL to Solr + core name -> change "docs" to your core name if needed. */
    private static String solrServerUrl = "http://localhost:8983/solr/docs";
    /** Charset of the intermediate .htm files; shared by the Jsoup parse and the raw read. */
    private static final String HTML_CHARSET = "UTF-8";
    /** Scratch directory receiving the intermediate .htm files in watch mode. */
    private static File tmpDir = null;
    /** Watched directory (with trailing slash); prefix of the "url" field in watch mode. */
    private static String rootDir = null;

    public Monitor() {}

    /** Prints the option help followed by the arguments that were actually received. */
    private static void usage(PrintStream out, String[] args) {
        HelpFormatter hf = new HelpFormatter();
        hf.printHelp("IdxWatchService", options());
        out.println("Your arguments were: " + StringUtils.join(args, ' '));
    }

    /** Declares the supported command-line options ({@code -s}, {@code -d}, {@code -t}). */
    private static Options options() {
        Options options = new Options();
        options.addOption("s", "service", true, "The Watching Directory which included pdf files.");
        options.addOption("d", "dir", true, "The Root Directory which included pdf files.");
        options.addOption("t", "time", true, "The interval time (ms).");
        return options;
    }

    /**
     * Parses the command line.
     *
     * @return the parsed command line, or {@code null} (after printing the usage) when
     *         parsing fails or no option was given
     */
    private static CommandLine getCommandLine(PrintStream out, String[] args) {
        CommandLineParser clp = new GnuParser();
        CommandLine cl;
        try {
            cl = clp.parse(options(), args);
        } catch (ParseException e) {
            usage(out, args);
            return null;
        }
        if (cl.getOptions().length == 0) {
            usage(out, args);
            return null;
        }
        return cl;
    }

    /**
     * Program entry point; see the class comment for the two modes.
     *
     * @param args command-line arguments
     * @throws Exception if one-shot indexing fails or the directory monitor cannot be started
     */
    public static void main(String[] args) throws Exception {
        PrintStream startupOut = System.out;
        CommandLine cl = getCommandLine(startupOut, args);
        if (cl == null || !cl.hasOption('s')) {
            System.out.println("System exit. Your arguments was invalid. Please try again.");
            return;
        }
        System.out.println("Welcome to use this program! This program is running as a service. To stop ctrl+c.");
        System.out.println("Monitoring is started");

        String dir = cl.hasOption('d') ? cl.getOptionValue('d') : null;
        if (dir != null) {
            indexDirectoryOnce(withTrailingSlash(dir));
        } else {
            watchDirectory(cl);
        }
    }

    /** Appends "/" unless the path is empty or already ends with it. */
    private static String withTrailingSlash(String path) {
        if (path.isEmpty() || path.endsWith("/")) {
            return path;
        }
        return path + "/";
    }

    /**
     * One-shot mode: converts the PDFs of {@code dir}, indexes the resulting .htm files and commits.
     *
     * @throws Exception declared broadly because the exceptions declared by {@code FileUtil}
     *         are not visible from this file
     */
    private static void indexDirectoryOnce(String dir) throws Exception {
        AnalysePdf.analysePdfFolder(dir);
        // try-with-resources: the client was previously never closed.
        try (SolrClient solrClient = new HttpSolrClient.Builder(solrServerUrl).build()) {
            FileUtil.allFiles.clear();
            indexHtmlFiles(solrClient, FileUtil.getFiles(new File(dir)), dir);
            System.out.println("Commiting...");
            // Saving the changes
            solrClient.commit();
            System.out.println("Completed!...");
        }
    }

    /**
     * Watch mode: polls the {@code -s} directory and synchronises the index after every cycle.
     *
     * @throws Exception if the directory does not exist, the scratch directory cannot be
     *         prepared, {@code -t} is not a number, or the monitor fails to start
     */
    private static void watchDirectory(CommandLine cl) throws Exception {
        String sdir = withTrailingSlash(String.valueOf(cl.getOptionValue('s')));
        rootDir = sdir;

        long pollingInterval = 10000;
        if (cl.hasOption('t')) {
            pollingInterval = Long.parseLong(cl.getOptionValue('t'));
        }

        File folder = new File(sdir);
        if (!folder.exists()) {
            // Test to see if monitored folder exists
            throw new RuntimeException("Directory not found: " + sdir);
        }

        // getAbsoluteFile(): a relative path such as "pdfs/" has no parent otherwise.
        File parentDir = folder.getAbsoluteFile().getParentFile();
        if (parentDir == null) {
            throw new RuntimeException("Cannot place a tmp directory next to: " + sdir);
        }
        tmpDir = new File(parentDir, "tmp");
        if (!tmpDir.exists() && !tmpDir.mkdir()) {
            throw new RuntimeException("Cannot create temporary directory: " + tmpDir);
        }

        final Set<String> modifiedPaths = new HashSet<String>();
        final List<File> modifiedFileList = new ArrayList<File>();
        final Set<String> deletedPaths = new HashSet<String>();
        final List<File> deletedFileList = new ArrayList<File>();

        FileAlterationObserver observer = new FileAlterationObserver(folder);
        FileAlterationMonitor monitor = new FileAlterationMonitor(pollingInterval);
        FileAlterationListenerAdaptor listener = new FileAlterationListenerAdaptor() {
            // Is triggered when a file is created in the monitored folder
            @Override
            public void onFileCreate(File file) {
                track(file, modifiedPaths, modifiedFileList);
            }

            // Is triggered when a file is changed in the monitored folder
            @Override
            public void onFileChange(File file) {
                track(file, modifiedPaths, modifiedFileList);
            }

            // Is triggered when a file is deleted from the monitored folder
            @Override
            public void onFileDelete(File file) {
                track(file, deletedPaths, deletedFileList);
            }

            // Is triggered once the observer has finished a check of the folder
            @Override
            public void onStop(FileAlterationObserver observer) {
                flushPendingChanges(modifiedFileList, modifiedPaths, deletedFileList, deletedPaths);
            }
        };
        observer.addListener(listener);
        monitor.addObserver(observer);
        monitor.start();
    }

    /** Queues {@code file} in {@code pending} unless its canonical path is already queued. */
    private static void track(File file, Set<String> seenPaths, List<File> pending) {
        try {
            if (seenPaths.add(file.getCanonicalPath())) {
                pending.add(file);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes the queued created/changed PDFs and removes the queued deleted PDFs from the index.
     *
     * <p>A queue (and its de-duplication set) is emptied only after its commit succeeded, so
     * a failed batch is retried on the next cycle. Clearing the sets as well is what allows
     * a file to be picked up again when it changes a second time.
     */
    private static void flushPendingChanges(List<File> modifiedFileList, Set<String> modifiedPaths,
            List<File> deletedFileList, Set<String> deletedPaths) {
        if (modifiedFileList.isEmpty() && deletedFileList.isEmpty()) {
            return; // idle cycle: do not even create a client
        }
        try (SolrClient solrClient = new HttpSolrClient.Builder(solrServerUrl).build()) {
            if (!modifiedFileList.isEmpty()) {
                AnalysePdf.analyseListPdf(modifiedFileList, tmpDir.getAbsolutePath());
                FileUtil.allFiles.clear();
                indexHtmlFiles(solrClient, FileUtil.getFiles(new File(tmpDir.getAbsolutePath())), rootDir);
                System.out.println("Committing...");
                // Saving the changes
                solrClient.commit();
                System.out.println("Completed!...");
                modifiedFileList.clear();
                modifiedPaths.clear();
            }
            if (!deletedFileList.isEmpty()) {
                for (File f : deletedFileList) {
                    try {
                        UpdateRequest ur = new UpdateRequest();
                        ur.deleteByQuery("resource_name:\"" + escapeForPhrase(f.getName()) + "\"");
                        solrClient.request(ur);
                        System.out.println(f.getAbsolutePath() + " has been deleted from index data.");
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                System.out.println("Commiting...");
                // Saving the changes
                solrClient.commit();
                System.out.println("Completed!...");
                deletedFileList.clear();
                deletedPaths.clear();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Escapes backslashes and double quotes so a value can sit inside a quoted query phrase. */
    private static String escapeForPhrase(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"");
    }

    /**
     * Indexes every {@code .htm} file of {@code htmlFiles} and deletes each one afterwards.
     *
     * @param urlPrefix directory prefix prepended to the PDF name in the "url" field
     * @throws Exception declared broadly because the exceptions declared by {@code FileUtil}
     *         are not visible from this file
     */
    private static void indexHtmlFiles(SolrClient solrClient, List<File> htmlFiles, String urlPrefix)
            throws Exception {
        if (htmlFiles == null) {
            return;
        }
        for (File file : htmlFiles) {
            if (!FileUtil.getFileType(file).equalsIgnoreCase("htm")) {
                continue;
            }
            indexHtmlFile(solrClient, file, urlPrefix);
            if (file.exists()) {
                file.delete();
            }
        }
    }

    /**
     * Adds one Solr document per non-empty {@code p.header} element of {@code file}.
     *
     * <p>The file name is expected to start with a 4-character year, one separator character
     * and a 2-character month, optionally followed by a version starting at the last
     * {@code v}/{@code V}. When the name is too short for year/month, those fields and
     * "sorting" are left out instead of failing the whole batch.
     */
    private static void indexHtmlFile(SolrClient solrClient, File file, String urlPrefix) throws Exception {
        Document doc = Jsoup.parse(file, HTML_CHARSET);
        Elements headers = doc.select("p.header");
        String content = readTrimmedLines(file);

        String htmName = file.getName();
        int dot = htmName.lastIndexOf('.');
        String baseName = dot >= 0 ? htmName.substring(0, dot) : htmName;
        // Swap only the extension; a blanket replace would also rewrite "htm" inside the name.
        String pdfName = baseName + ".pdf";

        String year = baseName.length() >= 4 ? baseName.substring(0, 4) : null;
        String month = baseName.length() >= 7 ? baseName.substring(5, 7) : null;
        String version = extractVersion(baseName);

        for (Element header : headers) {
            String headerValue = header.text().trim();
            if (headerValue.equals("") || headerValue.startsWith("http")) {
                continue;
            }
            // The header may be the last element of its parent: index it with empty text then.
            String text = "";
            Element paragraph = header.nextElementSibling();
            if (paragraph != null) {
                text = paragraph.text();
                Element continuation = paragraph.nextElementSibling();
                if (continuation != null && continuation.hasClass("text")) {
                    text += continuation.text();
                }
            }

            // Preparing the Solr document
            SolrInputDocument solrDoc = new SolrInputDocument();
            // Adding fields to the document
            solrDoc.addField("content", content);
            solrDoc.addField("paragraph_header", header.text());
            solrDoc.addField("paragraph_txt", text);
            solrDoc.addField("url", urlPrefix + pdfName);
            if (year != null) {
                solrDoc.addField("year", year);
            }
            if (month != null) {
                solrDoc.addField("month", month);
            }
            solrDoc.addField("version", version);
            if (year != null && month != null) {
                if (!version.equals("")) {
                    solrDoc.addField("sorting", year + "_" + month + "_" + version);
                } else {
                    solrDoc.addField("sorting", year + "_" + month);
                }
            }
            solrDoc.addField("stream_size", file.length());
            solrDoc.addField("resource_name", pdfName);
            solrDoc.addField("content_type", "pdf");
            // Adding the document to Solr
            solrClient.add(solrDoc);
        }
    }

    /**
     * Returns the suffix of {@code baseName} starting at its last {@code v}/{@code V},
     * ignoring a match at index 0, or "" when there is none.
     */
    private static String extractVersion(String baseName) {
        int idx = Math.max(baseName.lastIndexOf('v'), baseName.lastIndexOf('V'));
        return idx >= 1 ? baseName.substring(idx) : "";
    }

    /**
     * Reads {@code file} and concatenates its trimmed lines.
     *
     * <p>Best effort: on a read error the stack trace is printed and the content read so
     * far is returned.
     */
    private static String readTrimmedLines(File file) {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), HTML_CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line.trim());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
}
AnalysePdf.java
package org.solr.extract.pdf;
import org.apache.commons.io.FileUtils;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * Converts PDF files into intermediate {@code .htm} files by driving {@code ReadPdf}.
 *
 * <p>For each PDF the steps are (as described by the original inline comments): split the
 * PDF into pages inside a per-PDF directory, extract the sub-pages, merge them into one
 * {@code .htm} file, then delete the per-PDF directory. Only names ending in lowercase
 * {@code .pdf} are considered, and a PDF whose {@code .htm} already exists is skipped.
 */
public class AnalysePdf {
    private static final String PDF_EXT = ".pdf";
    private static final String HTM_EXT = ".htm";

    /**
     * Converts every {@code *.pdf} entry directly inside a folder into a sibling {@code .htm} file.
     *
     * @param pdffolder folder to scan; a trailing separator is optional
     * @throws IOException propagated from {@code ReadPdf} or from deleting the per-PDF page directory
     */
    public static void analysePdfFolder(String pdffolder) throws IOException {
        File dir = new File(pdffolder);
        if (!dir.isDirectory()) {
            System.out.println("Pdf Dir does not exists : " + pdffolder);
            return;
        }
        // list out all the file name and filter by the extension
        String[] list = dir.list();
        // File.list() returns null on an I/O error, not only for non-directories.
        if (list == null || list.length == 0) {
            System.out.println("Pdf Dir no files end with : " + pdffolder);
            return;
        }
        String prefix = pdffolder;
        if (!prefix.endsWith("/") && !prefix.endsWith(File.separator)) {
            prefix += File.separator;
        }
        for (String pdffilename : list) {
            if (!pdffilename.endsWith(PDF_EXT)) {
                continue;
            }
            String pdfsubname = stripPdfExtension(pdffilename);
            convert(prefix + pdffilename,
                    prefix + pdfsubname + File.separator,
                    prefix + pdfsubname + HTM_EXT);
        }
    }

    /**
     * Converts the given PDF files; page directories and {@code .htm} files are created in {@code tmpDir}.
     *
     * @param list files to convert; entries not ending in {@code .pdf} are ignored
     * @param tmpDir directory receiving the per-PDF page directories and the {@code .htm} files
     * @throws IOException propagated from {@code ReadPdf} or from deleting the per-PDF page directory
     */
    public static void analyseListPdf(List<File> list, String tmpDir) throws IOException {
        if (list == null || list.size() == 0) {
            System.out.println("No files");
            return;
        }
        for (File file : list) {
            if (!file.getAbsolutePath().endsWith(PDF_EXT)) {
                continue;
            }
            String pdfsubname = stripPdfExtension(file.getName());
            // create splited pdfs in tmpDir\pdfname directory, html files in tmpDir directory
            convert(file.getAbsolutePath(),
                    tmpDir + File.separator + pdfsubname + File.separator,
                    tmpDir + File.separator + pdfsubname + HTM_EXT);
        }
    }

    /**
     * Drops only the trailing extension; a blanket replace of ".pdf" would also rewrite
     * directory names or inner parts of the file name.
     */
    private static String stripPdfExtension(String pdfFileName) {
        return pdfFileName.substring(0, pdfFileName.lastIndexOf("."));
    }

    /** Runs the split / extract / merge pipeline for one PDF unless {@code htmout} already exists. */
    private static void convert(String pdffullname, String pdfpagepath, String htmout) throws IOException {
        if (new File(htmout).exists()) {
            return;
        }
        // Split pages of one pdf
        ReadPdf.Splitepages(pdffullname, pdfpagepath);
        // Extract subpages
        ReadPdf.AnalysisFolder(pdfpagepath);
        // Merge subpages to one htm
        ReadPdf.MergePages(htmout, pdfpagepath);
        FileUtils.forceDelete(new File(pdfpagepath));
    }
}