我正在使用一个基于 PDFBox 的工具为 PDF 文件建立索引,并将相应的数据发送到 Solr。我想让 PDFBox 同时取得内容在 PDF 中所在的页码,并把页码也索引到 Solr 中。这样,当我在 Solr 中搜索某个关键字时,返回的结果里就能同时包含命中内容所在的页码。因此,我需要知道如何获取 PDF 的页码。
/**
 * Entry point of the indexing tool. Parses the command line and then runs in one of two modes:
 * <ul>
 * <li>option 'd' given: converts the PDFs in that folder through
 * {@code AnalysePdf.analysePdfFolder}, then reads every ".htm" file found under the folder,
 * sends one Solr document per non-empty "p.header" paragraph, deletes the ".htm" file and
 * commits;</li>
 * <li>otherwise: prepares a "tmp" directory next to the folder given by option 's' and sets up
 * a {@code FileAlterationObserver}/{@code FileAlterationMonitor} with a listener that records
 * created, changed and deleted files.</li>
 * </ul>
 * NOTE(review): option 's' is mandatory even when 'd' is used (see the check below) — confirm
 * that this is intended.
 * NOTE(review): the snippet is cut off inside the listener; the rest of this method is not
 * visible here.
 *
 * @param args command-line arguments, parsed by {@code getCommandLine}
 * @throws Exception anything raised while parsing, converting or indexing
 */
public static void main(String[] args) throws Exception {
String sdir = null;
String dir = null;
// Default interval handed to FileAlterationMonitor below; option 't' overrides it.
// NOTE(review): presumably milliseconds — confirm against the FileAlterationMonitor docs.
long pollingInterval = 10000;
PrintStream startupOut = System.out;
CommandLine cl = getCommandLine(startupOut, args);
// getCommandLine is defined outside this block; a null result is treated as invalid arguments.
if (cl == null) {
System.out.println("System exit. Your arguments was invalid. Please try again.");
return;
}
// Option 's' is required in both modes.
if (!cl.hasOption('s')) {
System.out.println("System exit. Your arguments was invalid. Please try again.");
return;
}
// These banners are printed in both modes, including the one-shot 'd' mode.
System.out.println("Welcome to use this program! This program is running as a service. To stop ctrl+c.");
System.out.println("Monitoring is started");
String dOption = cl.getOptionValue('d');
if (cl.hasOption('d')) {
dir = dOption;
}
if(dir != null) {
// One-shot mode: make sure the folder path ends with "/" because later code concatenates file names onto it.
if(dir.lastIndexOf("/") < dir.length() - 1)
dir += "/";
// Converts the PDFs of the folder; the loop below consumes the ".htm" files found afterwards.
AnalysePdf.analysePdfFolder(dir);
// solrServerUrl is declared outside this method.
// NOTE(review): solrClient is not closed anywhere in the visible code.
SolrClient solrClient = new HttpSolrClient.Builder(solrServerUrl).build();
// FileUtil.allFiles is cleared before collecting the files of this run.
FileUtil.allFiles.clear();
List<File> htmlfiles = FileUtil.getFiles(new File(dir));
if (htmlfiles.size() > 0) {
for (File file : htmlfiles) {
// Only files whose type is reported as "htm" are indexed.
if (!FileUtil.getFileType(file).equalsIgnoreCase("htm"))
continue;
Document doc = Jsoup.parse(file, "UTF-8");
Elements headers = doc.select("p.header");
// Second pass over the same file to get its raw text.
// NOTE(review): no charset is given here, so the platform default is used, while Jsoup above parses as UTF-8.
BufferedReader br1 = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
StringBuilder sb = new StringBuilder();
String line = null;
try {
// Lines are trimmed and concatenated without any separator.
while ((line = br1.readLine()) != null) {
sb.append(line.trim());
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
br1.close();
} catch (IOException e) {
// Closing failed; only reported, processing continues.
e.printStackTrace();
}
}
String content = sb.toString();
// File name without its extension.
String fileName = file.getName().substring(0, file.getName().lastIndexOf("."));
// Year is taken from characters 0-3 and month from characters 5-6 of the name.
// NOTE(review): presumably names look like "YYYY-MM..." — confirm; shorter names would throw here.
String year = fileName.substring(0, 4);
String month = fileName.substring(5, 7);
String version = "";
int len = fileName.length();
// Scan backwards from the last character for a 'v'/'V'; the version is the suffix starting there.
// The first character (index 0) is never examined because i stops at len - 1.
for (int i = 1; i < len; i++) {
if (fileName.substring(len - i, len - i + 1).equalsIgnoreCase("v")) {
version = fileName.substring(len - i);
break;
}
}
for (Element header : headers) {
String headerValue = header.text().trim();
// Skip empty headers and headers whose text starts with "http".
if (!headerValue.equals("") && !headerValue.startsWith("http")) {
// The paragraph text is the element following the header.
// NOTE(review): headerElement is not null-checked, unlike its own next sibling below.
Element headerElement = header.nextElementSibling();
String text = headerElement.text();
// Append the following element as well when it carries the "text" class.
if (headerElement.nextElementSibling() != null) {
if (headerElement.nextElementSibling().hasClass("text"))
text += headerElement.nextElementSibling().text();
}
// Preparing the Solr document
SolrInputDocument solrDoc = new SolrInputDocument();
// Adding fields to the document; "content" holds the whole file text for every paragraph document.
solrDoc.addField("content", content);
solrDoc.addField("paragraph_header", header.text());
solrDoc.addField("paragraph_txt", text);
// NOTE(review): replaceAll swaps every "htm" occurrence in the file name, not only the extension.
solrDoc.addField("url", dir + file.getName().replaceAll("htm", "pdf"));
solrDoc.addField("year", year);
solrDoc.addField("month", month);
solrDoc.addField("version", version);
// Sort key is year_month, with the version appended when one was found.
if(!version.equals(""))
solrDoc.addField("sorting", year + "_" + month + "_" + version);
else
solrDoc.addField("sorting", year + "_" + month);
// stream_size is the length of the ".htm" file being read here.
solrDoc.addField("stream_size", file.length());
solrDoc.addField("resource_name", file.getName().replaceAll("htm", "pdf"));
solrDoc.addField("content_type", "pdf");
// Adding the document to Solr
solrClient.add(solrDoc);
}
}
// The ".htm" file is removed once it has been processed.
if(file.exists())
file.delete();
}
}
System.out.println("Commiting...");
// Saving the changes
solrClient.commit();
System.out.println("Completed!...");
} else {
// Monitoring mode: watch the folder given by option 's'.
String sOption = cl.getOptionValue('s');
if (cl.hasOption('s')) {
sdir = String.valueOf(sOption);
// Same trailing "/" normalisation as for 'd'.
if(sdir.lastIndexOf("/") < sdir.length() - 1)
sdir += "/";
// rootDir is declared outside this method.
rootDir = sdir;
}
if(sdir != null) {
// Create a "tmp" directory next to the monitored folder; tmpDir is declared outside this method.
// NOTE(review): getParentFile() can return null for a path without a parent — not checked here.
File parentDir = new File(sdir).getParentFile();
if(parentDir.exists())
{
tmpDir = new File(parentDir.getAbsolutePath() + File.separator + "tmp");
if(!tmpDir.exists())
tmpDir.mkdir();
}
}
// Option 't' overrides the default polling interval.
String tOption = cl.getOptionValue('t');
if (cl.hasOption('t')) {
pollingInterval = Long.valueOf(tOption);
}
File folder = new File(sdir);
if (!folder.exists()) {
// Test to see if monitored folder exists
throw new RuntimeException("Directory not found: " + sdir);
}
// The path sets de-duplicate events by canonical path; the lists keep the corresponding File objects.
Set<String> modifiedPaths = new HashSet<String>();
List<File> modifiedFileList = new ArrayList<File>();
Set<String> deletedPaths = new HashSet<String>();
List<File> deletedFileList = new ArrayList<File>();
FileAlterationObserver observer = new FileAlterationObserver(folder);
FileAlterationMonitor monitor = new FileAlterationMonitor(pollingInterval);
FileAlterationListenerAdaptor listener = new FileAlterationListenerAdaptor() {
// Is triggered when a file is created in the monitored folder
@Override
public void onFileCreate(File file) {
// "file" is the reference to the newly created file; it is recorded once per canonical path
try {
if(modifiedPaths.add(file.getCanonicalPath()))
modifiedFileList.add(file);
} catch (IOException e) {
// The canonical path could not be resolved; the event is dropped after reporting.
e.printStackTrace();
}
}
// Is triggered when a file is changed in the monitored folder
@Override
public void onFileChange(File file) {
// "file" is the reference to the changed file; it shares the "modified" collections with created files
try {
if(modifiedPaths.add(file.getCanonicalPath()))
modifiedFileList.add(file);
} catch (IOException e) {
// The canonical path could not be resolved; the event is dropped after reporting.
e.printStackTrace();
}
}
// Is triggered when a file is deleted from the monitored folder
@Override
public void onFileDelete(File file) {
// Deleted files are tracked separately, again once per canonical path.
try {
if(deletedPaths.add(file.getCanonicalPath()))
deletedFileList.add(file);
} catch (IOException e) {
e.printStackTrace(System.err);
}
}
AnalysePdf代码:
package org.solr.extract.pdf;
import org.apache.commons.io.FileUtils;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * Converts PDF files into single ".htm" files by delegating to {@code ReadPdf}: each PDF is
 * split into per-page files inside a temporary page folder, the pages are extracted, and the
 * result is merged into one ".htm" file. The page folder is removed afterwards.
 *
 * <p>A PDF whose ".htm" output already exists is skipped.
 */
public class AnalysePdf {

    private static final String PDF_SUFFIX = ".pdf";
    private static final String HTM_SUFFIX = ".htm";

    /**
     * Converts every file ending in ".pdf" that sits directly in the given folder. The ".htm"
     * output and the temporary page folder are created inside that same folder.
     *
     * @param pdffolder path of the folder holding the PDFs; a trailing separator is optional
     * @throws IOException if a conversion step or the removal of the page folder fails
     */
    public static void analysePdfFolder(String pdffolder) throws IOException {
        File dir = new File(pdffolder);
        if (!dir.isDirectory()) {
            System.out.println("Pdf Dir does not exists : " + pdffolder);
            return;
        }
        // File.list() returns null when the directory cannot be read; treat that like "empty".
        String[] list = dir.list();
        if (list == null || list.length == 0) {
            System.out.println("Pdf Dir no files end with : " + pdffolder);
            return;
        }
        // Paths are built by concatenation, so make sure the folder ends with a separator.
        String base = withTrailingSeparator(pdffolder);
        for (String pdffilename : list) {
            if (!pdffilename.endsWith(PDF_SUFFIX)) {
                continue;
            }
            String pdfsubname = baseName(pdffilename);
            // Swap only the extension: a blanket replace(".pdf", ".htm") would also rewrite
            // ".pdf" inside folder names or earlier in the file name.
            String htmout = base + pdfsubname + HTM_SUFFIX;
            String pdfpagepath = base + pdfsubname + File.separator;
            convertPdf(base + pdffilename, pdfpagepath, htmout);
        }
    }

    /**
     * Converts every file of the list whose absolute path ends in ".pdf". The ".htm" output and
     * the temporary page folder are created inside {@code tmpDir}.
     *
     * @param list   files to convert; a null or empty list only prints a message
     * @param tmpDir directory receiving the page folders and the ".htm" files
     * @throws IOException if a conversion step or the removal of the page folder fails
     */
    public static void analyseListPdf(List<File> list, String tmpDir) throws IOException {
        if (list == null || list.isEmpty()) {
            System.out.println("No files");
            return;
        }
        for (File file : list) {
            if (!file.getAbsolutePath().endsWith(PDF_SUFFIX)) {
                continue;
            }
            String pdfsubname = baseName(file.getName());
            // Split pages go to tmpDir/<name>/, the merged output to tmpDir/<name>.htm.
            String pdfpagepath = tmpDir + File.separator + pdfsubname + File.separator;
            String htmout = tmpDir + File.separator + pdfsubname + HTM_SUFFIX;
            convertPdf(file.getAbsolutePath(), pdfpagepath, htmout);
        }
    }

    /**
     * Runs split, extract and merge for one PDF unless its output already exists, and removes
     * the page folder afterwards.
     */
    private static void convertPdf(String pdfPath, String pagesDir, String htmOut) throws IOException {
        if (new File(htmOut).exists()) {
            return;
        }
        boolean merged = false;
        try {
            // Split pages of one pdf
            ReadPdf.Splitepages(pdfPath, pagesDir);
            // Extract subpages
            ReadPdf.AnalysisFolder(pagesDir);
            // Merge subpages to one htm
            ReadPdf.MergePages(htmOut, pagesDir);
            merged = true;
        } finally {
            File pages = new File(pagesDir);
            if (merged) {
                // Normal path: a failure to clean up is reported to the caller.
                FileUtils.forceDelete(pages);
            } else {
                // Failure path: clean up without hiding the exception that is propagating.
                FileUtils.deleteQuietly(pages);
            }
        }
    }

    /** Returns the file name without its last extension, or the name itself if it has none. */
    private static String baseName(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

    /** Returns the folder path, adding a separator unless it already ends with one. */
    private static String withTrailingSeparator(String folder) {
        if (folder.endsWith("/") || folder.endsWith(File.separator)) {
            return folder;
        }
        return folder + File.separator;
    }
}