如何使用Java和Apache Lucene,通过正则表达式在PDF文档中搜索字母数字文本?

时间:2015-08-13 12:53:46

标签: java regex lucene pdfbox

我想使用Java中的正则表达式(regex)从PDF文档中搜索字母数字文本(发票编号F0000004511)。我该怎么做?例如,PDF第一页是这样的:

销售 - 发票 T.I.N. No. 02020600021 传真号码 +91-1792-232268 发票编号 F0000004511

在PDF第二页,发票编号变为F0000004512,第三页和第四页的发票编号与第二页相同。我需要根据发票编号进行搜索并拆分pdf页面。我使用APACHE LUCENE 3.4.0对pdf进行索引和搜索。以下代码用于索引pdf:

/**
 * Builds a Lucene index over the readable {@code .txt}, {@code .doc} and {@code .pdf} files
 * located directly inside {@link #sourceFilePath}; the index is written to
 * {@link #indexFilePath}.
 *
 * <p>All work is done by the private constructor, which {@link #main(String[])} invokes.
 * Indexing is best-effort: a failure on one file is reported on standard output and the
 * remaining files are still processed.
 */
public class Indexer {

    private final String sourceFilePath = "G:/PDFCopy";    // directory containing the files to index
    private final String indexFilePath = "G:/searchEngine";   // directory in which the index is created
    private IndexWriter writer = null;
    private File indexDirectory = null;


    /**
     * Runs the complete indexing job: opens the writer, indexes every eligible file, closes
     * the writer and prints the document count and the elapsed time.
     *
     * <p>Every exception is caught and reported here, so none of the declared exceptions
     * actually propagates; the {@code throws} clause is kept for compatibility.
     */
    private Indexer() throws FileNotFoundException, CorruptIndexException, IOException {
        try {
            long start = System.currentTimeMillis();
            createIndexWriter();
            if (writer == null) {
                // createIndexWriter() has already reported why the writer is unavailable.
                return;
            }
            try {
                checkFileValidity();
            } finally {
                // Always release the writer, even if the directory scan failed.
                closeIndexWriter();
            }
            long end = System.currentTimeMillis();
            System.out.println("Total Document Indexed : " + TotalDocumentsIndexed());
            // currentTimeMillis() differences are milliseconds; report them as seconds.
            System.out.println("Total time (s): " + (end - start) / 1000.0);
        } catch (Exception e) {
            System.out.println("Sorry task cannot be completed: " + e);
        }
    }


    /**
     * Creates the index directory if necessary and opens {@link #writer} on it using a
     * {@code StandardAnalyzer}. On failure the problem is reported and {@link #writer}
     * stays {@code null}.
     */
    private void createIndexWriter() {
        try {
            indexDirectory = new File(indexFilePath);
            if (!indexDirectory.exists()) {
                // mkdirs() also creates missing parent directories; if creation fails,
                // opening the directory below reports the error.
                indexDirectory.mkdirs();
            }
            FSDirectory dir = FSDirectory.open(indexDirectory);
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
            writer = new IndexWriter(dir, config);
        } catch (Exception ex) {
            System.out.println("Sorry cannot get the index writer: " + ex);
        }
    }


    /**
     * Scans {@link #sourceFilePath} (non-recursively) and indexes every regular, visible,
     * readable, non-empty file whose name ends in {@code .txt}, {@code .doc} or {@code .pdf}.
     * A failure on one file is reported and does not stop the scan.
     */
    private void checkFileValidity() {
        File[] filesToIndex = new File(sourceFilePath).listFiles();
        if (filesToIndex == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println("Sorry cannot read source directory " + sourceFilePath);
            return;
        }
        for (File file : filesToIndex) {
            try {
                // isFile() already implies that the entry exists and is not a directory.
                boolean readable = file.isFile()
                        && !file.isHidden()
                        && file.canRead()
                        && file.length() > 0;
                if (!readable) {
                    continue;
                }
                if (hasExtension(file, ".txt")) {
                    // plain text needs no parsing before it is indexed
                    indexTextFiles(file);
                    System.out.println("INDEXED FILE " + file.getAbsolutePath() + " :-) ");
                } else if (hasExtension(file, ".doc") || hasExtension(file, ".pdf")) {
                    // doc and pdf files must be converted to text first
                    StartIndex(file);
                }
            } catch (Exception e) {
                System.out.println("Sorry cannot index " + file.getAbsolutePath() + ": " + e);
            }
        }
    }


    /**
     * Tells whether the file name ends with the given extension, ignoring case.
     *
     * @param file      file whose name is inspected
     * @param extension extension including the leading dot, e.g. {@code ".pdf"}
     * @return {@code true} if the name ends with {@code extension}
     */
    private static boolean hasExtension(File file, String extension) {
        String name = file.getName();
        // regionMatches returns false for a negative offset, i.e. when the name is too short.
        return name.regionMatches(true, name.length() - extension.length(),
                extension, 0, extension.length());
    }


    /**
     * Extracts the text of a {@code .doc} or {@code .pdf} file with the matching parser and
     * adds it to the index together with the file name and full path.
     *
     * <p>Errors are reported on standard output instead of being thrown; the {@code throws}
     * clause is kept for compatibility with existing callers.
     *
     * @param file the doc or pdf file to index
     */
    public void StartIndex(File file) throws FileNotFoundException, CorruptIndexException, IOException {
        try {
            String fileContent = null;
            if (hasExtension(file, ".doc")) {
                // call the doc file parser and get the content of doc file in txt format
                fileContent = new DocFileParser().DocFileContentParser(file.getAbsolutePath());
            }
            if (hasExtension(file, ".pdf")) {
                // call the pdf file parser and get the content of pdf file in txt format
                fileContent = new PdfFileParser().PdfFileParser(file.getAbsolutePath());
            }
            if (fileContent == null) {
                // Unsupported extension or the parser produced nothing: a Field cannot hold null.
                System.out.println("error in indexing" + file.getAbsolutePath()
                        + ": no text could be extracted");
                return;
            }
            Document doc = new Document();
            doc.add(new Field("content", fileContent,
                    Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.add(new Field("filename", file.getName(),
                    Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("fullpath", file.getAbsolutePath(),
                    Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
            System.out.println("Indexed" + file.getAbsolutePath());
        } catch (Exception e) {
            System.out.println("error in indexing" + (file.getAbsolutePath()) + ": " + e);
        }
    }


    /**
     * Adds a plain-text file to the index. The content is supplied to Lucene through a
     * reader; the file name and full path are stored alongside it.
     *
     * @param file the text file to index
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if the file or the index cannot be accessed
     */
    private void indexTextFiles(File file) throws CorruptIndexException, IOException {
        FileReader content = new FileReader(file);
        try {
            Document doc = new Document();
            doc.add(new Field("content", content));
            doc.add(new Field("filename", file.getName(),
                    Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("fullpath", file.getAbsolutePath(),
                    Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Make sure the file handle is released even when addDocument fails.
            content.close();
        }
    }


    /**
     * Opens the index read-only and returns {@code maxDoc()} of the reader.
     *
     * @return the value reported by the reader, or 0 if the index cannot be opened
     */
    private int TotalDocumentsIndexed() {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(FSDirectory.open(indexDirectory));
            return reader.maxDoc();
        } catch (Exception ex) {
            System.out.println("Sorry no index found");
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // nothing useful can be done if closing a read-only reader fails
                }
            }
        }
        return 0;
    }


    /**
     * Optimizes the index and closes {@link #writer}. The writer is closed even if the
     * optimization step fails; does nothing when the writer was never opened.
     */
    private void closeIndexWriter() {
        if (writer == null) {
            return;
        }
        try {
            try {
                writer.optimize();
            } finally {
                writer.close();
            }
        } catch (Exception e) {
            System.out.println("Indexer Cannot be closed: " + e);
        }
    }

    /** Entry point: runs one indexing pass over the configured source directory. */
    public static void main(String arg[]) {
        try {
            new Indexer();
        } catch (Exception ex) {
            System.out.println("Cannot Start :( " + ex);
        }
    }
}

以下代码用于搜索索引。我在这里直接用正则表达式进行搜索。但是否可以用正则表达式在所有pdf中搜索并读取发票编号?最后我需要根据发票编号拆分pdf,也就是说,我需要通过正则表达式取得发票编号的值,并据此拆分pdf。(源pdf有60页,其中既有唯一的发票编号,也有重复的发票编号。)

/**
 * Runs a single query against the "content" field of the Lucene index stored in
 * {@code G:/searchEngine} and prints the full path of every matching document.
 *
 * <p>The search string is handed to {@code QueryParser}, so it is interpreted with the
 * query-parser syntax. NOTE(review): with Lucene 3.4 the parser presumably does not treat
 * the string as a regular expression; a regex search would have to be built as a dedicated
 * query object instead — confirm against the Lucene version in use.
 */
public class Searcher {

    /**
     * Executes the search and prints the results to standard output. Failures (unreadable
     * index, unparsable query) are reported on standard output rather than thrown.
     *
     * @param searchString the query text to parse and search for in the "content" field
     */
    public Searcher(String searchString) {
        IndexSearcher searcher = null;
        try {
            searcher = new IndexSearcher(FSDirectory.open(
                    new File("G:/searchEngine")));
            Analyzer analyzer1 = new StandardAnalyzer(Version.LUCENE_34);
            QueryParser queryParser = new QueryParser(Version.LUCENE_34, "content", analyzer1);
            Query query = queryParser.parse(searchString);//to search in the content
            TopDocs hits = searcher.search(query, 10000); // upper bound on returned hits
            ScoreDoc[] document = hits.scoreDocs;
            System.out.println("Total no of hits for content: " + hits.totalHits);

            for (int i = 0; i < document.length; i++) {
                Document doc = searcher.doc(document[i].doc);
                String filePath = doc.get("fullpath");
                System.out.println(filePath);
            }
        } catch (Exception e) {
            // Report the failure instead of hiding it; the search stays best-effort.
            System.out.println("Search failed for \"" + searchString + "\": " + e);
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception ignored) {
                    // nothing useful can be done if closing the searcher fails
                }
            }
        }
    }

    /** Entry point: searches the index for the invoice-number pattern. */
    public static void main(String args[])
    {
       new Searcher("Invoice No.\\s\\w\\d\\d\\d\\d\\d\\d\\d\\d\\d\\d");
    }
}

1 个答案:

答案 0 :(得分:0)

femtoRgon提出的解决方案:

  

好吧,您似乎是在Lucene 3.4版中使用QueryParser生成查询。我记得直到4.0版本,QueryParser才加入对正则表达式的支持。要使用正则表达式进行搜索,您需要手动构建RegexQuery。