在包含 1 亿个字符串的大文件上使用 Lucene 进行通配符(wildcard)搜索耗时过长,我希望在 1–2 秒内得到结果

时间:2013-05-11 15:20:37

标签: java lucene indexing large-files

我的文件大小为 1.43 GB,其中包含 1 亿个字符串(长度为 3–80 个字符),每行一个。我正在使用 Lucene 对该文件进行通配符(wildcard)搜索,目前为每个字符串创建一个文档。我想得到与搜索关键字(searchkeyword)匹配的结果总数。以下是我的代码:

LuceneDemo.java

公共课LuceneDemo {

//a path to directory where Lucene will store index files
private static String indexDirectory = "C:\\indextofile";
// a path to directory which contains data files that need to be indexed
private static String dataDirectory = "C:\\indexofilef";
public static int count = 0;

private Searcher indexSearcher;

public static void main(String[] args) throws FileNotFoundException, IOException {  
    LuceneDemo luceneDemo = new LuceneDemo();   
    //create Lucene index
    luceneDemo.createLuceneIndex();
    //create IndexSearcher
    luceneDemo.createIndexSearcher();
    luceneDemo.termQueryExample();


}

private void createLuceneIndex(){
    Indexer indexer = new Indexer(indexDirectory,dataDirectory);
    //Create IndexWriter
    indexer.createIndexWriter();
    try {
        //Index data
        indexer.indexData();
    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}




private void createIndexSearcher() throws CorruptIndexException, IOException{
    /* Create instance of IndexSearcher 
     */
    indexSearcher = new IndexSearcher(indexDirectory);      
}

private void termQueryExample() throws CorruptIndexException, IOException{
    try
    {
    Directory directory = FSDirectory.getDirectory(indexDirectory);
    //IndexSearcher indexSearcher = new IndexSearcher(directory);
    BooleanQuery.setMaxClauseCount(102400000); 
    Term term = new Term("reversecontent", "bubble*com");
    Query query = new WildcardQuery(term);
    Hits hits = indexSearcher.search(query);    

    System.out.println("######## Hits :"+hits.length());
    }
    catch (Exception e) {
        e.printStackTrace();
    }
}   

}

Indexer.java

公共类索引器{     private IndexWriter indexWriter;

/*Location of directory where index files are stored */
private String indexDirectory ;

/*Location of data directory */
private String dataDirectory ;
public String  FIELD_CONTENTS = "contents";
public Indexer(String indexDirectory, String dataDirectory){
    this.indexDirectory = indexDirectory ;
    this.dataDirectory = dataDirectory ;
}


void createIndexWriter(){
    if(indexWriter == null){
        try{
            //Create instance of Directory where index files will be stored
            Directory fsDirectory =  FSDirectory.getDirectory(indexDirectory);
            /* Create instance of analyzer, which will be used to tokenize
            the input data */
            Analyzer standardAnalyzer = new KeywordAnalyzer();
            //Create a new index
            boolean create = true;
            //Create the instance of deletion policy
            IndexDeletionPolicy deletionPolicy = 
                                    new KeepOnlyLastCommitDeletionPolicy(); 
            indexWriter =
                 new IndexWriter(fsDirectory,standardAnalyzer,create,
                         deletionPolicy,IndexWriter.MaxFieldLength.UNLIMITED);
        }catch(IOException ie){
            System.out.println("Error in creating IndexWriter");
            throw new RuntimeException(ie);
        }
    }
}


void indexData() throws FileNotFoundException, IOException{

    File[] files = getFilesToBeIndxed();
    for(File file:files){

         FileReader fr = new FileReader(file);  
            // To store the contents read via File Reader  
            BufferedReader br = new BufferedReader(fr);                                                   
            // Read br and store a line in 'data', print data  
            String data;  
           System.out.println("start");
            while((data = br.readLine()) != null)   
            {  
                String newdata = data+".com";
                Document doc = new Document();
                //doc.add(new Field("content", newdata,
                //      Store.NO, Index.NOT_ANALYZED));
                doc.add(new Field("reversecontent", new StringBuffer(newdata).reverse().toString(),
                    Store.NO, Index.NOT_ANALYZED));
                indexWriter.addDocument(doc);
            } 
         System.out.println("end");
        // Add these fields to a Lucene Document

        //Step 3: Add this document to Lucene Index.
    }
    /* Requests an "optimize" operation on an index, priming the
    index for the fastest available search */
    indexWriter.optimize();
    System.out.println("optimization done");
    /*
     * Commits all changes to the index and closes all associated files. 
     */
    indexWriter.close();
}

private File[] getFilesToBeIndxed(){
    File dataDir  = new File(dataDirectory);
    if(!dataDir.exists()){
        throw new RuntimeException(dataDirectory+" does not exist");
    }
    File[] files = dataDir.listFiles();
    return files;
}

}

0 个答案:

没有答案