Lucene搜索:索引需要永远

时间:2015-11-18 15:28:04

标签: java indexing lucene

我在Lucene的程序曾经在使用RAMDirectory时给出了outofmemory错误,所以我切换到FSDirectory;但是这次索引需要永远(超过5小时)并且不会结束。我对Lucene很新,请我做错了。请帮我检查一下我的代码。 谢谢

这是我的代码:

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class RechercheEngine extends LireFichierCollection {
    static Path path = Paths.get("Ressource/index-directory");
    static ArrayList<String> resultat = new ArrayList<String>();
    final static String file = "Ressources/file_collection.txt";

    public static void indexerEtRechercherDocument(boolean exchange)
            throws IOException, org.apache.lucene.queryparser.classic.ParseException {

        System.out.println("reading documents...");
        LireFichierCollection readDocCollection = new LireFichierCollection();
        readDocCollection.readFile(file);
        System.out.println("Analyzing documents...");
        Analyzer analyzer = new StandardAnalyzer();
        Directory directory = FSDirectory.open(path);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter indexWriter = new IndexWriter(directory, config);

        Document doc = new Document();

        System.out.println("Indexing documents...");
        for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
            String key = entry.getKey();
            String content = entry.getValue();
            // indexing the docKey and Content of each document
            doc.add(new StringField("DocKey", key, Field.Store.YES));
            if (exchange) {
                Stemming st = new Stemming();
                doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
            } else {
                doc.add(new TextField("DocContent", content, Field.Store.NO));
            }
            indexWriter.addDocument(doc);
        }
        indexWriter.close();
        System.out.println("Indexing documents done");
        // I am checking if all documents were indexed properly using tester.txt

        LireRequete readQueries = new LireRequete();
        readQueries.readList();
        for (Map.Entry<String, String> entry : readQueries.queries.entrySet()) {
            Stemming t = new Stemming();
            Query query;
            if (exchange) {
                query = new QueryParser("DocContent", analyzer).parse(t.stemmingAvecStopWord(entry.getValue()));
            } else {
                query = new QueryParser("DocContent", analyzer).parse(entry.getValue());
            }
            System.out.println("Researching documents...");

            IndexReader reader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(reader);

            ScoreDoc[] hits = searcher.search(query, 2).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document hitDoc = searcher.doc(docId);
                System.out.println(hitDoc.get("DocKey"));
                resultat.add(entry.getKey() + " " + hitDoc.get("DocKey") + " " + hits[i].score);
            }
            reader.close();
        }
    }

}

1 个答案:

答案 0 :(得分:1)

您是否忘记清除文档字段?

Document doc = new Document();
System.out.println("Indexing documents...");
for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
    doc.clear(); //did you forget this line?
    String key = entry.getKey();
    String content = entry.getValue();
    // indexing the docKey and Content of each document
    doc.add(new StringField("DocKey", key, Field.Store.YES));
    if (exchange) {
        Stemming st = new Stemming();
        doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
    } else {
        doc.add(new TextField("DocContent", content, Field.Store.NO));
    }
    indexWriter.addDocument(doc);
}
indexWriter.close();

我添加了doc.clear();所以每次都清除它。现在你的文档应该越来越大,没有这条线。

希望这个修复。

PS: 或者在循环中创建新文档:

System.out.println("Indexing documents...");
for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
    Document doc = new Document();
    String key = entry.getKey();
    String content = entry.getValue();
    // indexing the docKey and Content of each document
    doc.add(new StringField("DocKey", key, Field.Store.YES));
    if (exchange) {
        Stemming st = new Stemming();
        doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
    } else {
        doc.add(new TextField("DocContent", content, Field.Store.NO));
    }
    indexWriter.addDocument(doc);
}
indexWriter.close();