My Lucene program used to throw an OutOfMemoryError when using RAMDirectory, so I switched to FSDirectory; but now the indexing takes forever (more than 5 hours) and never finishes. I am very new to Lucene and I don't know what I am doing wrong. Please help me check my code. Thanks.
Here is my code:
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class RechercheEngine extends LireFichierCollection {

    static Path path = Paths.get("Ressource/index-directory");
    static ArrayList<String> resultat = new ArrayList<String>();
    final static String file = "Ressources/file_collection.txt";

    public static void indexerEtRechercherDocument(boolean exchange)
            throws IOException, org.apache.lucene.queryparser.classic.ParseException {
        System.out.println("reading documents...");
        LireFichierCollection readDocCollection = new LireFichierCollection();
        readDocCollection.readFile(file);

        System.out.println("Analyzing documents...");
        Analyzer analyzer = new StandardAnalyzer();
        Directory directory = FSDirectory.open(path);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter indexWriter = new IndexWriter(directory, config);

        Document doc = new Document();
        System.out.println("Indexing documents...");
        for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
            String key = entry.getKey();
            String content = entry.getValue();
            // indexing the docKey and Content of each document
            doc.add(new StringField("DocKey", key, Field.Store.YES));
            if (exchange) {
                Stemming st = new Stemming();
                doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
            } else {
                doc.add(new TextField("DocContent", content, Field.Store.NO));
            }
            indexWriter.addDocument(doc);
        }
        indexWriter.close();
        System.out.println("Indexing documents done");

        // I am checking if all documents were indexed properly using tester.txt
        LireRequete readQueries = new LireRequete();
        readQueries.readList();
        for (Map.Entry<String, String> entry : readQueries.queries.entrySet()) {
            Stemming t = new Stemming();
            Query query;
            if (exchange) {
                query = new QueryParser("DocContent", analyzer).parse(t.stemmingAvecStopWord(entry.getValue()));
            } else {
                query = new QueryParser("DocContent", analyzer).parse(entry.getValue());
            }
            System.out.println("Researching documents...");
            IndexReader reader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(reader);
            ScoreDoc[] hits = searcher.search(query, 2).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document hitDoc = searcher.doc(docId);
                System.out.println(hitDoc.get("DocKey"));
                resultat.add(entry.getKey() + " " + hitDoc.get("DocKey") + " " + hits[i].score);
            }
            reader.close();
        }
    }
}
Answer 0 (score: 1)
Did you forget to clear the document's fields?
Document doc = new Document();
System.out.println("Indexing documents...");
for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
    doc.clear(); // did you forget this line?
    String key = entry.getKey();
    String content = entry.getValue();
    // indexing the docKey and Content of each document
    doc.add(new StringField("DocKey", key, Field.Store.YES));
    if (exchange) {
        Stemming st = new Stemming();
        doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
    } else {
        doc.add(new TextField("DocContent", content, Field.Store.NO));
    }
    indexWriter.addDocument(doc);
}
indexWriter.close();
I added doc.clear(); so the document is cleared on every pass. Without that line, the same Document keeps accumulating fields, and each document you add to the index carries all the fields of the previous ones, growing bigger and bigger. That would explain both the OutOfMemoryError you saw with RAMDirectory and the never-ending indexing with FSDirectory.
Hope this fixes it.
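The reason is that Document.add appends fields rather than replacing them, so reusing one Document without clearing it stacks up values. A tiny hypothetical illustration (not from the original code):

    Document doc = new Document();
    doc.add(new StringField("DocKey", "doc1", Field.Store.YES));
    doc.add(new StringField("DocKey", "doc2", Field.Store.YES));
    // Both values are still present; fields accumulate across add() calls.
    System.out.println(doc.getValues("DocKey").length); // prints 2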
PS: Alternatively, create a new Document inside the loop:
System.out.println("Indexing documents...");
for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
    Document doc = new Document();
    String key = entry.getKey();
    String content = entry.getValue();
    // indexing the docKey and Content of each document
    doc.add(new StringField("DocKey", key, Field.Store.YES));
    if (exchange) {
        Stemming st = new Stemming();
        doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
    } else {
        doc.add(new TextField("DocContent", content, Field.Store.NO));
    }
    indexWriter.addDocument(doc);
}
indexWriter.close();
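If you want to confirm the fix from the index itself, one option (a sketch, not part of the original answer; it assumes the directory variable is still in scope after indexWriter.close()) is to inspect the stored fields of the last document:

    // Sketch: with the bug, the last document carries one stored DocKey per entry;
    // after the fix it should carry exactly one.
    try (IndexReader reader = DirectoryReader.open(directory)) {
        Document last = reader.document(reader.maxDoc() - 1);
        System.out.println("DocKey values in last doc: " + last.getValues("DocKey").length);
    }

Creating a new Document per iteration (the PS variant) is the more common idiom; doc.clear() simply avoids one small allocation per document.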