使用IntField的Lucene搜索查询在文档更新后无法正常工作

时间:2013-06-14 13:14:41

标签: java lucene

我正在尝试根据文档的id和一个IntField的值,在Lucene中对一组共两个文档运行一个简单的查询。文档添加之后,查询立即能正确返回这两个文档。现在,我取出检索到的文档,修改其CONTEXT_FIELD(查询中并未用到该字段),然后更新索引中的该文档。

有趣的是,现在搜索不再返回任何结果,既不返回旧文档,也不返回新文档。如果查询中只使用METHOD_NAME字段,一切都按预期工作,所以问题似乎出在NUMBER_OF_ARGUMENTS这个IntField上。

为什么会这样?

示例代码:

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class LuceneDemo {

private static final String ID1 = "Great#text";
private static final String ID2 = "Another#bonus";

    private static final String METHOD_NAME_FIELD = "method_name";
    private static final String NUMBER_OF_ARGUMENTS = "number_of_arguments";
    private static final String CONTEXT_FIELD = "context";

    /** Parser used to parse queries */
    private static QueryParser parser = new QueryParser(Version.LUCENE_43,
            METHOD_NAME_FIELD, createDefaultAnalyzer());

    public static void main(String[] args) throws IOException, ParseException {
        IndexWriter luceneIndexWriter = new IndexWriter(
                FSDirectory.open(new File("/tmp/test")), createWriterConfig(64));
        Document doc1 = createDocument(ID1, "context1", 1);
        luceneIndexWriter.addDocument(doc1);
        Document doc2 = createDocument(ID2, "context2", 2);
        luceneIndexWriter.addDocument(doc2);

        System.out.println("Found doc1: "
                + findDocument(ID1, 1, luceneIndexWriter));
        System.out.println("Found doc2: "
                + findDocument(ID2, 2, luceneIndexWriter));
        doc1 = findDocument(ID1, 1, luceneIndexWriter);

        // Section 1
        doc1.removeField(CONTEXT_FIELD);
        doc1.add(new TextField(CONTEXT_FIELD, "context1_changed",
                Field.Store.YES));
        luceneIndexWriter.updateDocument(new Term(METHOD_NAME_FIELD, "text"),
                doc1);

        System.out.println("Found doc1: "
                + findDocument(ID1, 1, luceneIndexWriter));
        System.out.println("Found doc2: "
                + findDocument(ID2, 2, luceneIndexWriter));

        // Section 2
        // doc1 = findDocument(ID1, 1, luceneIndexWriter); <- null
        doc1.removeField(CONTEXT_FIELD);
        doc1.add(new TextField(CONTEXT_FIELD, "context1_changed2",
                Field.Store.YES));
        luceneIndexWriter.updateDocument(new Term(METHOD_NAME_FIELD, "text"),
                doc1);

        System.out.println("Found doc1: "
                + findDocument(ID1, 1, luceneIndexWriter));
        System.out.println("Found doc2: "
                + findDocument(ID2, 2, luceneIndexWriter));

        luceneIndexWriter.close();
    }

    private static Document createDocument(String id, String context, int value) {
        Document doc = new Document();
        doc.add(new TextField(METHOD_NAME_FIELD, id, Field.Store.YES));
        doc.add(new TextField(CONTEXT_FIELD, context, Field.Store.YES));
        doc.add(new IntField(NUMBER_OF_ARGUMENTS, value, Field.Store.YES));
        return doc;
    }

    private static Document findDocument(String id, int value,
            IndexWriter luceneIndexWriter) throws IOException, ParseException {
        DirectoryReader reader = DirectoryReader.open(luceneIndexWriter, true);
        IndexSearcher searcher = new IndexSearcher(reader);
        String[] split = id.split("#");
        Query methodQuery = parser.parse(split[1]);
        Query classQuery = parser.parse(split[0]);
        NumericRangeQuery<Integer> range = NumericRangeQuery.newIntRange(
                NUMBER_OF_ARGUMENTS, 1, value, value, true, true);
        BooleanQuery query = new BooleanQuery();
        query.add(methodQuery, Occur.MUST);
        query.add(classQuery, Occur.MUST);
        query.add(range, Occur.MUST);
        TopDocs result = searcher.search(query, 1);
        if (result.totalHits == 0) {
            System.err.println("Problem, nothing found (Method: " + id + ")");
            return null;
        }
        Document document = searcher.doc(result.scoreDocs[0].doc);
        if (document.get(METHOD_NAME_FIELD).equals(id)) {
            return document;
        }
        return null;
    }

    /** create the analyzer used */
    private static Analyzer createDefaultAnalyzer() {
        Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
        analyzerPerField.put(NUMBER_OF_ARGUMENTS, new KeywordAnalyzer());
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
                new SimpleAnalyzer(Version.LUCENE_43), analyzerPerField);
        return analyzer;
    }

    /** Creates the configuration used for writing. */
    public static IndexWriterConfig createWriterConfig(double ramBufferSizeMB) {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43,
                createDefaultAnalyzer());
        config.setRAMBufferSizeMB(ramBufferSizeMB);
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        config.setMaxBufferedDeleteTerms(1); // desperate try at
        config.setMaxBufferedDocs(2); // storing everything correctly right away
        // config.setInfoStream(System.out); <- set this for more output
        return config;
    }
}

输出:

Found doc1: Document<stored,indexed,tokenized<method_name:Great#text> stored,indexed,tokenized<context:context1> stored<number_of_arguments:1>>
Found doc2: Document<stored,indexed,tokenized<method_name:Another#bonus> stored,indexed,tokenized<context:context2> stored<number_of_arguments:2>>
Problem, nothing found (Method: Great#text)
Found doc1: null
Found doc2: Document<stored,indexed,tokenized<method_name:Another#bonus> stored,indexed,tokenized<context:context2> stored<number_of_arguments:2>>
Problem, nothing found (Method: Great#text)
Found doc1: null
Found doc2: Document<stored,indexed,tokenized<method_name:Another#bonus> stored,indexed,tokenized<context:context2> stored<number_of_arguments:2>>

启用config.setInfoStream(System.out)后的输出:

http://bpaste.net/show/ko8kkxeFxZFE26NuecZc/ (内容太长,无法直接贴在这里,抱歉)

1 个答案:

答案 0 :(得分:4)

问题在于:您把NUMBER_OF_ARGUMENTS字段作为IntField编入索引,但从索引中取回的文档里,该字段变成了StoredField。重新索引该文档后,这个字段不再按IntField的方式索引,因此NumericRangeQuery得不到任何结果。您只需把NUMBER_OF_ARGUMENTS上的查询条件改为Occur.SHOULD子句,就能确认问题出在该字段上。

一种解决方案是手动将该字段重新添加到文档中,例如:

/**
 * Corrected demo: before a document loaded from the index is written back,
 * its numeric field is rebuilt as an {@link IntField}, so the numeric range
 * lookup keeps matching after each update.
 *
 * @param args unused
 * @throws IOException if the index cannot be opened, written or read
 * @throws ParseException if an id part cannot be parsed as a query
 */
public static void main(String[] args) throws IOException, ParseException {
    IndexWriter luceneIndexWriter = new IndexWriter(
            FSDirectory.open(new File("/tmp/test")), createWriterConfig(64));
    // try/finally so the writer is closed even when a step below throws.
    try {
        Document doc1 = createDocument(ID1, "context1", 1);
        luceneIndexWriter.addDocument(doc1);
        Document doc2 = createDocument(ID2, "context2", 2);
        luceneIndexWriter.addDocument(doc2);

        System.out.println("Found doc1: "
                + findDocument(ID1, 1, luceneIndexWriter));
        System.out.println("Found doc2: "
                + findDocument(ID2, 2, luceneIndexWriter));
        doc1 = findDocument(ID1, 1, luceneIndexWriter);

        // Section 1
        doc1.removeField(CONTEXT_FIELD);
        doc1.add(new TextField(CONTEXT_FIELD, "context1_changed",
                Field.Store.YES));
        reAddIntField(doc1);
        luceneIndexWriter.updateDocument(new Term(METHOD_NAME_FIELD, "text"),
                doc1);

        System.out.println("Found doc1: "
                + findDocument(ID1, 1, luceneIndexWriter));
        System.out.println("Found doc2: "
                + findDocument(ID2, 2, luceneIndexWriter));

        // Section 2
        doc1 = findDocument(ID1, 1, luceneIndexWriter);
        doc1.removeField(CONTEXT_FIELD);
        doc1.add(new TextField(CONTEXT_FIELD, "context1_changed2",
                Field.Store.YES));
        // Rebuild the numeric field first and write the document once; an
        // update issued before this step would index the stored-only field.
        reAddIntField(doc1);
        luceneIndexWriter.updateDocument(new Term(METHOD_NAME_FIELD, "text"),
                doc1);

        System.out.println("Found doc1: "
                + findDocument(ID1, 1, luceneIndexWriter));
        System.out.println("Found doc2: "
                + findDocument(ID2, 2, luceneIndexWriter));
    } finally {
        luceneIndexWriter.close();
    }
}

/**
 * Replaces the NUMBER_OF_ARGUMENTS field of a document loaded from the index
 * with a fresh {@link IntField} holding the same value.
 *
 * @param doc document whose numeric field is rebuilt in place
 */
private static void reAddIntField(Document doc) {
    Number num = doc.getField(NUMBER_OF_ARGUMENTS).numericValue();
    doc.removeField(NUMBER_OF_ARGUMENTS);
    doc.add(new IntField(NUMBER_OF_ARGUMENTS, num.intValue(),
            Field.Store.YES));
}

更安全的方法是构建新的替换文档,而不是尝试修改和保留从索引中提取的文档。从索引中检索的文档的存储版本当然可能缺少关于如何索引字段的大量信息。


顺便说一句,在编写构建小型索引的测试函数时,我会使用:

// CREATE starts each run from an empty index instead of appending to an existing one.
config.setOpenMode(OpenMode.CREATE);

而不是CREATE_OR_APPEND。这允许您从空索引开始,因此结果更容易预测,并且您可以在每次重新构建时查看索引的内容,以便进行调试,例如:

/**
 * Debugging aid: prints the stored fields of every document slot visible to
 * a reader opened from the writer, followed by the number of deleted
 * documents.
 *
 * @param writer writer from which the reader is opened
 * @throws IOException if the index cannot be read
 */
public static void outputTheWholeThing(IndexWriter writer) throws IOException {
    DirectoryReader reader = DirectoryReader.open(writer, true);
    // Close the reader when done so repeated debug dumps do not leak readers.
    try {
        // Iterates up to maxDoc(), not numDocs().
        // NOTE(review): this presumably also visits slots of deleted documents - confirm.
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document doc = reader.document(i);
            System.out.println(doc);
        }
        System.out.println("Pending deletions:" + reader.numDeletedDocs());
    } finally {
        reader.close();
    }
}