Question

我使用 Lucene 进行 N-Gram 匹配。我为一个字段设置了 N-Gram 分析器（Analyzer）来对其进行分析。我想查看分析后得到的词元（token），以确认 n-gram 是否被正确计算。

如果我在文档的分析字段上调用方法Fieldable.tokenStreamValue()，我会得到null，而调用Fieldable.isTokenized()则返回true。

我必须补充一点，查询结果与正确生成的n-gram一致。

对此有何解释？我基本上试图做这里提到的： How can I read a Lucene document field tokens after they are analyzed?

以下是完整代码：

public class TestLuceneNgram {

public static class NGramQuery extends BooleanQuery {

    public NGramQuery(final String queryTerm) throws IOException {

        StringReader strReader = new StringReader(queryTerm);
        TokenStream tokens = new NGramTokenizer(strReader);

        CharTermAttribute termAtt = (CharTermAttribute) tokens
                .addAttribute(CharTermAttribute.class);

        while (tokens.incrementToken()) {
            System.out.println(termAtt);
            Term t = new Term("NGRAM_FIELD", termAtt.toString());
            add(new TermQuery(t), BooleanClause.Occur.SHOULD);

        }

    }
}

public static class NGramSearcher extends IndexSearcher {

    public NGramSearcher(final Directory directory)
            throws CorruptIndexException, IOException {
        super(IndexReader.open(directory));
    }

    public TopDocs search(final String term) {
        try {
            return search(new NGramQuery(term), 10);
        } catch (IOException e) {
            e.printStackTrace();
        }

        return null;
    }
}

public static class SubWordAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(final String fieldName,
            final Reader reader) {
        return new NGramTokenizer(reader);
    }

}

public static Directory index(final String[] terms) {

    Directory indexDirectory = new RAMDirectory();

    IndexWriter indexWriter = null;
    try {
        indexWriter = new IndexWriter(indexDirectory,
                new IndexWriterConfig(Version.LUCENE_32,
                        new SubWordAnalyzer()));
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    for (int i = 0; i < terms.length; ++i) {
        Document doc = new Document();
        doc.add(new Field("NGRAM_FIELD", terms[i], Field.Store.YES,
                Field.Index.ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(new Field("ORIGINAL_FIELD", terms[i], Field.Store.YES,
                Field.Index.NOT_ANALYZED));

        try {
            indexWriter.addDocument(doc);
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    try {
        indexWriter.optimize();
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    try {
        indexWriter.close();
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    return indexDirectory;
}

/**
 * @param args
 */
public static void main(final String[] args) {

    String[] terms = new String[] { "the first string", "the second one" };

    Directory dir = index(terms);

    NGramSearcher ngs = null;
    try {
        ngs = new NGramSearcher(dir);
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    TopDocs td = ngs.search("second");
    System.out.println(td.totalHits);

    for (ScoreDoc sd : td.scoreDocs) {
        System.out.println(sd.doc + "---" + sd.score);
        try {
            System.out.println(ngs.doc(sd.doc).getFieldable("NGRAM_FIELD").
            tokenStreamValue());

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

}

Answer 1

首先要检查的是，索引时是否真的存储（store）了该字段。如果只是对它建立了索引而没有存储，那么出现这种结果是正常的。

Fieldable.tokenStreamValue() 对已分词（tokenized）的字段返回 null

1 个答案: