由Shingle过滤器生成的标记不包含在查询中 - Lucene

时间:2017-07-24 09:58:52

标签: java lucene

/**
 * Analyzer that tokenizes with {@code ClassicTokenizer} and then runs the tokens through
 * {@code ShingleFilter(2, 3)}, {@code ClassicFilter} and {@code LowerCaseFilter}, in that order.
 */
public class CustomAnalyzer extends Analyzer {
    /** Default value applied to the tokenizer's maximum token length. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * Builds the tokenizer/filter chain for a field.
     *
     * @param fieldName name of the field being analyzed (not consulted by this analyzer)
     * @param reader    source of the text to tokenize
     * @return components wrapping the tokenizer and the end of the filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        final ClassicTokenizer tokenizer = new ClassicTokenizer(getVersion(), reader);
        tokenizer.setMaxTokenLength(maxTokenLength);

        // Chain order: shingles (sizes 2..3) -> classic clean-up -> lower-casing.
        final TokenStream shingled = new ShingleFilter(tokenizer, 2, 3);
        final TokenStream cleaned = new ClassicFilter(shingled);
        final TokenStream chain = new LowerCaseFilter(cleaned);
        // Synonym expansion is currently disabled:
        // chain = new SynonymFilter(chain, SynonymDictionary.getSynonymMap(), true);

        return new TokenStreamComponents(tokenizer, chain) {
            @Override
            protected void setReader(final Reader input) throws IOException {
                // Re-apply the configured limit each time the components get a new reader.
                tokenizer.setMaxTokenLength(maxTokenLength);
                super.setReader(input);
            }
        };
    }
}


/**
 * Demo driver: prints the tokens an analyzer emits for the text "cup board", then prints the
 * query that {@code QueryParser} builds from the same text using {@code CustomAnalyzer}.
 */
public class Test {
    /**
     * Opens the index at {@code /home/local/test}, dumps the analyzed tokens and the parsed
     * query to standard output, and releases every opened resource before returning.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened, analysis fails, or the query cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes the reader first, then the directory.
        try (Directory dir = new NIOFSDirectory(new File("/home/local/test"));
             IndexReader indexReader = DirectoryReader.open(dir)) {
            // The searcher is not used further in this snippet; it is kept from the original program.
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            CustomAnalyzer analyzer1 = new CustomAnalyzer();

            // NOTE(review): tokens are printed from CustomSynonymAnalyzer while the parser below
            // uses CustomAnalyzer -- confirm that comparing two different analyzers is intended.
            try (TokenStream ts =
                     new CustomSynonymAnalyzer().tokenStream("n", new StringReader("cup board"))) {
                // Fetch the attribute once; its contents are updated by each incrementToken() call.
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                System.out.println("Tokens are :");
                while (ts.incrementToken()) {
                    System.out.print(term + ", ");
                }
                // Complete the consumer workflow: end() after the last token, close() via try.
                ts.end();
            }

            QueryParser parser = new QueryParser("n", analyzer1);
            Query query = parser.parse("cup board");
            System.out.println("\nQuery is");
            System.out.println(query.toString());
        }
    }
}

我使用的是 Lucene 4.10.4。上面代码的输出是:

Tokens are :
cup, cup board, board 
Query is
n:cup n:board

我期望得到的查询是 n:cup n:board n:cup board,但由 ShingleFilter 生成的标记并没有出现在查询中,我只得到了 n:cup n:board。我哪里做错了?

1 个答案:

答案 0 :(得分:0)

这些标记并不是被分析器拆分的,而是被 QueryParser 的语法拆分的。由于查询子句以空格分隔,"cup" 和 "board" 是两个独立的查询子句,而不仅仅是两个独立的词项。每个子句会被单独送入分析器,因此 ShingleFilter 每次只能看到一个词,无法生成 "cup board" 这样的组合标记。

可以尝试使用短语查询来观察差异:parser.parse("\"cup board\""); 这样引号内的整段文本会作为一个整体交给分析器处理。