Finding the positions of search hits with Lucene

Date: 2009-08-21 10:36:41

Tags: java search lucene

What is the recommended way to find the positions of matches in search results with Lucene?

More specifically, suppose the indexed documents have a field "fullText" that stores the plain-text content of each document. Further, suppose that for one of those documents the content is "the quick brown fox jumped over the lazy dog". A search is then performed for "fox dog". Obviously, that document would be a hit.

In that case, can Lucene be used to provide something like the matching regions within the found document? For this scenario I would like to produce something like:

[{match: "fox", startIndex: 10, length: 3},
 {match: "dog", startIndex: 34, length: 3}]

I suspect this could be implemented with what is provided in the org.apache.lucene.search.highlight package, but I'm not sure about the overall approach...
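
For illustration, here is a minimal sketch of the kind of thing I imagine, assuming a Lucene 2.9/3.x-era API, the "fullText" field from above, and a StandardAnalyzer (the class name is just a placeholder). It produces a marked-up fragment rather than the raw offsets I am after:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;

public class HighlightSketch {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        Query query = new QueryParser(Version.LUCENE_CURRENT, "fullText", analyzer)
                .parse("fox dog");

        String text = "the quick brown fox jumped over the lazy dog";
        Highlighter highlighter =
                new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));

        // Prints a marked-up best fragment, e.g.:
        // the quick brown <b>fox</b> jumped over the lazy <b>dog</b>
        System.out.println(highlighter.getBestFragment(analyzer, "fullText", text));
    }
}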

2 answers:

Answer 0 (score: 10)

What I used is a TermFreqVector. Here is a working demo that prints the term positions as well as the start and end offsets of each term:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Search {
    public static void main(String[] args) throws IOException, ParseException {
        Search s = new Search();  
        s.doSearch(args[0], args[1]);  
    }  

    Search() {
    }  

    public void doSearch(String db, String querystr) throws IOException, ParseException {
        // 1. Specify the analyzer for tokenizing text.  
        //    The same analyzer should be used as was used for indexing  
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);  

        Directory index = FSDirectory.open(new File(db));  

        // 2. query  
        Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(querystr);  

        // 3. search  
        int hitsPerPage = 10;  
        IndexSearcher searcher = new IndexSearcher(index, true);  
        IndexReader reader = IndexReader.open(index, true);  
        searcher.setDefaultFieldSortScoring(true, false);  
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);  
        searcher.search(q, collector);  
        ScoreDoc[] hits = collector.topDocs().scoreDocs;  

        // 4. display term positions, and term indexes   
        System.out.println("Found " + hits.length + " hits.");  
        for(int i=0;i<hits.length;++i) {  

            int docId = hits[i].doc;  
            TermFreqVector tfvector = reader.getTermFreqVector(docId, "contents");  
            TermPositionVector tpvector = (TermPositionVector)tfvector;  
            // this part works only if there is one term in the query string,  
            // otherwise you will have to iterate this section over the query terms.  
            int termidx = tfvector.indexOf(querystr);  
            int[] termposx = tpvector.getTermPositions(termidx);  
            TermVectorOffsetInfo[] tvoffsetinfo = tpvector.getOffsets(termidx);  

            for (int j=0;j<termposx.length;j++) {  
                System.out.println("termpos : "+termposx[j]);  
            }  
            for (int j=0;j<tvoffsetinfo.length;j++) {  
                int offsetStart = tvoffsetinfo[j].getStartOffset();  
                int offsetEnd = tvoffsetinfo[j].getEndOffset();  
                System.out.println("offsets : "+offsetStart+" "+offsetEnd);  
            }  

            // print some info about where the hit was found...  
            Document d = searcher.doc(docId);  
            System.out.println((i + 1) + ". " + d.get("path"));  
        }  

        // searcher can only be closed when there  
        // is no need to access the documents any more.   
        searcher.close();  
    }      
}
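
Note that this only returns positions and offsets if the "contents" field was indexed with term vectors enabled. Below is a minimal sketch of the indexing side under that assumption, using the same Lucene 2.9/3.x-era API (the BuildIndex class and sample values are made up). With such an index, the demo can then be run as java Search /path/to/index fox:

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class BuildIndex {
    public static void main(String[] args) throws Exception {
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(args[0])),
                new StandardAnalyzer(Version.LUCENE_CURRENT),
                IndexWriter.MaxFieldLength.UNLIMITED);

        Document doc = new Document();
        // Term vectors with positions *and* offsets are required for
        // TermPositionVector.getOffsets() to return anything at search time.
        doc.add(new Field("contents", "the quick brown fox jumped over the lazy dog",
                Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(new Field("path", "doc1.txt", Field.Store.YES, Field.Index.NOT_ANALYZED));

        writer.addDocument(doc);
        writer.close();
    }
}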

Answer 1 (score: 3)

Here is a solution for Lucene 5.2.1. It only works for single-word queries, but it should demonstrate the basic principle.

The basic idea is:

  1. Get a TokenStream for every document that matches your query.
  2. Create a QueryScorer and initialize it with the retrieved tokenStream.
  3. "Loop" over each token of the stream (done with tokenStream.incrementToken()) and check whether the token matches the search criteria (done with queryScorer.getTokenScore()).

Here is the code:

    import java.io.IOException;
    import java.util.List;
    import java.util.Vector;
    
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.TokenSources;
    
    public class OffsetSearcher {
    
        private IndexReader reader;
    
        public OffsetSearcher(IndexWriter indexWriter) throws IOException { 
            reader = DirectoryReader.open(indexWriter, true); 
        }
    
        public OffsetData[] getTermOffsets(Query query) throws IOException, InvalidTokenOffsetsException 
        {
            List<OffsetData> result = new Vector<>();
    
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs topDocs = searcher.search(query, 1000);
    
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;   
    
            Document doc;
            TokenStream tokenStream;
            CharTermAttribute termAtt;
            OffsetAttribute offsetAtt;
            QueryScorer queryScorer;
            OffsetData offsetData;
            String txt, tokenText;
            for (int i = 0; i < scoreDocs.length; i++) 
            {
                int docId = scoreDocs[i].doc;
                doc = reader.document(docId);
    
                txt = doc.get(RunSearch.CONTENT);
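                // Rebuild the token stream for this document, preferring stored
                // term vectors and falling back to re-analyzing the stored text.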
                tokenStream = TokenSources.getTokenStream(RunSearch.CONTENT, reader.getTermVectors(docId), txt, new GermanAnalyzer(), -1);
    
                termAtt = (CharTermAttribute)tokenStream.addAttribute(CharTermAttribute.class);
                offsetAtt = (OffsetAttribute)tokenStream.addAttribute(OffsetAttribute.class);
    
                queryScorer = new QueryScorer(query);
                queryScorer.setMaxDocCharsToAnalyze(RunSearch.MAX_DOC_CHARS);
                TokenStream newStream  = queryScorer.init(tokenStream);
                if (newStream != null) {
                    tokenStream = newStream;
                }
                queryScorer.startFragment(null);
    
                tokenStream.reset();
    
                int startOffset, endOffset;
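                // Walk the tokens; queryScorer.getTokenScore() is > 0 for tokens that
                // match the query, and the OffsetAttribute carries their character offsets.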
                for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < RunSearch.MAX_DOC_CHARS); next = tokenStream.incrementToken())
                {
                    startOffset = offsetAtt.startOffset();
                    endOffset = offsetAtt.endOffset();
    
                    if ((endOffset > txt.length()) || (startOffset > txt.length()))
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.toString() + " exceeds length of provided text sized " + txt.length());
                    }
    
                    float res = queryScorer.getTokenScore();
                    if (res > 0.0F && startOffset <= endOffset) {
                        tokenText = txt.substring(startOffset, endOffset);
                        offsetData = new OffsetData(tokenText, startOffset, endOffset, docId);
                        result.add(offsetData);
                    }           
                }   
            }
    
            return result.toArray(new OffsetData[result.size()]);
        }
    
    
        public void close() throws IOException {
            reader.close();
        }
    
    
        public static class OffsetData {
    
            public String phrase;
            public int startOffset;
            public int endOffset;
            public int docId;
    
            public OffsetData(String phrase, int startOffset, int endOffset, int docId) {
                super();
                this.phrase = phrase;
                this.startOffset = startOffset;
                this.endOffset = endOffset;
                this.docId = docId;
            }
    
        }
    
    }
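
For completeness, here is one possible way to drive the class above (still Lucene 5.2.1). It assumes the IndexWriter used to build the index is still open and that RunSearch.CONTENT names the analyzed field, as in the code above; the wrapper class is made up:

    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.queryparser.classic.QueryParser;
    
    public class RunOffsetSearch {
    
        // indexWriter: the writer the index was built with (see OffsetSearcher's constructor).
        public static void printOffsets(IndexWriter indexWriter, String queryText) throws Exception {
            OffsetSearcher searcher = new OffsetSearcher(indexWriter);
            QueryParser parser = new QueryParser(RunSearch.CONTENT, new GermanAnalyzer());
    
            for (OffsetSearcher.OffsetData d : searcher.getTermOffsets(parser.parse(queryText))) {
                System.out.println(d.phrase + " [" + d.startOffset + ", " + d.endOffset
                        + ") in doc " + d.docId);
            }
            searcher.close();
        }
    }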