Print a Lucene index in inverted-index format

Date: 2017-05-21 08:50:44

Tags: lucene inverted-index

As I understand it, Lucene uses an inverted index. Is there a way to extract/print a Lucene index (Lucene 6) in inverted-index format, i.e.:

term1   <doc1, doc100, ..., doc555>
term2   <doc1, ..., doc100, ..., doc89>
term3   <doc3, doc2, doc5, ...>
.
.
.
termn   <doc10, doc43, ..., dock>

3 Answers:

Answer 0 (score: 1)

I'm using Lucene 6.x.x and I don't know of any straightforward way, but a solution is better than no solution. Something like the following worked for me, using MatchAllDocsQuery:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

private static void printWholeIndex(IndexSearcher searcher) throws IOException {
    // Match every document, then rebuild term -> doc ids from per-document term vectors
    MatchAllDocsQuery query = new MatchAllDocsQuery();
    TopDocs hits = searcher.search(query, Integer.MAX_VALUE);

    Map<String, Set<Integer>> invertedIndex = new HashMap<>();

    if (hits.scoreDocs == null || hits.scoreDocs.length == 0) {
        System.out.println("No hits found with MatchAllDocsQuery");
        return;
    }

    for (ScoreDoc hit : hits.scoreDocs) {
        Document doc = searcher.doc(hit.doc);

        for (IndexableField field : doc.getFields()) {
            // Single-document inverted index; requires term vectors for the field
            Terms terms = searcher.getIndexReader().getTermVector(hit.doc, field.name());
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                String term = termsEnum.term().utf8ToString();
                invertedIndex.computeIfAbsent(term, t -> new TreeSet<>()).add(hit.doc);
            }
        }
    }

    System.out.println("Printing inverted index:");
    invertedIndex.forEach((term, docs) -> System.out.println(term + ":" + docs));
}

Two notes:

1. The maximum number of documents retrieved is Integer.MAX_VALUE. I haven't tried it, but it should be possible to remove this limit by using the searcher's searchAfter method and performing multiple searches, as in the sketch after this list.

2. doc.getFields() returns only stored fields. If none of your indexed fields are stored, you can keep a static array of field names instead, because the line Terms terms = searcher.getIndexReader().getTermVector(hit.doc, field.name()); also works for non-stored fields.
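
A minimal sketch of that searchAfter pagination, assuming a PAGE_SIZE constant and a collectAllDocs helper of my own naming (neither is part of the original answer); it gathers all hits page by page instead of one Integer.MAX_VALUE search:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

private static final int PAGE_SIZE = 1000; // assumed page size, tune as needed

private static List<ScoreDoc> collectAllDocs(IndexSearcher searcher) throws IOException {
    List<ScoreDoc> all = new ArrayList<>();
    MatchAllDocsQuery query = new MatchAllDocsQuery();
    ScoreDoc after = null;
    while (true) {
        // First call uses search(); later calls continue after the last hit of the previous page
        TopDocs page = (after == null)
                ? searcher.search(query, PAGE_SIZE)
                : searcher.searchAfter(after, query, PAGE_SIZE);
        if (page.scoreDocs.length == 0) {
            break;
        }
        Collections.addAll(all, page.scoreDocs);
        after = page.scoreDocs[page.scoreDocs.length - 1];
    }
    return all;
}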

Answer 1 (score: 1)

You can iterate over the terms in the inverted index with a TermsEnum, and then, for each term, iterate over its postings with its PostingsEnum. The following code works if your index has a single segment (Lucene version: 6_5_1):

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

String indexPath = "your_index_path";
String field = "your_index_field";
try (FSDirectory directory = FSDirectory.open(Paths.get(indexPath));
        IndexReader reader = DirectoryReader.open(directory)) {
    // Merged view of the terms across all segments; may be null if the field does not exist
    Terms terms = MultiFields.getTerms(reader, field);
    final TermsEnum it = terms.iterator();
    BytesRef term = it.next();
    while (term != null) {
        String termString = term.utf8ToString();
        System.out.print(termString + ": ");
        int postingSize = 0;
        for (LeafReaderContext lrc : reader.leaves()) {
            LeafReader lr = lrc.reader();
            PostingsEnum pe = lr.postings(new Term(field, termString));
            if (pe == null) {
                continue; // term absent from this segment
            }
            int docId = pe.nextDoc();
            while (docId != DocIdSetIterator.NO_MORE_DOCS) {
                postingSize++;
                Document doc = lr.document(docId);
                // here you can print your document title, id, etc.
                docId = pe.nextDoc();
            }
        }
        System.out.println("(" + postingSize + " postings)");
        term = it.next();
    }
} catch (IOException e) {
    e.printStackTrace();
}

If your index has multiple segments, reader.leaves() will return readers that have other readers as their leaves (picture a tree of index readers). In that case you should traverse the tree down to the leaf readers and repeat the code inside the for loop for each leaf, as in the sketch below.
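
A minimal per-leaf sketch of that idea, assuming the reader and field variables from the snippet above; each segment's own term dictionary is walked directly, and lrc.docBase converts segment-local doc ids into index-wide ones:

for (LeafReaderContext lrc : reader.leaves()) {
    LeafReader lr = lrc.reader();
    Terms leafTerms = lr.terms(field); // this segment's term dictionary
    if (leafTerms == null) {
        continue;
    }
    TermsEnum leafIt = leafTerms.iterator();
    BytesRef t;
    while ((t = leafIt.next()) != null) {
        PostingsEnum pe = leafIt.postings(null, PostingsEnum.NONE);
        // doc ids here are segment-local; add lrc.docBase for index-wide ids
        for (int d = pe.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = pe.nextDoc()) {
            System.out.println(t.utf8ToString() + " -> doc" + (lrc.docBase + d));
        }
    }
}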

Answer 2 (score: 0)

Here's a version that prints docId:tokenPos for Lucene 6.6.

import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.BiFunction;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

// Build a small in-memory index with term vectors (positions and offsets) enabled
Directory directory = new RAMDirectory();
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(directory, iwc);

FieldType type = new FieldType();
type.setStoreTermVectors(true);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectorOffsets(true);
type.setIndexOptions(IndexOptions.DOCS);

Field fieldStore = new Field("text", "We hold that proof beyond a reasonable doubt is required.", type);
Document doc = new Document();
doc.add(fieldStore);
writer.addDocument(doc);

fieldStore = new Field("text", "We hold that proof requires reasonable preponderance of the evidence.", type);
doc = new Document();
doc.add(fieldStore);
writer.addDocument(doc);

writer.close();

DirectoryReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);

MatchAllDocsQuery query = new MatchAllDocsQuery();
TopDocs hits = searcher.search(query, Integer.MAX_VALUE);

// term -> set of "docId:tokenPos" strings (doc ids printed 1-based)
Map<String, Set<String>> invertedIndex = new HashMap<>();
BiFunction<Integer, Integer, Set<String>> mergeValue =
    (docId, pos) -> { TreeSet<String> s = new TreeSet<>(); s.add((docId + 1) + ":" + pos); return s; };

for (ScoreDoc scoreDoc : hits.scoreDocs) {
    Fields termVs = reader.getTermVectors(scoreDoc.doc);
    Terms terms = termVs.terms("text");
    TermsEnum termsIt = terms.iterator();
    PostingsEnum docsAndPosEnum = null;
    BytesRef bytesRef;
    while ((bytesRef = termsIt.next()) != null) {
        docsAndPosEnum = termsIt.postings(docsAndPosEnum, PostingsEnum.ALL);
        docsAndPosEnum.nextDoc();
        // only the first position of the term in this document is read
        int pos = docsAndPosEnum.nextPosition();
        String term = bytesRef.utf8ToString();
        invertedIndex.merge(
            term,
            mergeValue.apply(scoreDoc.doc, pos),
            (s1, s2) -> { s1.addAll(s2); return s1; }
        );
    }
}
System.out.println(invertedIndex);
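
Note that nextPosition() is called only once per term and document, so a term that occurs several times in a document contributes only its first position to the map. A variant that collects every position (standard PostingsEnum usage, not part of the original answer) would replace the single nextPosition() call inside the while loop with a loop over freq():

int freq = docsAndPosEnum.freq(); // number of occurrences of the term in this document
String term = bytesRef.utf8ToString();
for (int i = 0; i < freq; i++) {
    int pos = docsAndPosEnum.nextPosition();
    invertedIndex.merge(term, mergeValue.apply(scoreDoc.doc, pos),
            (s1, s2) -> { s1.addAll(s2); return s1; });
}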