我想知道 Lucene 4 索引中每个词项(term)的出现次数(有一些示例说明了如何在较旧的 Lucene 版本中做到这一点,但据我所知,他们在每个新版本中都会彻底改变 API,所以那些方法对我完全没有帮助)。
这是我目前的代码:
// Open a reader over the index and visit every term of every field.
IndexReader reader = DirectoryReader.open(directory);
Fields allFields = MultiFields.getFields(reader);
for (String fieldName : allFields) {
    TermsEnum termsOfField = allFields.terms(fieldName).iterator(null);
    for (BytesRef termBytes = termsOfField.next();
         termBytes != null;
         termBytes = termsOfField.next()) {
        // Open question: what should happen here? Only a BytesRef is available,
        // while IndexReader#docFreq(term) expects a Term.
    }
}
答案 0 :(得分:4)
// Open the on-disk index directory at /tmp/moo.
FSDirectory directory = FSDirectory.open(new File("/tmp/moo"));
// Optional one-time setup, left commented out: writes one sample document in which
// field "foo" holds the values "abc", "abc", "aaa" and field "bar" holds "abc",
// then commits and closes the writer. Uncomment to create the index used below.
/*
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));
Document document = new Document();
document.add(new TextField("foo", "abc", Store.YES));
document.add(new TextField("foo", "abc", Store.YES));
document.add(new TextField("foo", "aaa", Store.YES));
document.add(new TextField("bar", "abc", Store.YES));
writer.addDocument(document);
writer.commit();
writer.close(true);
*/
// Open a reader on that directory; both loops below read through it.
IndexReader indexReader = DirectoryReader.open(directory);
// Live-documents bit set for the reader; handed to TermsEnum.docs(...) in the first loop.
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
// Field-level view of the index; iterating it yields the field names.
Fields fields = MultiFields.getFields(indexReader);
// For every term of every field, print how often the term occurs in each
// document returned by the postings enumeration (filtered through liveDocs).
for (String field : fields) {
    TermsEnum termEnum = MultiFields.getTerms(indexReader, field).iterator(null);
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
        // next() already leaves the enum positioned on the term it returned, so the
        // postings can be requested directly; an extra seekExact() on the same term
        // would only repeat the dictionary lookup.
        DocsEnum docsEnum = termEnum.docs(liveDocs, null);
        if (docsEnum == null) {
            continue;
        }
        int doc;
        while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // freq() is the number of occurrences of the term within document `doc`.
            System.out.println(bytesRef.utf8ToString() + " in doc " + doc + ": " + docsEnum.freq());
        }
    }
}
// For every term of every field, print the number of documents containing it.
for (String field : fields) {
    TermsEnum termEnum = MultiFields.getTerms(indexReader, field).iterator(null);
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
        // The enum is positioned on the current term, so its document frequency can
        // be read straight from it; no Term object or second lookup through the
        // reader is needed. (termEnum.totalTermFreq() would likewise give the total
        // number of occurrences across all documents, where the index records it.)
        int freq = termEnum.docFreq();
        System.out.println(bytesRef.utf8ToString() + " in " + freq + " documents");
    }
}
// Release the reader once it is no longer needed; in production code do this in a
// finally block so it also runs when an exception is thrown above.
indexReader.close();