I want to count the number of documents that contain a given term in a field in Lucene. I know of three ways to do this; I'm curious which is the best and fastest.
I will be searching for the term in a single-valued field ("field") of type long, so this is numeric data, not text!
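// Option 1: seek to the term and read its docFreq()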
Directory dirIndex = FSDirectory.open(new File("/path/to/index/"));
IndexReader indexReader = DirectoryReader.open(dirIndex);
final BytesRefBuilder bytes = new BytesRefBuilder();
NumericUtils.longToPrefixCoded(Long.valueOf(longTerm).longValue(),0,bytes);
TermsEnum termEnum = MultiFields.getTerms(indexReader, "field").iterator(null);
termEnum.seekExact(bytes.toBytesRef());
int count = termEnum.docFreq();
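// Option 2: run a TermQuery and count the hits with a TotalHitCountCollector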
IndexSearcher searcher = new IndexSearcher(indexReader);
TermQuery query = new TermQuery(new Term("field", bytes.toBytesRef()));
TotalHitCountCollector collector = new TotalHitCountCollector();
searcher.search(query,collector);
int count = collector.getTotalHits();
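// Option 3: seek to the term and walk its postings, counting only live documents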
TermsEnum termEnum = MultiFields.getTerms(indexReader, "field").iterator(null);
termEnum.seekExact(bytes.toBytesRef());
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
DocsEnum docsEnum = termEnum.docs(liveDocs, null);
int count = 0;
if (docsEnum != null) {
    int docx;
    while ((docx = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        count++;
    }
}
Option 1 is the shortest code, but it's basically useless if you ever update or delete documents in the index: it counts deleted documents as if they were still there. That isn't documented in many places (apart from the official documentation it doesn't come up in answers here on S.O.), so it's something to watch out for. Maybe there is a workaround; otherwise the enthusiasm for this approach is somewhat misplaced. Options 2 and 3 do produce the correct result, but which one should be preferred? Or better still, is there a faster way?
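For reference, docFreq() only over-counts because of deletions, so option 1 can still serve as a fast path when the reader sees no deleted documents. Below is a minimal sketch of such a guard, assuming the same Lucene 4.x API as the snippets above; this packaging is a sketch of mine, not code from the question:
// Sketch only: use docFreq() when the reader has no deletions,
// otherwise fall back to counting live documents via the postings (option 3).
static int countDocs(IndexReader indexReader, TermsEnum termEnum, BytesRef term) throws IOException {
    if (!termEnum.seekExact(term)) {
        return 0;                      // term not present in the field at all
    }
    if (!indexReader.hasDeletions()) {
        return termEnum.docFreq();     // exact: docFreq only over-counts deleted docs
    }
    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
    DocsEnum docsEnum = termEnum.docs(liveDocs, null);
    int count = 0;
    if (docsEnum != null) {
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            count++;
        }
    }
    return count;
}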
Answer 0 (score: 2)
Measured by testing, using the index to get at the documents rather than searching for them (i.e. option 3 rather than option 2) appears to be faster (on average, option 3 was about 8 times faster in the 100-document sample I was able to run). I also reversed the order of the two tests to make sure that running one before the other wasn't influencing the results: it wasn't.
So it looks like the searcher creates some overhead just to perform a simple document count; if all you need is the number of documents for a single term, the lookup in the index is fastest.
The code used for the test (using the first 100 records from the SOLR index):
import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.util.Bits;
public class ReadLongTermReferenceCount {

    public static void main(String[] args) throws IOException {

        Directory dirIndex = FSDirectory.open(new File("/path/to/index/"));
        IndexReader indexReader = DirectoryReader.open(dirIndex);

        TermsEnum termEnum = MultiFields.getTerms(indexReader, "field").iterator(null);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        Bits liveDocs = MultiFields.getLiveDocs(indexReader);

        final BytesRefBuilder bytes = new BytesRefBuilder(); // for reuse!

        int maxDoc = indexReader.maxDoc();
        int docsPassed = 0;
        for (int i = 0; i < maxDoc; i++) {
            if (docsPassed == 100) {
                break;
            }
            if (liveDocs != null && !liveDocs.get(i))
                continue;
            Document doc = indexReader.document(i);

            //get longTerm from this doc and convert to BytesRefBuilder
            String longTerm = doc.get("longTerm");
            NumericUtils.longToPrefixCoded(Long.valueOf(longTerm).longValue(), 0, bytes);

            //time before the first test
            long time_start = System.nanoTime();

            //test 1: look up longTerm in the "field" postings and count its documents
            int countLookup = 0;
            termEnum.seekExact(bytes.toBytesRef());
            DocsEnum docsEnum = termEnum.docs(liveDocs, null);
            if (docsEnum != null) {
                int docx;
                while ((docx = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    countLookup++;
                }
            }

            //mid point: test 1 done, start of test 2
            long time_mid = System.nanoTime();

            //test 2: do a search for longTerm in "field"
            //(fresh collector per search, so the hit count doesn't accumulate)
            TermQuery query = new TermQuery(new Term("field", bytes.toBytesRef()));
            TotalHitCountCollector collector = new TotalHitCountCollector();
            searcher.search(query, collector);
            int countSearch = collector.getTotalHits();

            //end point: test 2 done
            long time_end = System.nanoTime();

            //write to stdout: term, nanoseconds for test 1, nanoseconds for test 2
            System.out.println(longTerm + "\t" + (time_mid - time_start) + "\t" + (time_end - time_mid));

            docsPassed++;
        }
        indexReader.close();
        dirIndex.close();
    }
}
The same thing, slightly modified to work with Lucene 5:
import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.util.Bits;
public class ReadLongTermReferenceCount {

    public static void main(String[] args) throws IOException {

        Directory dirIndex = FSDirectory.open(Paths.get("/path/to/index/"));
        IndexReader indexReader = DirectoryReader.open(dirIndex);

        TermsEnum termEnum = MultiFields.getTerms(indexReader, "field").iterator(null);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        Bits liveDocs = MultiFields.getLiveDocs(indexReader);

        final BytesRefBuilder bytes = new BytesRefBuilder(); // for reuse!

        int maxDoc = indexReader.maxDoc();
        int docsPassed = 0;
        for (int i = 0; i < maxDoc; i++) {
            if (docsPassed == 100) {
                break;
            }
            if (liveDocs != null && !liveDocs.get(i))
                continue;
            Document doc = indexReader.document(i);

            //get longTerm from this doc and convert to BytesRefBuilder
            String longTerm = doc.get("longTerm");
            NumericUtils.longToPrefixCoded(Long.valueOf(longTerm).longValue(), 0, bytes);

            //time before the first test
            long time_start = System.nanoTime();

            //test 1: look up longTerm in the "field" postings and count its documents
            int countLookup = 0;
            termEnum.seekExact(bytes.toBytesRef());
            PostingsEnum docsEnum = termEnum.postings(liveDocs, null);
            if (docsEnum != null) {
                int docx;
                while ((docx = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    countLookup++;
                }
            }

            //mid point: test 1 done, start of test 2
            long time_mid = System.nanoTime();

            //test 2: do a search for longTerm in "field"
            //(fresh collector per search, so the hit count doesn't accumulate)
            TermQuery query = new TermQuery(new Term("field", bytes.toBytesRef()));
            TotalHitCountCollector collector = new TotalHitCountCollector();
            searcher.search(query, collector);
            int countSearch = collector.getTotalHits();

            //end point: test 2 done
            long time_end = System.nanoTime();

            //write to stdout: term, nanoseconds for test 1, nanoseconds for test 2
            System.out.println(longTerm + "\t" + (time_mid - time_start) + "\t" + (time_end - time_mid));

            docsPassed++;
        }
        indexReader.close();
        dirIndex.close();
    }
}