我们当前使用的是Lucene V 4.X
的非常旧的版本,现在正在迁移到Solr V 7.4.0
云。我们有一个自定义的相似度类,可通过文档中使用的索引字段(“ RANK”)来影响得分。
这是类的外观-
CustomSimilarity.java
public class CustomSimilarity extends Similarity {
private final Similarity sim;
private final double coefficiency;
private String popularityRank;
static InfoStream infoStream;
public CustomSimilarity() {
this.sim = new CustomPayloadSimilarity();
this.coefficiency = 0.1;
this.popularityRank = "RANK";
infoStream = new LoggingInfoStream();
}
@Override
public long computeNorm(FieldInvertState state) {
return sim.computeNorm(state);
}
@Override
public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
final Explanation idf = termStats.length == 1 ? ((PclnPayloadSimilarity) sim).idfExplain(collectionStats, termStats[0]) : ((PclnPayloadSimilarity) sim)
.idfExplain(collectionStats, termStats);
float[] normTable = new float[256];
for (int i = 1; i < 256; ++i) {
int length = SmallFloat.byte4ToInt((byte) i);
float norm = ((PclnPayloadSimilarity) sim).lengthNorm(length);
normTable[i] = norm;
}
normTable[0] = 1f / normTable[255];
return new IDFStats(collectionStats.field(), queryBoost, idf, normTable);
}
public float sloppyFreq(int distance) {
return 1.0f / (distance + 1);
}
public float scorePayload(int doc, int start, int end, BytesRef payload) {
return 1;
}
@Override
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
final IDFStats idfstats = (IDFStats) weight;
final NumericDocValues rank1Value = context.reader().getNumericDocValues(popularityRank);
infoStream.message("PCLNSimilarity", "NumericDocValues-1 >> rank1Value = " + rank1Value);
System.out.println("NumericDocValues-1 >> rank1Value = " + rank1Value);
return new SimScorer() {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return super.explain(doc, freq);
}
@Override
public float score(int doc, float freq) throws IOException {
// float weightValue = idfstats.queryWeight;
// // logger.trace("weight " + weightValue + "freq " + freq);
//
// float score = 0.0f;
// if (rank1Value != null) {
// score = (float) rank1Value.longValue() + score;
// }
//
// if (coefficiency > 0) {
// score = score + (float) coefficiency * weightValue;
// }
// return score;
return (float) rank1Value.longValue();
}
@Override
public float computeSlopFactor(int distance) {
return sloppyFreq(distance);
}
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
return scorePayload(doc, start, end, payload);
}
};
}
static class IDFStats extends SimWeight {
private final String field;
/** The idf and its explanation */
private final Explanation idf;
private final float boost;
private final float queryWeight;
final float[] normTable;
public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
// TODO: Validate?
this.field = field;
this.idf = idf;
this.boost = boost;
this.queryWeight = boost * idf.getValue();
this.normTable = normTable;
}
}
}
CustomPayloadSimilarity.java
public class CustomPayloadSimilarity extends ClassicSimilarity {
@Override
public float tf(float freq) {
return 1;
}
@Override
public float scorePayload(int doc, int start, int end, BytesRef payload) {
if (payload != null) {
return PayloadHelper.decodeFloat(payload.bytes, payload.offset);
} else {
return 1.0F;
}
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}
}
您会注意到,由于我们希望保留旧的和较新的TFIDF实现之间的奇偶性(某种程度上),因此我们仍在使用较旧的算法,因此没有切换到BM25相似性。
使用上面的代码,我无法从文档中检索RANK字段的值。因此从本质上讲,以下行返回的值我无法登录到solr.log文件-
final NumericDocValues rank1Value = context.reader().getNumericDocValues(popularityRank);
但是return (float) rank1Value.longValue()
抛出以下异常-
"java.lang.IndexOutOfBoundsException
at java.nio.Buffer.checkIndex(Buffer.java:546)
at java.nio.DirectByteBuffer.getInt(DirectByteBuffer.java:685)
at org.apache.lucene.store.ByteBufferGuard.getInt(ByteBufferGuard.java:128)
at org.apache.lucene.store.ByteBufferIndexInput$SingleBufferImpl.readInt(ByteBufferIndexInput.java:415)
at org.apache.lucene.util.packed.DirectReader$DirectPackedReader28.get(DirectReader.java:248)
at org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer$4.longValue(Lucene70DocValuesProducer.java:490)
at com.priceline.rc.solr.similarity.CustomSimilarity$1.score(CustomSimilarity.java:117)
at org.apache.lucene.search.TermScorer.score(TermScorer.java:65)
at org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1.collect(TopScoreDocCollector.java:64)
at org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll(Weight.java:263)
at org.apache.lucene.search.Weight$DefaultBulkScorer.score(Weight.java:214)
at org.apache.lucene.search.BulkScorer.score(BulkScorer.java:39)
at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:662)
at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:463)
at org.apache.solr.search.SolrIndexSearcher.buildAndRunCollectorChain(SolrIndexSearcher.java:217)
at org.apache.solr.search.SolrIndexSearcher.getDocListNC(SolrIndexSearcher.java:1622)
at org.apache.solr.search.SolrIndexSearcher.getDocListC(SolrIndexSearcher.java:1439)
at org.apache.solr.search.SolrIndexSearcher.search(SolrIndexSearcher.java:586)
at org.apache.solr.handler.component.QueryComponent.doProcessUngroupedSearch(QueryComponent.java:1435)
at org.apache.solr.handler.component.QueryComponent.process(QueryComponent.java:375)
at org.apache.solr.handler.component.SearchHandler.handleRequestBody(SearchHandler.java:298)
at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:199)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:2539)
at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:709)
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:515)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:377)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:323)
at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1634)
at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:533)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:146)
at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:548)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:257)
at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1595)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:255)
at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1253)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:203)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:473)
at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1564)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:201)
at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1155)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:144)
at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:219)
at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:126)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:335)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
at org.eclipse.jetty.server.Server.handle(Server.java:531)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:352)
at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:260)
at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:281)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:102)
at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:118)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:333)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:310)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:168)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:126)
at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:366)
at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:760)
at org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:678)
at java.lang.Thread.run(Thread.java:745)\n"
有什么建议吗?
答案 0 :(得分:2)
您试图从NumericDocValues获取值,而不用advanceExact()
设置当前文档。请记住,每个文档都有一个用于该帐户的NumericDocValues,在请求值之前,您仍然需要告诉它要引用的文档。在得分函数中,尝试在调用advanceExact(doc)
之前添加rank1Value.longValue()
。
应该是这样的:
if(advanceExact(doc))
return (float) rank1Value.longValue();
else
return 0; // or whatever value you want as default