我在一个基于Lucene的Java搜索引擎中遇到了严重的版本问题。我修改并合并了来自互联网上太多不同来源的代码(其中主要依赖这份代码:http://ipl.cs.aueb.gr/stougiannis/bm25_2.html),这很可能就是代码无法运行的原因。另外,我在项目中引用了lucene-analyzers-4.5.1.jar、lucene-core-4.5.1.jar、lucene-demo-4.5.1.jar、lucene-queryparser-4.5.1.jar、lucene-BM25-1.0.jar,以及lucene-core的2.9、2.4(按该说明文档的要求)和3.0.3版本。
这是没有BM25的简单工作代码之一,我将其修改为包括BM25评分并将文档添加到索引中:
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.apache.lucene.queryparser.*;
import org.ninit.models.bm25.*;//5
import java.io.IOException;
/**
 * Minimal Lucene demo without BM25: indexes four book titles into an in-memory directory,
 * runs one query against the "title" field and prints the ISBN and title of every hit.
 */
public class lucene12 {

    /**
     * Builds the index, runs the query and prints the hits.
     *
     * @param args optional; {@code args[0]} is the query string, "lucene" is used when absent
     * @throws IOException if indexing or searching fails
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // 0. Specify the analyzer for tokenizing text.
        //    The same analyzer should be used for indexing and searching.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

        // 1. Create the index.
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        try {
            addDoc(w, "Lucene in Action", "193398817");
            addDoc(w, "Lucene for Dummies", "55320055Z");
            addDoc(w, "Managing Gigabytes", "55063554A");
            addDoc(w, "The Art of Computer Science", "9900333X");
        } finally {
            // Release the writer even when adding a document fails.
            w.close();
        }

        // 2. Query. The "title" argument is the default field used when the query
        //    string does not name a field explicitly.
        String querystr = args.length > 0 ? args[0] : "lucene";
        Query q = new QueryParser(Version.LUCENE_40, "title", analyzer).parse(querystr);

        // 3. Search.
        int hitsPerPage = 10;
        IndexReader reader = DirectoryReader.open(index);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. Display results.
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
            }
        } finally {
            // The reader may only be closed once the documents are no longer needed;
            // closing it here also covers the case where searching throws.
            reader.close();
        }
    }

    /**
     * Adds one book to the index.
     *
     * @param w writer that receives the document
     * @param title book title, stored in the "title" text field
     * @param isbn book ISBN, stored in the "isbn" string field
     * @throws IOException if the writer cannot add the document
     */
    private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
        Document doc = new Document();
        doc.add(new TextField("title", title, Field.Store.YES));
        // Use a string field for isbn because we don't want it tokenized.
        doc.add(new StringField("isbn", isbn, Field.Store.YES));
        w.addDocument(doc);
    }
}
这是我的代码(我是java新手):
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
//import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
//import org.apache.lucene.document.StringField;
//import org.apache.lucene.document.TextField;
//import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.ninit.models.bm25.*;//5
public class Lucene11 {
public static void main(String[] args) throws IOException, ParseException {
IndexReader reader= null;
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); **<<Error here**
Directory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_29, analyzer);
@SuppressWarnings("deprecation")
IndexWriter w = new IndexWriter(index, analyzer);
addDoc(w, "Lucene in Action", "193398817");
addDoc(w, "Lucene for Dummies", "55320055Z");
addDoc(w, "Managing Gigabytes", "55063554A");
addDoc(w, "The Art of Computer Science", "9900333X");
w.close();
BM25BooleanQuery query = null;
try {
query = new BM25BooleanQuery( "lucene" ,"title",analyzer);
} catch (org.apache.lucene.queryParser.ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
int hitsPerPage = 10;
try {
reader=IndexReader.open(index);
}
catch (CorruptIndexException e1) {e1.printStackTrace();}
catch(IOException e1) {e1.printStackTrace();}
String field="title";
Searcher searcher = new IndexSearcher(reader);
BM25Parameters.setAverageLength("title",getAvgLength(reader,"title"));
BM25Parameters.setB(0.75f);
BM25Parameters.setK1(2f);
TopDocs top=searcher.search(query, hitsPerPage);
ScoreDoc[] docs = top.scoreDocs;
for (int i= 0;i<10; i++){
System.out.println("the document with id= " + docs[i].doc + " has score ="+docs[i].score);
}
reader.close();
}
public static float getAvgLength(IndexReader reader,String field) throws IOException{
long sum=0;
for (int i = 0; i < reader.numDocs(); i++){
TermFreqVector tfv=
reader.getTermFreqVector(i, field);
if(tfv!= null) {
int[] tfs=tfv.getTermFrequencies();
for(int j= 0;j < tfv.size(); j++){
sum=sum+tfs[j];
}
}
}
float avg=(float)sum/reader.numDocs();
//System.out.println("average length = " + avg);
return avg;
}//end of method
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
Document doc = new Document();
doc.add(new TextField("title", title, Field.Store.YES));
// use a string field for isbn because we don't want it tokenized
doc.add(new StringField("isbn", isbn, Field.Store.YES));
w.addDocument(doc);
}
}
在 "Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);" 这一行出现的错误信息是:
Exception in thread "main" java.lang.NoSuchFieldError: LUCENE_31
at org.apache.lucene.analysis.util.CharacterUtils.getInstance(CharacterUtils.java:46)
at org.apache.lucene.analysis.util.CharArrayMap.<init>(CharArrayMap.java:85)
at org.apache.lucene.analysis.util.CharArrayMap$EmptyCharArrayMap.<init>(CharArrayMap.java:662)
at org.apache.lucene.analysis.util.CharArrayMap.<clinit>(CharArrayMap.java:55)
at org.apache.lucene.analysis.util.CharArraySet.<clinit>(CharArraySet.java:59)
at org.apache.lucene.analysis.core.StopAnalyzer.<clinit>(StopAnalyzer.java:58)
at org.apache.lucene.analysis.standard.StandardAnalyzer.<clinit>(StandardAnalyzer.java:64)
at Lucene11.main(Lucene11.java:55)
修改后的评分函数:
package org.ninit.models.bm25;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
/**
 * Scorer for a single {@link TermQuery} that ranks matching documents with the BM25 formula:
 *
 * <pre>
 *   score = idf * ((k1 + 1) * tf) / (tf + k1 * (1 - b + b * length / avgLength))
 * </pre>
 *
 * <p>{@code b}, {@code k1} and the average field length are read from {@link BM25Parameters}
 * when the scorer is constructed; the document length is derived from the field norm as
 * {@code 1 / (norm * norm)}.
 *
 * <p>The class offers both the {@code doc()/next()/skipTo()} iteration methods and the
 * {@code docID()/nextDoc()/advance()} ones; both sets move the same underlying
 * {@link TermDocs} cursor.
 */
public class BM25TermScorer extends Scorer {

    /** Value returned by {@link #nextDoc()} and {@link #advance(int)} once no document is left. */
    private static final int EXHAUSTED = Integer.MAX_VALUE;

    private TermQuery term;
    private IndexReader reader;
    private TermDocs termDocs;
    private float idf;
    private float av_length;
    private byte[] norm;
    private float b;
    private float k1;
    /** Document the cursor is on: -1 before the first move, {@link #EXHAUSTED} at the end. */
    private int currentDoc = -1;

    /**
     * Creates a scorer for one term.
     *
     * @param reader reader used for document frequencies, norms and postings
     * @param term the term query being scored
     * @param similarity similarity that supplies the idf value
     * @throws IOException if the index cannot be read
     */
    public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity)
            throws IOException {
        super(similarity);
        this.reader = reader;
        this.term = term;
        this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs());
        this.norm = this.reader.norms(this.term.getTerm().field());
        this.av_length = BM25Parameters.getAverageLength(this.term.getTerm().field());
        this.b = BM25Parameters.getB();
        this.k1 = BM25Parameters.getK1();
        this.termDocs = this.reader.termDocs(this.term.getTerm());
    }

    /** Returns the document the underlying postings cursor is positioned on. */
    @Override
    public int doc() {
        return this.termDocs.doc();
    }

    /**
     * Explains the BM25 score of one document, using the same formula as {@link #score()}.
     *
     * <p>The postings cursor is reopened and repositioned, so any iteration in progress on this
     * scorer is lost.
     *
     * @param doc id of the document to explain
     * @return the explanation, or {@code null} when the term does not occur in {@code doc}
     * @throws IOException if the index cannot be read
     */
    @Override
    public Explanation explain(int doc) throws IOException {
        // Start from a fresh cursor so that skipTo can reach documents already passed.
        if (this.termDocs != null)
            this.termDocs.close();
        this.termDocs = this.reader.termDocs(this.term.getTerm());
        if (!this.skipTo(doc))
            return null;

        byte[] norm = this.reader.norms(this.term.getTerm().field());
        float av_length = BM25Parameters.getAverageLength(this.term.getTerm().field());
        float decodedNorm = Similarity.decodeNorm(norm[this.doc()]);
        float length = 1 / (decodedNorm * decodedNorm);
        float tf = this.termDocs.freq();
        // Same term-frequency component as score(), including the (k1 + 1) factor, so the
        // explained value matches the value the scorer reports.
        float result = bm25TermFrequency(tf, length, av_length, BM25Parameters.getB(),
                BM25Parameters.getK1());

        Explanation idfE = new Explanation(this.idf, " idf (docFreq:"
                + this.reader.docFreq(this.term.getTerm()) + ",numDocs:" + this.reader.numDocs()
                + ")");
        Explanation bE = new Explanation(result, "B:" + BM25Parameters.getB() + ",Length:" + length
                + ",AvgLength:" + av_length + ",Freq:" + tf + ",K1:" + BM25Parameters.getK1());
        Explanation resultE = new Explanation(this.idf * result, "BM25("
                + this.term.getTerm().field() + ":" + this.term.getTerm().text());
        resultE.addDetail(idfE);
        resultE.addDetail(bE);
        return resultE;
    }

    /**
     * Moves to the next matching document and closes the postings cursor when none is left.
     *
     * @return {@code true} if the cursor is on a document, {@code false} at the end
     * @throws IOException if the postings cannot be read
     */
    @Override
    public boolean next() throws IOException {
        boolean result = this.termDocs.next();
        if (result) {
            this.currentDoc = this.termDocs.doc();
        } else {
            this.currentDoc = EXHAUSTED;
            this.termDocs.close();
        }
        return result;
    }

    /**
     * Returns the BM25 score of the current document: the idf multiplied by the saturated,
     * length-normalised term frequency.
     *
     * @throws IOException if the term frequency cannot be read
     */
    public float score() throws IOException {
        float norm = Similarity.decodeNorm(this.norm[this.doc()]);
        float length = 1 / (norm * norm);
        float tf25 = bm25TermFrequency(this.termDocs.freq(), length, this.av_length, this.b,
                this.k1);
        return tf25 * this.idf;
    }

    /**
     * Moves forward one document at a time until the cursor reaches or passes {@code target}.
     *
     * @param target document id to look for
     * @return {@code true} only if the cursor ends exactly on {@code target}
     * @throws IOException if the postings cannot be read
     */
    @Override
    public boolean skipTo(int target) throws IOException {
        while (this.next() && this.doc() < target) {
        }
        return this.doc() == target;
    }

    /**
     * Returns how often the term occurs in the current document.
     *
     * @throws IOException declared for interface compatibility
     */
    public int freq() throws IOException {
        return this.termDocs.freq();
    }

    /**
     * Moves to the first matching document after the current one whose id is at least
     * {@code arg0}.
     *
     * @param arg0 smallest acceptable document id
     * @return the id of the document reached, or {@code Integer.MAX_VALUE} when none is left
     * @throws IOException if the postings cannot be read
     */
    public int advance(int arg0) throws IOException {
        int reached;
        do {
            reached = this.nextDoc();
        } while (reached < arg0);
        return reached;
    }

    /**
     * NOTE(review): still a placeholder that always returns 0; no cost estimate is computed.
     */
    public long cost() {
        return 0;
    }

    /**
     * Returns the id of the current document: -1 before the first move and
     * {@code Integer.MAX_VALUE} after the last document.
     */
    public int docID() {
        return this.currentDoc;
    }

    /**
     * Moves to the next matching document.
     *
     * @return the id of that document, or {@code Integer.MAX_VALUE} when none is left
     * @throws IOException if the postings cannot be read
     */
    public int nextDoc() throws IOException {
        this.next();
        return this.currentDoc;
    }

    /**
     * Second factor of the BM25 formula:
     * {@code (k1 + 1) * tf / (tf + k1 * (1 - b + b * length / avgLength))}.
     * The operations are kept in the order score() originally used.
     */
    private static float bm25TermFrequency(float tf, float length, float avgLength, float b,
            float k1) {
        float denominator = b * (length / avgLength);
        denominator = 1 - b + denominator;
        denominator = k1 * denominator;
        denominator = tf + denominator;
        float numerator = k1 + 1;
        numerator = numerator * tf;
        return numerator / denominator;
    }
}
错误是什么意思?
答案 0 :(得分:1)
你确实不能混用不同版本的Lucene,更不能同时使用多个版本的Lucene core。你说你的类路径中现在同时有Lucene core 2.4、2.9、3.0.3和4.5.1,这样是行不通的。看起来你想使用Lucene 4.5.1,所以请去掉另外三个版本的lucene-core jar包。你可能还需要修改那些从旧版本复制过来的代码,使其与4.x兼容(例如报错的那一行代码:已经不再有Version.LUCENE_CURRENT,你必须指定一个具体的版本)。关于3.6到4.0之间的变更内容以及如何迁移,可以参考migration guide(迁移指南)。
至于BM25评分的实现,从4.0版本开始,Lucene core中已经自带了BM25Similarity。