基于Java,Lucene,BM25的搜索引擎中的版本问题和错误

时间:2014-04-26 05:12:24

标签: java apache lucene search-engine

我在基于Lucene的Java搜索引擎中遇到了严重的版本问题。我修改并合并了来自互联网上太多来源的代码(其中最主要依据的是这份代码:http://ipl.cs.aueb.gr/stougiannis/bm25_2.html),这很可能就是代码无法运行的原因。另外,我引用了以下jar包:lucene-analyzers-4.5.1.jar、lucene-core-4.5.1.jar、lucene-demo-4.5.1.jar、lucene-queryparser-4.5.1.jar、lucene-BM25-1.0.jar,以及lucene-core的2.9、2.4(按照说明文档的要求)和3.0.3版本。

下面是一段不含BM25、可以正常运行的简单代码;我在它的基础上进行了修改,加入了BM25评分并向索引中添加文档:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.apache.lucene.queryparser.*;
import org.ninit.models.bm25.*;//5

import java.io.IOException;

/**
 * Minimal in-memory Lucene example: indexes four book titles with their ISBNs,
 * runs one query against the "title" field and prints the matching documents.
 */
public class lucene12   {

  /**
   * Builds the index, parses the query and prints the hits.
   *
   * @param args optional; {@code args[0]} is the query string, defaulting to "lucene"
   * @throws IOException if indexing or searching fails
   * @throws ParseException if the query string cannot be parsed
   */
  public static void main(String[] args) throws IOException, ParseException {
    // 0. Specify the analyzer for tokenizing text.
    //    The same analyzer should be used for indexing and searching
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

    // 1. create the index
    Directory index = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);

    IndexWriter w = new IndexWriter(index, config);
    try {
      addDoc(w, "Lucene in Action", "193398817");
      addDoc(w, "Lucene for Dummies", "55320055Z");
      addDoc(w, "Managing Gigabytes", "55063554A");
      addDoc(w, "The Art of Computer Science", "9900333X");
    } finally {
      // Release the writer even when adding a document fails.
      w.close();
    }

    // 2. query
    String querystr = args.length > 0 ? args[0] : "lucene";

    // the "title" arg specifies the default field to use
    // when no field is explicitly specified in the query.
    // Parsing happens before the reader is opened so a bad query cannot leak it.
    Query q = new QueryParser(Version.LUCENE_40, "title", analyzer).parse(querystr);

    // 3. search
    int hitsPerPage = 10;
    IndexReader reader = DirectoryReader.open(index);
    try {
      IndexSearcher searcher = new IndexSearcher(reader);
      TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
      searcher.search(q, collector);
      ScoreDoc[] hits = collector.topDocs().scoreDocs;

      // 4. display results
      System.out.println("Found " + hits.length + " hits.");
      for (int i = 0; i < hits.length; ++i) {
        int docId = hits[i].doc;
        Document d = searcher.doc(docId);
        System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
      }
    } finally {
      // The reader must stay open while stored documents are being fetched,
      // and must be closed afterwards even if the search or printing fails.
      reader.close();
    }
  }

  /**
   * Adds one book to the index.
   *
   * @param w open index writer
   * @param title book title; stored and tokenized for full-text search
   * @param isbn book ISBN; stored as a single untokenized term
   * @throws IOException if the document cannot be written
   */
  private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("title", title, Field.Store.YES));

    // use a string field for isbn because we don't want it tokenized
    doc.add(new StringField("isbn", isbn, Field.Store.YES));
    w.addDocument(doc);
  }
}

这是我的代码(我是java新手):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
//import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
//import org.apache.lucene.document.StringField;
//import org.apache.lucene.document.TextField;
//import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.ninit.models.bm25.*;//5

public class Lucene11  {





public static void main(String[] args) throws IOException, ParseException {


         IndexReader reader= null; 
         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); **<<Error here**
         Directory index = new RAMDirectory();
         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_29, analyzer);

        @SuppressWarnings("deprecation")
        IndexWriter w = new IndexWriter(index, analyzer);
        addDoc(w, "Lucene in Action", "193398817");
        addDoc(w, "Lucene for Dummies", "55320055Z");
        addDoc(w, "Managing Gigabytes", "55063554A");
        addDoc(w, "The Art of Computer Science", "9900333X");
        w.close();


        BM25BooleanQuery query = null;

        try {
            query = new BM25BooleanQuery( "lucene" ,"title",analyzer);
        } catch (org.apache.lucene.queryParser.ParseException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        int hitsPerPage = 10;

       try {
          reader=IndexReader.open(index);
       }
       catch (CorruptIndexException e1) {e1.printStackTrace();}
       catch(IOException e1) {e1.printStackTrace();}
       String field="title";
       Searcher searcher = new IndexSearcher(reader);

    BM25Parameters.setAverageLength("title",getAvgLength(reader,"title"));
    BM25Parameters.setB(0.75f);
    BM25Parameters.setK1(2f);

    TopDocs top=searcher.search(query, hitsPerPage);
       ScoreDoc[] docs = top.scoreDocs;
       for (int i= 0;i<10; i++){
          System.out.println("the document with id= " + docs[i].doc + " has score ="+docs[i].score);
       } 

    reader.close();
  }


  public static float getAvgLength(IndexReader reader,String field) throws IOException{
     long sum=0;
     for (int i = 0; i < reader.numDocs(); i++){
       TermFreqVector tfv=
                reader.getTermFreqVector(i, field);
        if(tfv!= null) {
           int[] tfs=tfv.getTermFrequencies();
           for(int j= 0;j < tfv.size(); j++){
              sum=sum+tfs[j];
           }
        }
     } 
     float avg=(float)sum/reader.numDocs(); 
     //System.out.println("average length = " + avg);
     return avg;
  }//end of method


  private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
    Document doc = new Document();
    doc.add(new  TextField("title", title, Field.Store.YES));

    // use a string field for isbn because we don't want it tokenized
    doc.add(new StringField("isbn", isbn, Field.Store.YES));
    w.addDocument(doc);
  }
}

在"Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);"这一行出现的错误信息是:

Exception in thread "main" java.lang.NoSuchFieldError: LUCENE_31
at org.apache.lucene.analysis.util.CharacterUtils.getInstance(CharacterUtils.java:46)
at org.apache.lucene.analysis.util.CharArrayMap.<init>(CharArrayMap.java:85)
at org.apache.lucene.analysis.util.CharArrayMap$EmptyCharArrayMap.<init>(CharArrayMap.java:662)
at org.apache.lucene.analysis.util.CharArrayMap.<clinit>(CharArrayMap.java:55)
at org.apache.lucene.analysis.util.CharArraySet.<clinit>(CharArraySet.java:59)
at org.apache.lucene.analysis.core.StopAnalyzer.<clinit>(StopAnalyzer.java:58)
at org.apache.lucene.analysis.standard.StandardAnalyzer.<clinit>(StandardAnalyzer.java:64)
at Lucene11.main(Lucene11.java:55)

修改后的评分函数:

package org.ninit.models.bm25;


import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;

/**
 * Scores the documents matching a single term with the BM25 weighting function.
 *
 * <p>The document length is recovered from the field norm as {@code 1 / norm^2}; the
 * average length, {@code b} and {@code k1} are read from {@link BM25Parameters}.
 * Both iteration styles are supported: {@link #next()}/{@link #skipTo(int)}/{@link #doc()}
 * and {@link #nextDoc()}/{@link #advance(int)}/{@link #docID()}.
 */
public class BM25TermScorer extends Scorer {

    /**
     * Value reported by {@link #docID()}, {@link #nextDoc()} and {@link #advance(int)} once
     * the postings are exhausted (same value as Lucene's {@code DocIdSetIterator.NO_MORE_DOCS}).
     */
    private static final int EXHAUSTED = Integer.MAX_VALUE;

    private TermQuery term;
    private IndexReader reader;
    private TermDocs termDocs;
    private float idf;
    private float av_length;
    private byte[] norm;
    private float b;
    private float k1;
    /** Number of documents containing the term; cached so that {@link #cost()} needs no I/O. */
    private int docFreq;
    /** Current position: -1 before the first document, {@link #EXHAUSTED} after the last. */
    private int currentDoc = -1;

    /**
     * @param reader index reader supplying postings, norms and document counts
     * @param term term query whose term is scored
     * @param similarity similarity used to compute the idf
     * @throws IOException if the index cannot be read
     */
    public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity)
            throws IOException {
        super(similarity);
        this.reader = reader;
        this.term = term;
        this.docFreq = reader.docFreq(term.getTerm());
        this.idf = this.getSimilarity().idf(this.docFreq, reader.numDocs());
        this.norm = this.reader.norms(this.term.getTerm().field());
        this.av_length = BM25Parameters.getAverageLength(this.term.getTerm().field());
        this.b = BM25Parameters.getB();
        this.k1 = BM25Parameters.getK1();
        this.termDocs = this.reader.termDocs(this.term.getTerm());
    }

    /** @return the document the underlying postings are currently positioned on */
    @Override
    public int doc() {
        return this.termDocs.doc();
    }

    /**
     * Explains the BM25 score of one document.
     *
     * <p>The postings are reopened, so the scorer's iteration position is reset by this call.
     *
     * @param doc id of the document to explain
     * @return the explanation, or {@code null} if the document does not contain the term
     * @throws IOException if the index cannot be read
     */
    @Override
    public Explanation explain(int doc) throws IOException {
        // Restart the postings from the beginning.
        if (this.termDocs != null)
            this.termDocs.close();
        this.termDocs = this.reader.termDocs(this.term.getTerm());
        this.currentDoc = -1;

        if (!this.skipTo(doc))
            return null;

        String field = this.term.getTerm().field();
        byte[] fieldNorms = this.reader.norms(field);
        float avgLength = BM25Parameters.getAverageLength(field);
        float decoded = Similarity.decodeNorm(fieldNorms[this.doc()]);
        float length = 1 / (decoded * decoded);
        float tf = this.termDocs.freq();

        // Same formula as score(), so the explained value matches the actual score.
        float result = tfWeight(tf, length, avgLength, BM25Parameters.getB(),
                BM25Parameters.getK1());

        Explanation idfE = new Explanation(this.idf, " idf (docFreq:"
                + this.reader.docFreq(this.term.getTerm()) + ",numDocs:" + this.reader.numDocs()
                + ")");
        Explanation bE = new Explanation(result, "B:" + BM25Parameters.getB() + ",Length:" + length
                + ",AvgLength:" + avgLength + ",Freq:" + tf + ",K1:" + BM25Parameters.getK1());

        Explanation resultE = new Explanation(this.idf * result, "BM25("
                + field + ":" + this.term.getTerm().text());
        resultE.addDetail(idfE);
        resultE.addDetail(bE);

        return resultE;
    }

    /**
     * Moves to the next matching document; the postings are closed once exhausted.
     *
     * @return {@code true} if there is a next document
     * @throws IOException if the postings cannot be read
     */
    @Override
    public boolean next() throws IOException {
        if (this.currentDoc == EXHAUSTED) {
            // The postings were already closed; do not touch them again.
            return false;
        }
        boolean hasNext = this.termDocs.next();
        if (hasNext) {
            this.currentDoc = this.termDocs.doc();
        } else {
            this.currentDoc = EXHAUSTED;
            this.termDocs.close();
        }
        return hasNext;
    }

    /**
     * Computes {@code idf * (k1 + 1) * tf / (tf + k1 * (1 - b + b * length / avgLength))}
     * for the current document.
     *
     * @return the BM25 score of the current document
     * @throws IOException declared by the overridden method
     */
    @Override
    public float score() throws IOException {
        float decoded = Similarity.decodeNorm(this.norm[this.doc()]);
        float length = 1 / (decoded * decoded);
        return tfWeight(this.termDocs.freq(), length, this.av_length, this.b, this.k1)
                * this.idf;
    }

    /**
     * Advances until the current document id is at least {@code target}.
     *
     * @param target document id to look for
     * @return {@code true} only if the scorer ends up positioned exactly on {@code target}
     * @throws IOException if the postings cannot be read
     */
    @Override
    public boolean skipTo(int target) throws IOException {
        while (this.next() && this.currentDoc < target) {
            // keep advancing
        }
        // Compare the tracked position rather than doc(): after exhaustion the closed
        // postings may still report a stale document id.
        return this.currentDoc == target;
    }

    /** @return the term frequency in the current document */
    public int freq() throws IOException {
        return this.termDocs.freq();
    }

    /**
     * Advances to the first document whose id is at least {@code arg0}.
     *
     * @param arg0 target document id
     * @return the id of that document, or {@code Integer.MAX_VALUE} if there is none
     * @throws IOException if the postings cannot be read
     */
    public int advance(int arg0) throws IOException {
        int doc = this.nextDoc();
        while (doc < arg0) {
            doc = this.nextDoc();
        }
        return doc;
    }

    /** @return the number of documents containing the term, as read at construction time */
    public long cost() {
        return this.docFreq;
    }

    /**
     * @return -1 before iteration starts, {@code Integer.MAX_VALUE} after it ends,
     *     otherwise the current document id
     */
    public int docID() {
        return this.currentDoc;
    }

    /**
     * Moves to the next matching document.
     *
     * @return its id, or {@code Integer.MAX_VALUE} when there are no more documents
     * @throws IOException if the postings cannot be read
     */
    public int nextDoc() throws IOException {
        this.next();
        return this.currentDoc;
    }

    /** BM25 term-frequency factor: {@code (k1 + 1) * freq / (freq + k1 * (1 - b + b * length / avgLength))}. */
    private static float tfWeight(float freq, float length, float avgLength, float b, float k1) {
        float den = b * (length / avgLength);
        den = 1 - b + den;
        den = k1 * den;
        den = freq + den;
        float num = k1 + 1;
        num = num * freq;
        return num / den;
    }
}

错误是什么意思?

1 个答案:

答案 0 :(得分:1)

你不能混用不同版本的Lucene,更不能同时使用多个版本的lucene-core。你说你的类路径中现在同时有lucene-core 2.4、2.9、3.0.3和4.5.1,这样是行不通的。看起来你想使用的是Lucene 4.5.1,所以请去掉另外三个lucene-core的jar包。你可能还需要修改那些基于旧版本复制来的代码,使其与4.x兼容(例如报错的那行代码:不再有Version.LUCENE_CURRENT,你必须指定一个具体的版本)。关于3.6到4.0之间的变更以及如何迁移,可以参考migration guide。

至于BM25评分的实现,现在(自4.0起)Lucene核心中已经内置了BM25Similarity。