从索引获取TF-IDF值

时间:2015-02-01 06:31:31

标签: java indexing lucene information-retrieval tf-idf

以下代码用于从索引获取tf-idf值。但是在运行它时遇到错误,在Correct_ME的行上。

使用Lucene 4.8。

DocIndexing.java

    public class DocIndexing {
    private DocIndexing() {}

  /** Index all text files under a directory.
     * @param args
     * @throws java.io.IOException */
  public static void main(String[] args) throws IOException {
   String usage = "java org.apache.lucene.demo.IndexFiles"
                 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
                 + "This indexes the documents in DOCS_PATH, creating a Lucene index"
                 + "in INDEX_PATH that can be searched with Searching";

    String indexPath = "C:/Users/dell/Documents/NetBeansProjects/IndexingSearching/Index";

    String docsPath = "C:/Users/dell/Documents/NetBeansProjects/IndexingSearching/ToBeIndexed"; 
    boolean create = true;
    for(int i=0;i<args.length;i++) {
      if (null != args[i]) switch (args[i]) {
           case "-index":
               indexPath = args[i+1];
               i++;
               break;
           case "-docs":
               docsPath = args[i+1];
               i++;
               break;
           case "-update":
               create = false;
               break;
       }
    }

    if (docsPath == null) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.canRead() && !docDir.isDirectory() &&
            !docDir.isHidden() &&
            !docDir.exists()) {
      System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
      System.exit(1);
    }

    Date start = new Date();
    try {
      System.out.println("Indexing to directory '" + indexPath + "'...");

      Directory dir = FSDirectory.open(new File(indexPath));

       Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
       //Filter filter = new PorterStemFilter();
       IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, analyzer);

      if (create) {

        iwc.setOpenMode(OpenMode.CREATE);
      } else {
        // Add new documents to an existing index:
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      }


        try ( 
                IndexWriter writer = new IndexWriter(dir, iwc)) {

            indexDocs(writer, docDir);
        }

      Date end = new Date();
      System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass() +
       "\n with message: " + e.getMessage());
    }
     Tf_Idf tfidf = new Tf_Idf();


     String field = null,term = null;

     tfidf.scoreCalculator(field, term);




  }


   /*  
   * @param writer Writer to the index where the given file/dir info will be stored
   * @param file The file to index, or the directory to recurse into to find files to index
   * @throws IOException If there is a low-level I/O error
   */
  static void indexDocs(IndexWriter writer, File file)
    throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
      if (file.isDirectory()) {
        String[] files = file.list();
        // an IO error could occur
        if (files != null) {
          for (int i = 0; i < files.length; i++) {
            indexDocs(writer, new File(file, files[i]));
          }
        }
      } else {

        FileInputStream fis;
        try {
          fis = new FileInputStream(file);
        } catch (FileNotFoundException fnfe) {

          return;
        }


        try {
          // make a new, empty document
          Document doc = new Document();



         // Field termV = new LongField("termVector", file.g) 

          Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
          doc.add(pathField);
          Field modifiedField = new LongField("modified", file.lastModified(), Field.Store.NO);
          doc.add(modifiedField);
          Field titleField = new TextField("title", file.getName(), Field.Store.YES);
          doc.add(titleField);
          Field contentsField = new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)));
          doc.add(contentsField);
          //contentsField.setBoost((float)0.5);
          //titleField.setBoost((float)2.5);

          /* doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));
          doc.add(new TextField("title", file.getName(), Field.Store.YES));

          doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));
                  */
        //  StringField..setBoost(1.2F);

          if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
          } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
         }

        } finally {

          fis.close();
        }
      }
    }

  }
}

Tf-idf.java

   public class Tf_Idf {
    static float tf = 1;
    static float idf = 0;
    private float tfidf_score;
    static float [] tfidf = null;

     IndexReader indexReader;


        public Tf_Idf() throws IOException {

        this.indexReader = DirectoryReader.open(FSDirectory.open(new File("C:/Users/dell/Documents/NetBeansProjects/IndexingSearching/Index")));

    }
    public void scoreCalculator (String field, String term) throws IOException 
    {


        TFIDFSimilarity  tfidfSIM = new DefaultSimilarity();

        Bits liveDocs = MultiFields.getLiveDocs(indexReader);
            TermsEnum termEnum = MultiFields.getTerms(indexReader, field).iterator(null);
        BytesRef bytesRef=null;
        while ((bytesRef = termEnum.next()) != null) {
            if(bytesRef.utf8ToString().trim().equals(term.trim())) {
                if(termEnum.seekExact(bytesRef)) {
                    idf = tfidfSIM.idf(termEnum.docFreq(), indexReader.numDocs());
                    DocsEnum docsEnum = termEnum.docs(liveDocs, null);
                    if(docsEnum != null) {
                        int doc=0;
                        while((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            tf = tfidfSIM.tf(docsEnum.freq());
                            tfidf_score = tf * idf ;
                            System.out.println(" -tfidf_score-" + tfidf_score);
                        }
                    }
                }
            }

        }

    }

}

1 个答案:

答案 0 :(得分:0)

很明显,您传递给 MultiFields 方法 null IndexReader

IndexReader reader = null;
tfidf.scoreCalculator( reader, field,term);

你需要写这样的东西:

IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(PATH_TO_LUCENE_INDEX)));
tfidf.scoreCalculator( reader, field,term);

当然,您需要使用真实路径重新PATH_TO_LUCENE_INDEX

我看到的另一个问题是 - 您在Tf_Idf中打开IndexReader但未在任何地方使用它,可能是删除它或使用它的好主意,在{{ 1}}方法,例如

scoreCalculator

但是在此类的方法使用字段中, - tfidf.scoreCalculator(field,term); 而非this.indexReader您尝试传递方法indexReader

<强> UPD

scoreCalculator

在此代码中,您需要将&#34; Index&#34; 替换为您的Lucene索引的实际路径,例如 - public Tf_Idf() throws IOException { this.reader = DirectoryReader.open(FSDirectory.open(new File("Index"))); } /home/user/index或您拥有它的任何地方。