Question

我正在尝试获取给定文档中不同单词的频率。这是代码快照

package Lucene;

    import java.io.IOException;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopScoreDocCollector;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.Version;


    public class LuceneTest 
    {
        public static void main(String[] args)
        {
             Terms terms=null;
            try
            {
                //  Specify the analyzer for tokenizing text.
                //  The same analyzer should be used for indexing and searching
                StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);

                //  Code to create the index
                Directory index = new RAMDirectory();

                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);

                IndexWriter w = new IndexWriter(index, config);
                addDoc(w, "Lucene in Action", "193398817");
                addDoc(w, "Lucene for Dummies", "55320055Z");
                addDoc(w, "Managing Gigabytes", "55063554A");
                addDoc(w, "The Art of Computer Science", "9900333X");
                addDoc(w, "My name is teja", "12842d99");
                addDoc(w, "Lucene demo by teja", "23k43413");
                w.close();

                //  Text to search
                String querystr = args.length > 0 ? args[0] : "teja";

                //  The \"title\" arg specifies the default field to use when no field is explicitly specified in the query
                Query q = new QueryParser(Version.LUCENE_44, "title", analyzer).parse(querystr);

                // Searching code
                int hitsPerPage = 10;
                IndexReader reader = DirectoryReader.open(index);
                IndexSearcher searcher = new IndexSearcher(reader);
                TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
                searcher.search(q, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;

                //  Code to display the results of search
                System.out.println("Found " + hits.length + " hits.");
                for(int i=0;i<hits.length;++i) 
                {
                  int docId = hits[i].doc;
                  System.out.println("docId  "+docId);
                  Document d = searcher.doc(docId);
                  System.out.println((i + 1) + ". " + d.get("isbn")  + d.get("title"));
                   terms = reader.getTermVector(docId, "title"); //get terms vectors for one document and one field
                    if (terms != null && terms.size() > 0) {
                        TermsEnum termsEnum = terms.iterator(null); // access the terms for this field
                        BytesRef term = null;
                        while ((term = termsEnum.next()) != null) {// explore the terms for this field
                            DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
                            int docIdEnum;
                            while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                              System.out.println(term.utf8ToString()+" "+docIdEnum+" "+docsEnum.freq()); //get the term frequency in the document

                            }
                        }
                    }





                }


                // reader can only be closed when there is no need to access the documents any more
                reader.close();
            }
            catch(Exception e)
            {
                System.out.println(e.getMessage());
            }
        }
        private static void addDoc(IndexWriter w, String title, String isbn) throws IOException 
        {
              Document doc = new Document();
              // A text field will be tokenized
              doc.add(new TextField("title", title, Field.Store.YES));
              // We use a string field for isbn because we don\'t want it tokenized
              doc.add(new StringField("isbn", isbn, Field.Store.YES));
              w.addDocument(doc);
        }
    }

但是当我正在运行时直接获得此输出

Found 2 hits.
docId  4
1. 12842d99My name is teja
docId  5
2. 23k43413Lucene demo by teja

但是在这一行

       terms = reader.getTermVector(docId, "title");

条款无效。任何人都可以帮助我使用Lucene4.4来帮助doc中每个单词的频率。

Answer 1

请参阅TextField documentation：

被索引和标记化的字段，没有术语向量。

如果您想存储TermVectors，可以使用Field，并使用所需的$('.progressvalue').each(function() { var $this = $(this); $this.css('width', $this.data('percent') + '%'); $this.parent().text('tournament progress: ' + $this.data('percent') + '%'); });。

频率术语在Lucene 4.4中无效

1 个答案: