我正在尝试获取给定文档中各个单词的出现频率（词频）。以下是我的代码：
package Lucene;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
public class LuceneTest
public static void main(String[] args)
Terms terms=null;
// Specify the analyzer for tokenizing text.
// The same analyzer should be used for indexing and searching
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
// Code to create the index
Directory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
IndexWriter w = new IndexWriter(index, config);
addDoc(w, "Lucene in Action", "193398817");
addDoc(w, "Lucene for Dummies", "55320055Z");
addDoc(w, "Managing Gigabytes", "55063554A");
addDoc(w, "The Art of Computer Science", "9900333X");
addDoc(w, "My name is teja", "12842d99");
addDoc(w, "Lucene demo by teja", "23k43413");
// Text to search
String querystr = args.length > 0 ? args[0] : "teja";
// The \"title\" arg specifies the default field to use when no field is explicitly specified in the query
Query q = new QueryParser(Version.LUCENE_44, "title", analyzer).parse(querystr);
// Searching code
int hitsPerPage = 10;
IndexReader reader = DirectoryReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
searcher.search(q, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
// Code to display the results of search
System.out.println("Found " + hits.length + " hits.");
for(int i=0;i<hits.length;++i)
int docId = hits[i].doc;
System.out.println("docId "+docId);
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + d.get("isbn") + d.get("title"));
terms = reader.getTermVector(docId, "title"); //get terms vectors for one document and one field
if (terms != null && terms.size() > 0) {
TermsEnum termsEnum = terms.iterator(null); // access the terms for this field
BytesRef term = null;
while ((term = termsEnum.next()) != null) {// explore the terms for this field
DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
int docIdEnum;
while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
System.out.println(term.utf8ToString()+" "+docIdEnum+" "+docsEnum.freq()); //get the term frequency in the document
// reader can only be closed when there is no need to access the documents any more
catch(Exception e)
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException
Document doc = new Document();
// A text field will be tokenized
doc.add(new TextField("title", title, Field.Store.YES));
// We use a string field for isbn because we don\'t want it tokenized
doc.add(new StringField("isbn", isbn, Field.Store.YES));
Found 2 hits.
docId 4
1. 12842d99My name is teja
docId 5
2. 23k43413Lucene demo by teja
从上面的输出可以看到，程序没有打印任何词频。问题似乎出在下面这一行：它返回的 terms 看起来是 null（或者为空）：
terms = reader.getTermVector(docId, "title");
