有没有办法从Lucene索引中获得字数统计?

时间:2019-07-03 21:14:46

标签: lucene word-count

我正在使用最新的Apache-Lucene版本8.1.1

有可能吗?如何获取存储在Lucene索引中的所有(非停用词)术语的单词计数?结果应该是:

term1 453443
term2 445484
term3 443333

我需要用Java或Scala编写,但是任何语言都可以说明它的API ...

1 个答案:

答案 0 :(得分:1)

您可以在下面找到示例实现。

请注意,count给出的是文档数,而不是出现的次数(nosetests --pdb个字数4,文档数3)。此外,停用词也不会省略。

lucene

这将输出:

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.misc.HighFreqTerms;
import org.apache.lucene.misc.TermStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class LuceneTest2 {
    final static String index = "index";
    final static String field = "text";

    public static void index() {
        try {
            Directory dir = FSDirectory.open(Paths.get(index));
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

            iwc.setOpenMode(OpenMode.CREATE);

            IndexWriter writer = new IndexWriter(dir, iwc);

            String[] lines = {
                    "lucene java lucene mark",
                    "lucene",
                    "lucene an example",
                    "java python"
            };
            for (int i = 0; i < lines.length; i++) {
                String line = lines[i];
                Document doc = new Document();
                doc.add(new StringField("id", "" + i, Field.Store.YES));
                doc.add(new TextField(field, line.trim(), Field.Store.YES));
                writer.addDocument(doc);
            }

            System.out.println("indexed " + lines.length + " sentences");
            writer.close();
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

    public static void count() {
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
            int numTerms = 100;
            TermStats[] stats = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
            for (TermStats termStats : stats) {
                String termText = termStats.termtext.utf8ToString();
                System.out.println(termText + " " + termStats.docFreq);
            }
            reader.close();
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

    public static void main(String[] args) {
        index();
        count();
    }
}