Lucene,突出显示和NullPointerException

时间:2011-03-04 16:15:00

标签: lucene highlighting

我想突出一些结果。我在“内容”字段中索引文档的正文(文本),当我使用highlighter.getBestFragment(...)尝试使用高亮时,我得到一个NullPointerException。

但是,例如,当我尝试突出显示fileName时,它可以正常工作。 我知道,因为我只使用了一个带有fileReader或(ParsingReader)的字段,我的文本被标记化,这与文件名不同。

这是我的代码,请帮助我。

package xxxxxx;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.ParsingReader;

public class Indexer {

    static long start = 0;

    public static void main(String[] args) throws Exception {
        System.out.println("l'index se trouve à " + args[0]);
        System.out.println("le dossier ou s'effectue l'indexation est :" + args[1]);
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
                    + " <index dir> <data dir>");
        }

        String indexDir = args[0];
        String dataDir = args[1];


        start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());


        } finally {

            indexer.close();
        }

        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }
    private IndexWriter writer;

    public Indexer(String indexDir) throws IOException, InterruptedException {
        Directory dir = FSDirectory.open(new File(indexDir));

        writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(true);
    }

    public void close() throws IOException {
        writer.optimize();
        writer.close();
    }

    public int index(String dataDir, FileFilter filter) throws Exception {

        File[] files = new File(dataDir).listFiles();

        for (File f : files) {

            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && (filter == null || filter.accept(f))) {

                if (!(f.getCanonicalPath().endsWith("~"))) {
                    indexFile(f);
                }
            } else {
                index(f.toString(), filter);
            }
        }
        return writer.numDocs();
    }

    private static class TextFilesFilter implements FileFilter {

        public boolean accept(File path) {
            return true;
        }
    }

    protected Document getDocument(File f) throws Exception {
       // FileReader frf = new FileReader(f);
        Document doc = new Document();
        Reader reader = new ParsingReader(f);

        doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED ));
        doc.add(new Field("fullpath", f.getCanonicalPath(),Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        return doc;
    }

    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
        System.out.println(System.currentTimeMillis() - start);
    }
}

-------------------------------------------------------------------



    package xxxxxxxxxxxxxxxxxxxx;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searcher {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException, InvalidTokenOffsetsException {
        System.out.println("endroit ou se situe l'index " + args[0]);
        System.out.println(args[1]);
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName()
                    + " <index dir> <query>");
        }

        String indexDir = args[0];
        String q = args[1];
        search(indexDir, q);
    }


    public static void search(String indexDir, String q) throws IOException, ParseException, InvalidTokenOffsetsException {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexSearcher indexSearcher = new IndexSearcher(dir);
        QueryParser parserC = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30));
  //      QueryParser parserN = new QueryParser(Version.LUCENE_30, "filename", new StandardAnalyzer(Version.LUCENE_30));
        QueryParser parserP = new QueryParser(Version.LUCENE_30, "fullpath", new StandardAnalyzer(Version.LUCENE_30));
        parserC.setDefaultOperator(QueryParser.Operator.OR);
    //    parserN.setDefaultOperator(QueryParser.Operator.OR);
        parserC.setPhraseSlop(10);
      //  parserN.setPhraseSlop(10);
        DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(6);

        Query query = new MultiFieldQueryParser(Version.LUCENE_30, new String[]{"contents", "filename"},
                new CustomAnalyzer()).parse(q);

        Query queryC = parserC.parse(q);
        //Query queryN = parserN.parse(q);
        dmq.add(queryC);
        //dmq.add(queryN);
        //     dmq.add(query)      ;
        QueryScorer scorer = new QueryScorer(dmq, "contents");
        Highlighter highlighter = new Highlighter(scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));


        System.out.println(query.toString());
        long start = System.currentTimeMillis();
        TopDocs hits = indexSearcher.search(dmq, 15);
        System.out.println(hits.totalHits);
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits
                + " document(s) (in " + (end - start)
                + " milliseconds) that matched query '"
                + q + "':");

        for (ScoreDoc scoreDoc : hits.scoreDocs) {

            Document doc = indexSearcher.doc(scoreDoc.doc);
            System.out.print(scoreDoc.score);
            System.out.println(doc.get("fullpath"));

 String contents = doc.get("contents"); // I am pretty sure the mistake is here , contents is always Null
 //But what can I do to make this thing work ?
            TokenStream stream =
                    TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(),
                    scoreDoc.doc,
                    "contents",
                    doc,
                    new StandardAnalyzer(Version.LUCENE_30));
            String fragment =
                    highlighter.getBestFragment(stream, contents);
            System.out.println(fragment);
        }
        indexSearcher.close();
    }
}

----------------------------------------------------------------------

1 个答案:

答案 0 :(得分:0)

如果你想使用那个荧光笔,你需要存储它。存储“文件名”但不存在“内容”,这就是为什么你看到它们表现不同的原因:

    doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED ));