Get Lucene term vectors for terms found in a string

Date: 2018-06-07 09:11:40

Tags: lucene highlight term-vectors

I'm trying to highlight terms within a string. My code walks through the string and looks up the equivalent terms in the index, and it returns the found terms fine. However, I want to give the user back their original input string with the found terms highlighted. I'm using Lucene 4 because that is the version the book I'm learning Lucene from is based on. I made a feeble attempt at getting the term vectors and so on, but it iterates over the whole field and I can't work out how to pick out just the found terms. Here is my code (a short sketch of highlighting the raw input directly follows it):

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class TokenArrayTest {
private static final String INDEX_DIR = "C:/ontologies/Lucene/icnpIndex";
//private static  List<Float> levScore = new ArrayList<Float>();
//add key and value pairs of tokens to a map to send to a servlet. key 10,11,12 etc
    //private static HashMap<Integer, String> hashMap = new HashMap<Integer, String>();
private static List<String> tokens = new ArrayList<String>();

private static int totalResults=0;

public static void main(String[] pArgs) throws IOException, ParseException, InvalidTokenOffsetsException 
{   

    //counters which detect found term changes to advance the html table to the next cell
    int b=1;
    int c=1;
    String searchText="Mrs. smith has limited mobility and fell out of bed. She needs a feeding assessment. She complained of abdominal pains nuring the night. She woke with a headache and she is due for a shower this morning."; 

    //Get directory reference
    Directory dir = FSDirectory.open(new File(INDEX_DIR));

    //Index reader - an interface for accessing a point-in-time view of a lucene index
    IndexReader reader = DirectoryReader.open(dir);

    //Create lucene searcher. It search over a single IndexReader.
    IndexSearcher searcher = new IndexSearcher(reader);

    //analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

    TokenStream tokenStream  = analyzer.tokenStream(null, new StringReader(searchText));
    CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);

    //Query parser to be used for creating TermQuery
    QueryParser qp = new QueryParser(Version.LUCENE_40, "Preferred Term", analyzer);


   /*add all of the words to an array after they have passed through the analyzer.
    * The words are used one by one through the query method later on.
    */
    tokenStream.reset(); //required before the first incrementToken() call
    while (tokenStream.incrementToken()) {
        tokens.add(termAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();

        //print the top half of the html page       
    System.out.print("<html>\r\n" + 
            "\r\n" + 
            "<head>\r\n" + 
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\">\r\n" + 
            "\r\n" + 
            "<title>ICNP results</title>\r\n" + 
            "</head>\r\n" + 
            "\r\n" + 
            "<body>\r\n" + 
            "\r\n" + 
            "<p>"+
            searchText+"<br>"+
            "<p>"+
            "<div align=\"center\">\r\n" + 
            "  <center>\r\n" + 
            "  <table border=\"1\" \r\n" + 
            "    <tr>\r\n" +
            "<td>\r\n"+

            "");


    //place each word from the previous array into the query       
    for(int n=0;n<tokens.size();++n) {

    //Create the query
    Query query = qp.parse(tokens.get(n));

    //Search the lucene documents for the hits
    TopDocs hits = searcher.search(query, 20);

    //Total found documents
    totalResults = totalResults + hits.totalHits;



    //print out the score for each searched term
    //for (ScoreDoc sd : hits.scoreDocs)
    //{
       //Document d = searcher.doc(sd.doc);

       // System.out.println("Score : " + sd.score);


   // }


    /** Highlighter Code Start ****/

    //Put a html code in here for each found term if need be
    Formatter formatter = new SimpleHTMLFormatter("", "");

    //Scores text fragments by the number of unique query terms found
    QueryScorer scorer = new QueryScorer(query);

    //used to markup highlighted terms found in the best sections of a text
    Highlighter highlighter = new Highlighter(formatter, scorer);

    //It breaks text up into same-size texts but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 20);

    //set fragmenter to highlighter
    highlighter.setTextFragmenter(fragmenter);


    //Iterate over found results
    for (int i = 0; i < hits.scoreDocs.length; i++)
    {

        int docid = hits.scoreDocs[i].doc;
        Document doc = searcher.doc(docid);

        //Get stored text from found document
        String text = doc.get("Preferred Term");


        //a pitiful attempt to get term vectors and such like
        //note: getTermVector wants the document id, not the loop index, and it
        //returns null if "Preferred Term" was indexed without term vectors
        Terms termsVector = reader.getTermVector(docid, "Preferred Term");
        TermsEnum termsEnum = termsVector.iterator(null);
        DocsAndPositionsEnum docsAndPositionsEnum = null;
        BytesRef term;
        while ( (term = termsEnum.next()) != null ) {
            String val = term.utf8ToString();
            System.out.println("DocId: " + docid);
            System.out.println("  term: " + val);

            System.out.println("  length: " + term.length);
            docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
            if (docsAndPositionsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { //NO_MORE_DOCS is Integer.MAX_VALUE, so ">= 0" was always true
                int freq = docsAndPositionsEnum.freq();
                System.out.println("  freq: " + docsAndPositionsEnum.freq());
                for (int j = 0; j < freq; j++) {
                    System.out.println("    [");
                    System.out.println("      position: " + docsAndPositionsEnum.nextPosition());
                    System.out.println("      offset start: " + docsAndPositionsEnum.startOffset());
                    System.out.println("      offset end: " + docsAndPositionsEnum.endOffset());
                    System.out.println("    ]");
                }
            }
        }

        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "Preferred Term", analyzer);


        //Get highlighted text fragments
        String[] frags = highlighter.getBestFragments(stream, text,20);


        for (String frag : frags)
        {


            //On the first pass  print this html out         

            if((c==1)&&(b!=c)) {
                System.out.println("<select>");
                c=b;
            }else if((b!=c)) {  //and every other time move to the next cell when b changes
                System.out.println("</select>"
                        + "</td><td>"
                        + "<select>");
                c=b;

            }

            System.out.println("<option value='"+frag+"'>"+frag+"</option>");


        }

    }

    b=b+1;


}
    reader.close();
    dir.close();
    b=1;
    c=1;
    totalResults=0;

    //print the bottom half of the html page
    System.out.print("</select></td>\r\n" + 
            "    </tr>\r\n" + 
            "  </table>\r\n" + 
            "  </center>\r\n" + 
            "</div>\r\n" + 
            "\r\n" + 
            "</body>\r\n" + 
            "\r\n" + 
            "</html>\r\n" + 
            ""); 


    }
}
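
Incidentally, the classic Lucene 4 Highlighter can be pointed straight at the raw input string rather than at a stored field, which is essentially what the question asks for. A minimal, untested sketch along those lines (the <b> tags are illustrative; NullFragmenter, from org.apache.lucene.search.highlight, keeps the whole sentence as a single fragment):

    //sketch only: assumes the analyzer, query and searchText from the code above
    Formatter boldFormatter = new SimpleHTMLFormatter("<b>", "</b>");
    Highlighter inputHighlighter = new Highlighter(boldFormatter, new QueryScorer(query));
    inputHighlighter.setTextFragmenter(new NullFragmenter()); //do not split the input
    TokenStream inputStream = analyzer.tokenStream(null, new StringReader(searchText));
    //returns the marked-up string, or null when the query matches nothing in it
    String highlighted = inputHighlighter.getBestFragment(inputStream, searchText);
    System.out.println(highlighted != null ? highlighted : searchText);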

2 Answers:

Answer 0 (score: 1)

I don't know whether this is possible with Lucene v4, but with a newer version it is easy using either the Highlighter or the UnifiedHighlighter. There are several tutorials covering the different ways to implement text highlighting (just Google them).

If you're starting a fresh project, I strongly recommend using the latest version, even if your book is based on Lucene v4. The book is still a good way to understand how Lucene works, but using an old version of the library is instant technical debt that you will have to deal with later. Beyond that, newer versions usually bring additional features you might be interested in.
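
For illustration, a minimal sketch of the UnifiedHighlighter approach (assuming roughly the Lucene 7/8 API; the field name "Preferred Term" is carried over from the question, and the class and method names are just for the example):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

public class UnifiedHighlighterSketch {

    //prints one snippet per hit; matched terms are wrapped in <b>...</b>
    //by the default formatter
    static void highlightHits(IndexSearcher searcher, Analyzer analyzer, String userQuery)
            throws IOException, ParseException {
        Query query = new QueryParser("Preferred Term", analyzer).parse(userQuery);
        TopDocs topDocs = searcher.search(query, 20);
        UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
        String[] snippets = highlighter.highlight("Preferred Term", query, topDocs);
        for (String snippet : snippets) {
            System.out.println(snippet);
        }
    }
}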

Answer 1 (score: 0)

For future readers, here is my plain old Java method (POJM) that prints out the offsets.

generatePreviewText(analyzer, searchText, tokens, frags);

public static void generatePreviewText(Analyzer analyzer, String inputText, List<String> tokens, String[] frags) throws IOException
{
  String[] contents = { inputText };
  String[] foundTerms = frags;

  //for (int n = 0; n < frags.length; ++n) {
  //    System.out.println("Found terms array = " + foundTerms[n]);
  //}

Directory directory = new RAMDirectory();
IndexWriterConfig config =
        new IndexWriterConfig(Version.LUCENE_40, analyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);

FieldType textFieldType = new FieldType();
textFieldType.setIndexed(true);
textFieldType.setTokenized(true);
textFieldType.setStored(true);
textFieldType.setStoreTermVectors(true);
textFieldType.setStoreTermVectorPositions(true);
textFieldType.setStoreTermVectorOffsets(true);
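//note: reader.getTermVector(...) only returns terms for fields indexed with
//term vectors enabled, as done above; without these settings it returns null,
//which is a likely failure mode for the question's "Preferred Term" field too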

Document doc = new Document();
Field textField = new Field("content", "", textFieldType);



for (String content : contents) {
    textField.setStringValue(content);
    doc.removeField("content");
    doc.add(textField);
    indexWriter.addDocument(doc);
}

indexWriter.commit();
IndexReader indexReader = DirectoryReader.open(directory);
DocsAndPositionsEnum docsAndPositionsEnum = null;
Terms termsVector = null;
TermsEnum termsEnum = null;
BytesRef term = null;
String val = null;

for (int i = 0; i < indexReader.maxDoc(); i++) {
    termsVector = indexReader.getTermVector(i, "content");
    termsEnum = termsVector.iterator(termsEnum);
    while ( (term = termsEnum.next()) != null ) {


        val = term.utf8ToString();

       // if (foundTerms[i].equals(val)) { //foundTerms is an array, and strings need equals()

        System.out.println("  term: " + val);

        System.out.println("  length: " + term.length);
        docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
        if (docsAndPositionsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            int freq = docsAndPositionsEnum.freq();
            System.out.println("  freq: " + docsAndPositionsEnum.freq());
            for (int j = 0; j < freq; j++) {
                System.out.println("    [");
                System.out.println("      position: " + docsAndPositionsEnum.nextPosition());
                System.out.println("      offset start: " + docsAndPositionsEnum.startOffset());
                System.out.println("      offset end: " + docsAndPositionsEnum.endOffset());

                System.out.println("    ]");
            }
        }

        //}
    }
}

indexWriter.close();
indexReader.close();
directory.close();
}
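
To tie the offsets back to the original question, i.e. highlighting the found terms inside the user's input string, they can be applied straight to that string, since StandardAnalyzer reports token offsets into the raw text. A hedged sketch (markupByOffsets and its int[]{start, end} pairs are illustrative, not part of the answer above):

import java.util.Arrays;
import java.util.List;

public class OffsetMarkup {

    //wraps each [start, end) span of the input in <b>...</b>;
    //assumes the spans are sorted ascending and do not overlap, which holds
    //for token offsets collected from a single analyzer pass
    static String markupByOffsets(String input, List<int[]> spans) {
        StringBuilder sb = new StringBuilder();
        int last = 0;
        for (int[] span : spans) {
            sb.append(input, last, span[0]);                               //text before the match
            sb.append("<b>").append(input, span[0], span[1]).append("</b>"); //the matched term
            last = span[1];
        }
        sb.append(input.substring(last));                                  //trailing text
        return sb.toString();
    }

    public static void main(String[] args) {
        //"mobility" occupies offsets 16..24 in this sentence
        List<int[]> spans = Arrays.asList(new int[] { 16, 24 });
        System.out.println(markupByOffsets("She has limited mobility", spans));
        //prints: She has limited <b>mobility</b>
    }
}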