Lucene查询已知语料库

时间:2013-12-10 21:58:52

标签: java lucene

我的情况是这样的:

我正在对一批医疗记录进行搜索。作为改进搜索的一部分,我构建了一个包含1万个已知文档的索引。从理论上讲,如果我的搜索是正确的,那么我应该能够遍历整个集合,对其中每一个文档执行搜索,并且每次都把该文档作为得分最高(即排在第一位)的结果取回。

然而,这并不是我实际看到的行为。大约只有十分之三的情况下能把我要找的文档排在第一位,有时它甚至不在前5名之内。原因是这些文档的得分全都相同。我能确认自己是否找到了正确的文档,因为我给每个文档都分配了一个不参与查询的唯一ID。

鉴于我明确了我正在寻找的字段中的术语,并且我知道索引的确切内容,顶部文档应该是我寻求的(理论上)。

具体来说,我试图找到一个基于字段的相关文档:icdXcode,其中该字段是一个以空格分隔的值集。例如:

icd9Codes:“123456 938547 029381”

在现实场景中,我并不了解语料库的内容,而且搜索时使用的代码列表几乎肯定是不完整的。问题:我需要取回在给定代码字段中匹配代码数量最多的文档。如果有人能提供一些见解,那就太好了。谢谢 Stack 社区。

现在有些代码:

文件格式:

/**
 * Builds the Lucene {@link Document} representing this record.
 *
 * <p>{@code gender} and {@code ethnicity} are added as {@code StringField}s,
 * {@code claimId}, {@code age} and {@code zipcode} as {@code IntField}s, and each
 * of the four ICD code collections is joined with single spaces (via
 * {@code delimitedCollection}) and added as one {@code TextField}. Every field
 * is stored.
 *
 * @return a freshly created document holding all of the fields above
 */
public Document getDocument() {
    final Document doc = new Document();

    // Demographic values added as StringFields.
    doc.add(new StringField("gender", gender, Field.Store.YES));
    doc.add(new StringField("ethnicity", ethnicity, Field.Store.YES));

    // Numeric values added as IntFields.
    doc.add(new IntField("claimId", claimId, Field.Store.YES));
    doc.add(new IntField("age", age, Field.Store.YES));
    doc.add(new IntField("zipcode", zipCode, Field.Store.YES));

    // ICD-9 code lists: one space-delimited TextField per list.
    final String icd9dx = delimitedCollection(icd9dxCodes, " ").toString();
    doc.add(new TextField("icd9dxCodes", icd9dx, Field.Store.YES));

    final String icd9pcs = delimitedCollection(icd9pcsCodes, " ").toString();
    doc.add(new TextField("icd9pcsCodes", icd9pcs, Field.Store.YES));

    // ICD-10 code lists: same layout as the ICD-9 ones.
    final String icd10dx = delimitedCollection(icd10dxCodes, " ").toString();
    doc.add(new TextField("icd10dxCodes", icd10dx, Field.Store.YES));

    final String icd10pcs = delimitedCollection(icd10pcsCodes, " ").toString();
    doc.add(new TextField("icd10pcsCodes", icd10pcs, Field.Store.YES));

    return doc;
}

搜索代码:

/**
 * Searches {@code index} for the indexed document that best matches
 * {@code docToFind} and returns the top-scoring hit.
 *
 * <p>The query is a {@link BooleanQuery} made only of optional (SHOULD) clauses:
 * one {@link TermQuery} per ICD-10 diagnosis code, plus gender, ethnicity, age
 * and zipcode. The claim id is deliberately not part of the query; it is only
 * used afterwards to check whether the top hit is the expected document, and
 * if it is not, the remaining hits are printed for debugging.
 *
 * @param index     directory holding the Lucene index to search
 * @param docToFind the record whose indexed counterpart is being looked up
 * @return the highest-scoring hit converted by {@code marshal}, or
 *         {@code null} when the query matches nothing
 * @throws IOException if the index cannot be opened or read
 */
private ConvertedDocument findDocument(Directory index, ConvertedDocument docToFind) throws IOException {
    final BooleanQuery bq = new BooleanQuery();

    for (String tag : docToFind.getIcd10dxCodes()) {
        bq.add(new TermQuery(new Term("icd10dxCodes", tag)), SHOULD);
    }

    bq.add(new TermQuery(new Term("gender", docToFind.getGender())), SHOULD);
    bq.add(newIntRange("age", docToFind.getAge(), docToFind.getAge(), true, true), SHOULD);
    // Fix: the original built the ethnicity clause from getGender().
    // NOTE(review): assumes ConvertedDocument exposes getEthnicity() alongside getGender() - confirm.
    bq.add(new TermQuery(new Term("ethnicity", docToFind.getEthnicity())), SHOULD);
    bq.add(newIntRange("zipcode", docToFind.getZipCode(), docToFind.getZipCode(), true, true), SHOULD);

    // try-with-resources so the reader is released even when the search throws.
    try (IndexReader reader = DirectoryReader.open(index)) {
        final IndexSearcher searcher = new IndexSearcher(reader);

        final TopScoreDocCollector collector = TopScoreDocCollector.create(10, true);
        searcher.search(bq, collector);
        final ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // Nothing matched: avoid indexing into an empty array.
        if (hits.length == 0) {
            return null;
        }

        final ConvertedDocument resultDoc = marshal(reader.document(hits[0].doc));

        if (docToFind.getClaimId() != resultDoc.getClaimId()) {
            // The top hit is not the expected document; collect the runners-up
            // (hit 0 is already known to be wrong) and report whether the
            // expected document is among them.
            final Set<ConvertedDocument> debugList = new LinkedHashSet<>();
            for (int i = 1; i < hits.length; i++) {
                debugList.add(marshal(reader.document(hits[i].doc)));
            }

            System.out.format("%b: %d -> %s\r\n",debugList.contains(docToFind), docToFind.getClaimId(),debugList );
        }

        return resultDoc;
    }
}

2 个答案:

答案 0 :(得分:0)

看来答案是:对每个代码值分别添加一次同名字段(即多值字段),而查询仍然像原来那样为每个值单独构建一个查询条件。以下是新的文档创建策略:

/**
 * Builds the Lucene {@link Document} for this record using one field instance
 * per ICD code instead of a single space-delimited field.
 *
 * <p>A {@code StringField} is indexed verbatim as a single term, so every code
 * has to be added as its own field instance to be matchable by a per-code
 * {@code TermQuery}. The procedure-code (pcs) lists are handled the same way as
 * the diagnosis-code (dx) lists; joining them into one {@code StringField}
 * would produce a single term containing the whole list.
 *
 * @return a freshly created document with demographic, numeric and
 *         multi-valued ICD code fields, all stored
 */
public Document getDocument() {
    final Document document = new Document();
    document.add(new StringField("gender", gender, Field.Store.YES));
    document.add(new StringField("ethnicity", ethnicity, Field.Store.YES));

    document.add(new IntField("claimId", claimId, Field.Store.YES));
    document.add(new IntField("age", age, Field.Store.YES));
    document.add(new IntField("zipcode", zipCode, Field.Store.YES));

    for (String icd9dxCode : icd9dxCodes) {
        document.add(new StringField("icd9dxCode", icd9dxCode, Field.Store.YES));
    }
    // NOTE(review): assumes icd9pcsCodes / icd10pcsCodes are iterable collections
    // of String, like the dx collections - confirm against the class fields.
    for (String icd9pcsCode : icd9pcsCodes) {
        document.add(new StringField("icd9pcsCode", icd9pcsCode, Field.Store.YES));
    }

    for (String icd10dxCode : icd10dxCodes) {
        document.add(new StringField("icd10dxCode", icd10dxCode, Field.Store.YES));
    }
    for (String icd10pcsCode : icd10pcsCodes) {
        document.add(new StringField("icd10pcsCode", icd10pcsCode, Field.Store.YES));
    }

    return document;
}

这只是概念验证(PoC)的一部分,所以请原谅这些粗糙的代码 :)

答案 1 :(得分:0)

我用几乎相同的代码试了一下,查询时文档的排名是正确的。

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class IndexData{

    public static void main(String argsp[]) throws Exception {
        Directory directory = new RAMDirectory();
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46,new WhitespaceAnalyzer(Version.LUCENE_46));
        IndexWriter writer = new IndexWriter(directory,conf); 

        final Document doc1 = new Document();
        doc1.add(new StringField("gender", "M", Field.Store.YES));
        doc1.add(new StringField("ethnicity", "Asian", Field.Store.YES));
        //first doc
        doc1.add(new IntField("claimId", 100, Field.Store.YES));
        doc1.add(new IntField("age", 10, Field.Store.YES));
        doc1.add(new IntField("zipcode", 20148, Field.Store.YES));
        //add with all codes  - should be ranked 1
        TextField i10Codes1 = new TextField("icd10dxCodes", "123456 938547 029381", Field.Store.YES);
        doc1.add(i10Codes1);
        writer.addDocument(doc1);



        final Document doc2 = new Document();
        doc2.add(new StringField("gender", "M", Field.Store.YES));
        doc2.add(new StringField("ethnicity", "Asian", Field.Store.YES));
        //second doc
        doc2.add(new IntField("claimId", 101, Field.Store.YES));
        doc2.add(new IntField("age", 10, Field.Store.YES));
        doc2.add(new IntField("zipcode", 20148, Field.Store.YES));
        //mess the middle code with somethig different - should be ranked 2
        TextField i10Codes2 = new TextField("icd10dxCodes", "123456 000000 029381", Field.Store.YES);
        doc2.add(i10Codes2);
        writer.addDocument(doc2);

        final Document doc3 = new Document();
        doc3.add(new StringField("gender", "M", Field.Store.YES));
        doc3.add(new StringField("ethnicity", "Asian", Field.Store.YES));
        //third doc
        doc3.add(new IntField("claimId", 102, Field.Store.YES));
        doc3.add(new IntField("age", 10, Field.Store.YES));
        doc3.add(new IntField("zipcode", 20148, Field.Store.YES));
        //mess the first two with somethign different - should be ranked 3
        TextField i10Codes3 = new TextField("icd10dxCodes", "000000 000000 029381", Field.Store.YES);
        doc3.add(i10Codes3);
        writer.addDocument(doc3);



        writer.commit();
        writer.close();
        final BooleanQuery bq = new BooleanQuery();

        for (String tag : new String[]{"123456","938547","029381"}) {
            TermQuery icd10dxCodes = new TermQuery(new Term("icd10dxCodes", tag));
            bq.add(icd10dxCodes, Occur.SHOULD);
        }

        bq.add(new TermQuery(new Term("gender", "M")), Occur.SHOULD);

        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);

        TopScoreDocCollector collector = TopScoreDocCollector.create(10, true);
        searcher.search(bq, collector);
        TopDocs topDocs = collector.topDocs();
        ScoreDoc[] hits = topDocs.scoreDocs;
        System.out.println("hits :"+hits );
        //print results in score order
        for (int i = 0; i < hits.length; i++) {
            System.out.println("Hit["+i+"] :"+hits[i] );
            Document result = reader.document(hits[i].doc);
            System.out.println("Results :"+result.getField("claimId") + " => "+result);
        }

    }

}

这是输出:

hits :[Lorg.apache.lucene.search.ScoreDoc;@fbf107
Hit[0] :doc=0 score=1.125771 shardIndex=-1
Results :stored<claimId:100> => Document<stored,indexed,tokenized,omitNorms,indexOptions=DOCS_ONLY<gender:M> stored,indexed,tokenized,omitNorms,indexOptions=DOCS_ONLY<ethnicity:Asian> stored<claimId:100> stored<age:10> stored<zipcode:20148> stored,indexed,tokenized<icd10dxCodes:123456 938547 029381>>
Hit[1] :doc=1 score=0.47349548 shardIndex=-1
Results :stored<claimId:101> => Document<stored,indexed,tokenized,omitNorms,indexOptions=DOCS_ONLY<gender:M> stored,indexed,tokenized,omitNorms,indexOptions=DOCS_ONLY<ethnicity:Asian> stored<claimId:101> stored<age:10> stored<zipcode:20148> stored,indexed,tokenized<icd10dxCodes:123456 000000 029381>>
Hit[2] :doc=2 score=0.19050911 shardIndex=-1
Results :stored<claimId:102> => Document<stored,indexed,tokenized,omitNorms,indexOptions=DOCS_ONLY<gender:M> stored,indexed,tokenized,omitNorms,indexOptions=DOCS_ONLY<ethnicity:Asian> stored<claimId:102> stored<age:10> stored<zipcode:20148> stored,indexed,tokenized<icd10dxCodes:000000 000000 029381>>