我的团队正在尝试索引我们的商品信息,并需要对我到目前为止创建的内容进行健全性检查。以下是我们需要搜索的一些文本的示例:
AA4VG90EP4DM1/32R-NSF52F001DX-S WITH DAMAN MANIFOLD 0281CS0011 SIEMENS SPEC 74/07104909/10 REV L  WOMACK SYSTEMS
正如您所看到的,文本中混合了英语单词以及随机的数字和字母组合。在网上做了一些研究后,我决定使用 word_delimiter 词元过滤器和 whitespace 分词器。这是我目前使用的分析器配置:
{
"itemindex": {
"settings": {
"index": {
"uuid": "1HxasKSCSW2iRHf6pYfkWw",
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"filter": [
"lowercase",
"my_word_delimiter"
],
"tokenizer": "whitespace"
}
},
"filter": {
"my_word_delimiter": {
"type_table": "/ => ALPHANUM",
"preserve_original": "true",
"catenate_words": "true",
"type": "word_delimiter"
}
}
},
"number_of_replicas": "1",
"number_of_shards": "5",
"version": {
"created": "1000099"
}
}
}
}
}
以下是来自 Analyze API 的输出:
{
"tokens": [
{
"token": "aa4vg90ep4dm1/32r-nsf52f001dx-s",
"start_offset": 0,
"end_offset": 31,
"type": "word",
"position": 1
},
{
"token": "aa",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 1
},
{
"token": "4",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 2
},
{
"token": "vg",
"start_offset": 3,
"end_offset": 5,
"type": "word",
"position": 3
},
{
"token": "90",
"start_offset": 5,
"end_offset": 7,
"type": "word",
"position": 4
},
{
"token": "ep",
"start_offset": 7,
"end_offset": 9,
"type": "word",
"position": 5
},
{
"token": "4",
"start_offset": 9,
"end_offset": 10,
"type": "word",
"position": 6
},
{
"token": "dm",
"start_offset": 10,
"end_offset": 12,
"type": "word",
"position": 7
},
{
"token": "1/32",
"start_offset": 12,
"end_offset": 16,
"type": "word",
"position": 8
},
{
"token": "r",
"start_offset": 16,
"end_offset": 17,
"type": "word",
"position": 9
},
{
"token": "nsf",
"start_offset": 18,
"end_offset": 21,
"type": "word",
"position": 10
},
{
"token": "rnsf",
"start_offset": 16,
"end_offset": 21,
"type": "word",
"position": 10
},
{
"token": "52",
"start_offset": 21,
"end_offset": 23,
"type": "word",
"position": 11
},
{
"token": "f",
"start_offset": 23,
"end_offset": 24,
"type": "word",
"position": 12
},
{
"token": "001",
"start_offset": 24,
"end_offset": 27,
"type": "word",
"position": 13
},
{
"token": "dx",
"start_offset": 27,
"end_offset": 29,
"type": "word",
"position": 14
},
{
"token": "s",
"start_offset": 30,
"end_offset": 31,
"type": "word",
"position": 15
},
{
"token": "dxs",
"start_offset": 27,
"end_offset": 31,
"type": "word",
"position": 15
},
{
"token": "with",
"start_offset": 32,
"end_offset": 36,
"type": "word",
"position": 16
},
{
"token": "daman",
"start_offset": 37,
"end_offset": 42,
"type": "word",
"position": 17
},
{
"token": "manifold",
"start_offset": 43,
"end_offset": 51,
"type": "word",
"position": 18
},
{
"token": "0281cs0011",
"start_offset": 52,
"end_offset": 62,
"type": "word",
"position": 19
},
{
"token": "0281",
"start_offset": 52,
"end_offset": 56,
"type": "word",
"position": 19
},
{
"token": "cs",
"start_offset": 56,
"end_offset": 58,
"type": "word",
"position": 20
},
{
"token": "0011",
"start_offset": 58,
"end_offset": 62,
"type": "word",
"position": 21
},
{
"token": "siemens",
"start_offset": 63,
"end_offset": 70,
"type": "word",
"position": 22
},
{
"token": "spec",
"start_offset": 71,
"end_offset": 75,
"type": "word",
"position": 23
},
{
"token": "74/07104909/10",
"start_offset": 76,
"end_offset": 90,
"type": "word",
"position": 24
},
{
"token": "rev",
"start_offset": 91,
"end_offset": 94,
"type": "word",
"position": 25
},
{
"token": "l",
"start_offset": 95,
"end_offset": 96,
"type": "word",
"position": 26
},
{
"token": "womack",
"start_offset": 98,
"end_offset": 104,
"type": "word",
"position": 27
},
{
"token": "systems",
"start_offset": 105,
"end_offset": 112,
"type": "word",
"position": 28
}
]
}
最后,这是我正在使用的NEST查询:
// Runs a query_string search against the "itemindex" index and returns one page of hits.
// The query targets Description, VendorPartNumber and ItemNumber, combines terms with OR,
// and appends "*" to the caller-supplied text.
// NOTE(review): the "*" is appended to the whole string, so presumably only the last
// term becomes a prefix match — confirm against the query_string documentation.
var results = client.Search<SearchResult>(searchDescriptor => searchDescriptor
    .Index("itemindex")
    .Query(queryDescriptor => queryDescriptor
        .QueryString(queryString => queryString
            .OnFields(
                doc => doc.Description,
                doc => doc.VendorPartNumber,
                doc => doc.ItemNumber)
            .Operator(Operator.or)
            .Query(query + "*")))
    // Explicitly order hits by relevance score, highest first.
    .SortDescending("_score")
    // Highlight matches in Description only, wrapped in <b>…</b>,
    // with space and comma as the highlighter boundary characters.
    .Highlight(highlight => highlight
        .OnFields(highlightField => highlightField
            .OnField(doc => doc.Description)
            .BoundaryCharacters(" ,")
            .PreTags("<b>")
            .PostTags("</b>")))
    // Paging: skip `start` hits, return at most `size`.
    .From(start)
    .Size(size));