What Elasticsearch analyzer should I use to search product information that mixes English words with part numbers?

Asked: 2014-02-19 18:03:22

Tags: elasticsearch nest

My team is trying to index our item information, and I'd like a sanity check on what I've built so far. Here is an example of the kind of text we need to search:

AA4VG90EP4DM1/32R-NSF52F001DX-S WITH DAMAN MANIFOLD 0281CS0011 SIEMENS SPEC 74/07104909/10 REV L WOMACK SYSTEMS

As you can see, English words are mixed in with what look like random numbers and letters. After doing some research online, I decided to use a word delimiter token filter combined with a whitespace tokenizer. Here is the analyzer I'm currently using:

{
   "itemindex": {
      "settings": {
         "index": {
            "uuid": "1HxasKSCSW2iRHf6pYfkWw",
            "analysis": {
               "analyzer": {
                  "my_analyzer": {
                     "type": "custom",
                     "filter": [
                        "lowercase",
                        "my_word_delimiter"
                     ],
                     "tokenizer": "whitespace"
                  }
               },
               "filter": {
                  "my_word_delimiter": {
                     "type_table": "/ => ALPHANUM",
                     "preserve_original": "true",
                     "catenate_words": "true",
                     "type": "word_delimiter"
                  }
               }
            },
            "number_of_replicas": "1",
            "number_of_shards": "5",
            "version": {
               "created": "1000099"
            }
         }
      }
   }
}
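
To see what the analyzer actually produces, I ran the sample description through the _analyze API with a call roughly like the one below (ES 1.x query-string form; the host and port are assumptions):

    curl -XGET 'localhost:9200/itemindex/_analyze?analyzer=my_analyzer&pretty' \
      -d 'AA4VG90EP4DM1/32R-NSF52F001DX-S WITH DAMAN MANIFOLD 0281CS0011 SIEMENS SPEC 74/07104909/10 REV L WOMACK SYSTEMS'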

Here is the output from the analyze API:

    {
   "tokens": [
      {
         "token": "aa4vg90ep4dm1/32r-nsf52f001dx-s",
         "start_offset": 0,
         "end_offset": 31,
         "type": "word",
         "position": 1
      },
      {
         "token": "aa",
         "start_offset": 0,
         "end_offset": 2,
         "type": "word",
         "position": 1
      },
      {
         "token": "4",
         "start_offset": 2,
         "end_offset": 3,
         "type": "word",
         "position": 2
      },
      {
         "token": "vg",
         "start_offset": 3,
         "end_offset": 5,
         "type": "word",
         "position": 3
      },
      {
         "token": "90",
         "start_offset": 5,
         "end_offset": 7,
         "type": "word",
         "position": 4
      },
      {
         "token": "ep",
         "start_offset": 7,
         "end_offset": 9,
         "type": "word",
         "position": 5
      },
      {
         "token": "4",
         "start_offset": 9,
         "end_offset": 10,
         "type": "word",
         "position": 6
      },
      {
         "token": "dm",
         "start_offset": 10,
         "end_offset": 12,
         "type": "word",
         "position": 7
      },
      {
         "token": "1/32",
         "start_offset": 12,
         "end_offset": 16,
         "type": "word",
         "position": 8
      },
      {
         "token": "r",
         "start_offset": 16,
         "end_offset": 17,
         "type": "word",
         "position": 9
      },
      {
         "token": "nsf",
         "start_offset": 18,
         "end_offset": 21,
         "type": "word",
         "position": 10
      },
      {
         "token": "rnsf",
         "start_offset": 16,
         "end_offset": 21,
         "type": "word",
         "position": 10
      },
      {
         "token": "52",
         "start_offset": 21,
         "end_offset": 23,
         "type": "word",
         "position": 11
      },
      {
         "token": "f",
         "start_offset": 23,
         "end_offset": 24,
         "type": "word",
         "position": 12
      },
      {
         "token": "001",
         "start_offset": 24,
         "end_offset": 27,
         "type": "word",
         "position": 13
      },
      {
         "token": "dx",
         "start_offset": 27,
         "end_offset": 29,
         "type": "word",
         "position": 14
      },
      {
         "token": "s",
         "start_offset": 30,
         "end_offset": 31,
         "type": "word",
         "position": 15
      },
      {
         "token": "dxs",
         "start_offset": 27,
         "end_offset": 31,
         "type": "word",
         "position": 15
      },
      {
         "token": "with",
         "start_offset": 32,
         "end_offset": 36,
         "type": "word",
         "position": 16
      },
      {
         "token": "daman",
         "start_offset": 37,
         "end_offset": 42,
         "type": "word",
         "position": 17
      },
      {
         "token": "manifold",
         "start_offset": 43,
         "end_offset": 51,
         "type": "word",
         "position": 18
      },
      {
         "token": "0281cs0011",
         "start_offset": 52,
         "end_offset": 62,
         "type": "word",
         "position": 19
      },
      {
         "token": "0281",
         "start_offset": 52,
         "end_offset": 56,
         "type": "word",
         "position": 19
      },
      {
         "token": "cs",
         "start_offset": 56,
         "end_offset": 58,
         "type": "word",
         "position": 20
      },
      {
         "token": "0011",
         "start_offset": 58,
         "end_offset": 62,
         "type": "word",
         "position": 21
      },
      {
         "token": "siemens",
         "start_offset": 63,
         "end_offset": 70,
         "type": "word",
         "position": 22
      },
      {
         "token": "spec",
         "start_offset": 71,
         "end_offset": 75,
         "type": "word",
         "position": 23
      },
      {
         "token": "74/07104909/10",
         "start_offset": 76,
         "end_offset": 90,
         "type": "word",
         "position": 24
      },
      {
         "token": "rev",
         "start_offset": 91,
         "end_offset": 94,
         "type": "word",
         "position": 25
      },
      {
         "token": "l",
         "start_offset": 95,
         "end_offset": 96,
         "type": "word",
         "position": 26
      },
      {
         "token": "womack",
         "start_offset": 98,
         "end_offset": 104,
         "type": "word",
         "position": 27
      },
      {
         "token": "systems",
         "start_offset": 105,
         "end_offset": 112,
         "type": "word",
         "position": 28
      }
   ]
}
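
For the analyzer to actually apply to the fields being searched, the mapping has to reference it. A minimal sketch of such a mapping, assuming ES 1.x syntax, a type named item, and field names that mirror the NEST query below (all of those names are assumptions on my part):

    PUT /itemindex/item/_mapping
    {
       "item": {
          "properties": {
             "description":      { "type": "string", "analyzer": "my_analyzer" },
             "vendorPartNumber": { "type": "string", "analyzer": "my_analyzer" },
             "itemNumber":       { "type": "string", "analyzer": "my_analyzer" }
          }
       }
    }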

Finally, here is the NEST query I'm using:

var results = client.Search<SearchResult>(s => s
    .Index("itemindex")
    .Query(q => q
        .QueryString(qs => qs
            .OnFields(f => f.Description, f => f.VendorPartNumber, f => f.ItemNumber)
            .Operator(Operator.or)
            .Query(query + "*")))
    .SortDescending("_score")
    .Highlight(h => h
        .OnFields(f => f
            .OnField(e => e.Description)
            .BoundaryCharacters(" ,")
            .PreTags("<b>")
            .PostTags("</b>")))
    .From(start)
    .Size(size));
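
For reference, I believe that NEST call translates to roughly the search body below. The field names depend on how NEST camel-cases the POCO properties, and the query string, from, and size values are just example placeholders:

    {
       "query": {
          "query_string": {
             "fields": [ "description", "vendorPartNumber", "itemNumber" ],
             "default_operator": "or",
             "query": "aa4vg90ep4dm1*"
          }
       },
       "sort": [ { "_score": { "order": "desc" } } ],
       "highlight": {
          "fields": {
             "description": {
                "boundary_chars": " ,",
                "pre_tags": [ "<b>" ],
                "post_tags": [ "</b>" ]
             }
          }
       },
       "from": 0,
       "size": 25
    }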

0 answers