Prevent tokenisation of certain alpha characters in ElasticSearch

Date: 2019-06-18 10:20:08

Tags: elasticsearch tokenize elastic-stack

I want to prevent the characters - and / from being treated as token (or stem) boundaries on a specific field.

I thought I had some code that would give me this behaviour:

"char_filters": {
    "type": "word_delimiter",
    "type_table": [
        "- => ALPHA",
        "/ => ALPHA"
    ]
},

However, it throws an error:

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Token filter [char_filters] cannot be used to parse synonyms"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "Token filter [char_filters] cannot be used to parse synonyms"
  },
  "status": 400
}
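
As far as I can tell (and I may be reading the error wrong), the problem seems to come from the synonym filter rather than from the type_table itself: the synonym rules appear to be parsed through the token filters listed before the synonym filter, and a word_delimiter filter is apparently not accepted there. The filter definition on its own is accepted by _analyze without complaint, for example (just a sanity-check sketch, using a whitespace tokenizer so the slash survives the tokenizer stage):

POST /_analyze
{
  "tokenizer": "whitespace",
  "filter": [
    {
      "type": "word_delimiter",
      "type_table": [
        "- => ALPHA",
        "/ => ALPHA"
      ]
    },
    "lowercase"
  ],
  "text": "5/3mm wetsuit"
}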

Looking online I found PatternReplaceFilterFactory and a few other approaches, but those replace the characters. I want the analyser to treat the two characters as part of the string.

So I would like the string 5/3mm to be kept as a single token, rather than being split into 5 and 3mm.
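
As far as I can tell, the split actually happens in the tokenizer itself, before any token filter runs, so the type_table on the word_delimiter filter may never get a chance to keep the slash. Running just the standard tokenizer over the text with no filters at all seems to confirm this, as a quick check:

POST /_analyze
{
  "tokenizer": "standard",
  "text": "5/3mm wetsuit"
}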

Could someone suggest the right way to achieve this? Below is a simplified PUT and some POST _analyze requests.

// doc 1 contains what I would like to match
POST /products_example/_doc/1
{
  "ProductDescription_stripped":"RipCurl 5/3mm wetsuit omega",
  "ProductDescription_da_stripped": "RipCurl 5/3mm wetsuit omega"
}

// doc 2 contains only 3mm. Should be prioritised below 5/3mm (match 1)
POST /products_example/_doc/2
{
  "ProductDescription_stripped":"RipCurl 3mm wetsuit omega",
  "ProductDescription_da_stripped": "RipCurl 5/3mm wetsuit omega"
}

// here you can see 3mm has been tokenised separately, whereas 5/3mm should have been preserved
POST /products_example/_analyze
{
  "tokenizer": "standard",
  "filter":  [ "lowercase","asciifolding","synonym","stop","kstem"],
  "text":      "5/3mm ripcurl wetsuit omega"
}



PUT /products/
{

"settings": {
    "index.mapping.total_fields.limit": 1000000,
    "index.max_ngram_diff" : 2,

    "analysis": {

        "filter": {
            "char_filters": {
                "type": "word_delimiter",
                "type_table": [
                    "- => ALPHA",
                    "/ => ALPHA"
                ]
            },

            "description_stemmer_da" : {"type" : "stemmer","name" : "danish"},
            "stop_da" : {"type" : "stop","stopwords":  "_danish_"},
            "synonym" : {
                "type" : "synonym",
                "synonyms" : ["ripcurl, ripccurl => rip curl"]
            }
        },

        "tokenizer": {
            "ngram_tokenizer": {
                "type": "ngram", "min_gram": 3, "max_gram": 5,
                "token_chars": ["letter","digit"]
            }
        },

        "analyzer": {
            "description" : {
                "type": "custom", 
                "tokenizer": "standard",
                "filter": [
                    "char_filters",
                    "lowercase",
                    "asciifolding",
                    "synonym",
                    "stop",
                    "kstem"
                ]
            },

            "description_da": {
                "type":"custom", "tokenizer":"standard",
                "filter": [
                    "char_filters",
                    "lowercase",
                    "asciifolding",
                    "synonym",
                    "stop_da",
                    "description_stemmer_da"
                ]
            }
        }
    }
},

"mappings": {
    "properties": {

         "ProductDescription_stripped": {
            "type": "text",
            "analyzer" : "description"
        },
        "ProductDescription_da_stripped": {
            "type": "text",
            "analyzer": "danish"
        }
    }
}
}
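
One idea I have considered, though I am not sure it is the right approach, is to switch the description analyzer to a whitespace tokenizer and move the word_delimiter filter so that it runs after the synonym filter, which should at least avoid the synonym error. A rough sketch of what I mean (the keep_slash filter name and the products_example_alt index are just placeholders of mine):

PUT /products_example_alt
{
  "settings": {
    "analysis": {
      "filter": {
        "keep_slash": {
          "type": "word_delimiter",
          "type_table": [
            "- => ALPHA",
            "/ => ALPHA"
          ]
        },
        "synonym": {
          "type": "synonym",
          "synonyms": ["ripcurl, ripccurl => rip curl"]
        }
      },
      "analyzer": {
        "description": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "asciifolding",
            "synonym",
            "keep_slash",
            "stop",
            "kstem"
          ]
        }
      }
    }
  }
}

A whitespace tokenizer will of course keep punctuation attached to words, and I have not verified how this interacts with the rest of my analysis (Danish stemming, ngrams, etc.), so any guidance would be appreciated.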

0 Answers:

No answers yet.