I want to prevent the characters - and / from being treated as token or stem break points for certain fields. I thought I had some code that would achieve this behaviour:
"char_filters": {
"type": "word_delimiter",
"type_table": [
"- => ALPHA",
"/ => ALPHA"
]
},
However, it throws an error:
{
    "error": {
        "root_cause": [
            {
                "type": "illegal_argument_exception",
                "reason": "Token filter [char_filters] cannot be used to parse synonyms"
            }
        ],
        "type": "illegal_argument_exception",
        "reason": "Token filter [char_filters] cannot be used to parse synonyms"
    },
    "status": 400
}
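If I am reading the error correctly, the synonym filter seems to re-use the filters that precede it in the chain to parse its own synonym list, and a word_delimiter filter is rejected in that role — but I don't know the right way around that.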
Searching online I found PatternReplaceFilterFactory and a few other approaches, but those substitute one character for another. I want the analyzer to treat the two characters as part of the surrounding string.
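For example, a pattern_replace char filter along these lines (just a sketch — the name slash_to_underscore is a placeholder of mine) would only swap the character for another one before tokenisation, which is not what I am after:

"char_filter": {
    "slash_to_underscore": {
        "type": "pattern_replace",
        "pattern": "/",
        "replacement": "_"
    }
}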
So, for a string like 5/3mm, I would like it to be kept as a single token rather than split into 5 and 3mm.
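For comparison, the whitespace tokenizer does keep the token intact (a quick sketch — switching to it outright would of course change how everything else is split):

POST /products_example/_analyze
{
    "tokenizer": "whitespace",
    "text": "5/3mm ripcurl wetsuit omega"
}
// returns 5/3mm, ripcurl, wetsuit and omega as whole tokens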
Could someone suggest the correct way to achieve this? Below is a simplified PUT and some POST /_analyze requests.
// doc 1 contains what I would like to match
POST /products_example/_doc/1
{
    "ProductDescription_stripped": "RipCurl 5/3mm wetsuit omega",
    "ProductDescription_da_stripped": "RipCurl 5/3mm wetsuit omega"
}
// doc 2 contains only 3mm; it should rank below doc 1's 5/3mm match
POST /products_example/_doc/2
{
    "ProductDescription_stripped": "RipCurl 3mm wetsuit omega",
    "ProductDescription_da_stripped": "RipCurl 3mm wetsuit omega"
}
// here you can see that 3mm gets tokenised on its own, whereas 5/3mm should have been preserved
POST /products_example/_analyze
{
    "tokenizer": "standard",
    "filter": ["lowercase", "asciifolding", "synonym", "stop", "kstem"],
    "text": "5/3mm ripcurl wetsuit omega"
}
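Isolating the tokenizer suggests the split happens before any token filter runs, so I suspect a token filter on its own comes too late:

POST /products_example/_analyze
{
    "tokenizer": "standard",
    "text": "5/3mm"
}
// returns two tokens: 5 and 3mm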
PUT /products_example
{
    "settings": {
        "index.mapping.total_fields.limit": 1000000,
        "index.max_ngram_diff": 2,
        "analysis": {
            "filter": {
                "char_filters": {
                    "type": "word_delimiter",
                    "type_table": [
                        "- => ALPHA",
                        "/ => ALPHA"
                    ]
                },
                "description_stemmer_da": { "type": "stemmer", "name": "danish" },
                "stop_da": { "type": "stop", "stopwords": "_danish_" },
                "synonym": {
                    "type": "synonym",
                    "synonyms": ["ripcurl, ripccurl => rip curl"]
                }
            },
            "tokenizer": {
                "ngram_tokenizer": {
                    "type": "ngram",
                    "min_gram": 3,
                    "max_gram": 5,
                    "token_chars": ["letter", "digit"]
                }
            },
            "analyzer": {
                "description": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "char_filters",
                        "lowercase",
                        "asciifolding",
                        "synonym",
                        "stop",
                        "kstem"
                    ]
                },
                "description_da": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "char_filters",
                        "lowercase",
                        "asciifolding",
                        "synonym",
                        "stop_da",
                        "description_stemmer_da"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "ProductDescription_stripped": {
                "type": "text",
                "analyzer": "description"
            },
            "ProductDescription_da_stripped": {
                "type": "text",
                "analyzer": "description_da"
            }
        }
    }
}
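For completeness, here is a variant I have sketched but not verified (only the analysis settings are shown, and the index name products_example_v2 is a placeholder): it switches to the whitespace tokenizer, so 5/3mm survives tokenisation, and moves char_filters after synonym, on the assumption that the synonym filter only inspects the filters that precede it. I don't know whether this is the idiomatic fix:

PUT /products_example_v2
{
    "settings": {
        "analysis": {
            "filter": {
                "char_filters": {
                    "type": "word_delimiter",
                    "type_table": [
                        "- => ALPHA",
                        "/ => ALPHA"
                    ]
                },
                "synonym": {
                    "type": "synonym",
                    "synonyms": ["ripcurl, ripccurl => rip curl"]
                }
            },
            "analyzer": {
                "description": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "filter": ["lowercase", "asciifolding", "synonym", "stop", "kstem", "char_filters"]
                }
            }
        }
    }
}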