Question

我有一个索引的以下映射和设置：

def init_index():
    """Create the ``social_media`` index and register the ``tweet`` mapping.

    Two analyzers are defined on the index:

    * ``my_english`` -- whitespace tokenizer followed by lowercasing,
      ASCII folding, custom stop-word removal and Snowball stemming.
    * ``my_english_shingle`` -- the same chain plus a 2-word shingle filter
      that also emits unigrams.

    Both analyzers are declared with ``"type": "custom"``. With
    ``"type": "standard"`` Elasticsearch builds a plain standard analyzer and
    ignores the ``tokenizer`` and ``filter`` keys, so none of the filters
    below would run.
    """
    # Shared filter chain; the shingle analyzer appends one more filter.
    base_filters = ["lowercase", "asciifolding", "cust_stop", "my_snow"]

    ES_CLIENT.indices.create(
        index="social_media",
        body={
            "settings": {
                "index": {
                    "number_of_shards": 3,
                    "number_of_replicas": 0,
                },
                "analysis": {
                    "analyzer": {
                        "my_english": {
                            # Must be "custom" for tokenizer/filter to apply.
                            "type": "custom",
                            "tokenizer": "whitespace",
                            "filter": list(base_filters),
                        },
                        "my_english_shingle": {
                            # Must be "custom" for tokenizer/filter to apply.
                            "type": "custom",
                            "tokenizer": "whitespace",
                            "filter": base_filters + ["shingle_filter"],
                        },
                    },
                    "filter": {
                        "cust_stop": {
                            "type": "stop",
                            "stopwords_path": "stoplist.txt",
                        },
                        "shingle_filter": {
                            "type": "shingle",
                            "min_shingle_size": 2,
                            "max_shingle_size": 2,
                            "output_unigrams": True,
                        },
                        "my_snow": {
                            "type": "snowball",
                            "language": "English",
                        },
                    },
                },
            },
        },
    )

    # "text" is a multi_field: the raw value is kept un-analyzed, and two
    # analyzed sub-fields use the analyzers defined on the index above.
    press_mapping = {
        "tweet": {
            "dynamic": "strict",
            "properties": {
                "_id": {
                    "type": "string",
                    "store": True,
                    "index": "not_analyzed",
                },
                "text": {
                    "type": "multi_field",
                    "fields": {
                        "text": {
                            "include_in_all": False,
                            "type": "string",
                            "store": False,
                            "index": "not_analyzed",
                        },
                        "_analyzed": {
                            "type": "string",
                            "store": True,
                            "index": "analyzed",
                            "term_vector": "with_positions_offsets",
                            "analyzer": "my_english",
                        },
                        "_analyzed_shingles": {
                            "type": "string",
                            "store": True,
                            "index": "analyzed",
                            "term_vector": "with_positions_offsets",
                            "analyzer": "my_english_shingle",
                        },
                    },
                },
            },
        },
    }

    # NOTE(review): the index is created through ``ES_CLIENT`` but the mapping
    # is applied through ``constants.ES_CLIENT`` -- presumably the same client;
    # confirm and unify the reference.
    constants.ES_CLIENT.indices.put_mapping(
        index="social_media",
        doc_type="tweet",
        body=press_mapping,
    )

我注意到除了lowercase之外没有其他过滤器工作。两个分析器的termvectors是相同的，因为shingle_filter也不起作用。

GET /social_media/_analyze?analyzer=my_english_shingle&text=complaining when 应该移除 when，把 complaining 词干化为 complain，并返回一个 shingle（complain _），但它给我的结果却是：

{
   "tokens": [
      {
         "token": "complaining",
         "start_offset": 0,
         "end_offset": 11,
         "type": "<ALPHANUM>",
         "position": 1
      },
      {
         "token": "when",
         "start_offset": 12,
         "end_offset": 16,
         "type": "<ALPHANUM>",
         "position": 2
      }
   ]
}

可能是什么原因？

Answer 1

由于您要定义的是新的 custom 分析器，而不是新的 standard 分析器，因此需要把分析器定义中的 type 从 standard 改为 custom。standard 分析器实际上不会采用您传入的这些 tokenizer 和 filter 设置——我个人更希望 ES 在这种情况下抛出异常，但它只是创建了一个不带自定义设置的新 standard 分析器，并忽略了传入的其他所有内容（试着从您的分析器中删除 lowercase 再重新运行，输出仍然是小写的！）：

"analyzer": {
  "my_english": {
    "type": "custom", // <--- CUSTOM
    "tokenizer": "whitespace",
    "filter": [
      "lowercase",
      "asciifolding",
      "stop",
      "my_snow"
    ]
  },
  "my_english_shingle": {
    "type": "custom", // <--- CUSTOM
    "tokenizer": "whitespace",
    "filter": [
      "lowercase",
      "asciifolding",
      "stop",
      "my_snow",
      "shingle_filter"
    ]
  }
}

使用以下查询（我修改了查询文本，并把自定义停用词过滤器 cust_stop 换成了内置的 stop，因为我没有您的停用词文件）GET /social_media/_analyze?analyzer=my_english_shingle&text=COMPLAINING TEST，返回结果为：

{
   "tokens": [
      {
         "token": "complain",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "complain test",
         "start_offset": 0,
         "end_offset": 16,
         "type": "shingle",
         "position": 1
      },
      {
         "token": "test",
         "start_offset": 12,
         "end_offset": 16,
         "type": "word",
         "position": 2
      }
   ]
}

另外，我不确定您使用的 ES 版本，但我的版本要求布尔值 true 和 false 必须小写。

过滤器无法在Elasticsearch中运行

1 个答案: