ElasticSearch查询使用聚合执行不区分大小写的搜索以获取匹配的列计数

时间:2017-03-05 16:30:32

标签: hadoop elasticsearch

For the data, please refer to the attached image.
我们正在使用 ElasticSearch 5.0.0。如果可以通过正则表达式或其他任何方式执行不区分大小写的搜索,请告诉我们。ElasticSearch 中 movies 索引的数据请参见附图。

下面是用于在 movies 索引中查找与搜索字符串“drama”匹配的字段的聚合查询:

GET /movies/_search?pretty
{
  "size": 0,
  "_source": false,
  "query": {
    "query_string": {
      "analyze_wildcard": true,
      "query": "*drama*"
    }
  },
  "aggs": {
    "distinct_tables_1": {
      "terms": {
        "field": "_type"
      },
      "aggs": {
        "distinct_col_1": {
          "terms": {
            "field": "genres.keyword",
            "include" : ".*drama.*"
          }
        }
      }
    },
    "distinct_tables_2": {
      "terms": {
        "field": "_type"
      },
      "aggs": {
        "distinct_col_2": {
          "terms": {
            "field": "director.keyword",
            "include" : ".*drama.*"
          }
        }
      }
    },
     "distinct_tables_3": {
      "terms": {
        "field": "_type"
      },
      "aggs": {
        "distinct_col_3": {
          "terms": {
            "field": "theatre.keyword",
           "include" : ".*drama.*"
          }
        }
      }
    }
  }
}

我们得到以下回复:

{
  "took": 10,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 4,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "distinct_tables_1": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "movie_intrnl",
          "doc_count": 2,
          "distinct_col_1": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        },
        {
          "key": "movie_shows",
          "doc_count": 2,
          "distinct_col_1": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        }
      ]
    },
    "distinct_tables_2": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "movie_intrnl",
          "doc_count": 2,
          "distinct_col_2": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        },
        {
          "key": "movie_shows",
          "doc_count": 2,
          "distinct_col_2": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        }
      ]
    },
    "distinct_tables_3": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "movie_intrnl",
          "doc_count": 2,
          "distinct_col_3": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        },
        {
          "key": "movie_shows",
          "doc_count": 2,
          "distinct_col_3": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        }
      ]
    }
  }
}

从响应中可以看出,即使存在与搜索字符串“drama”匹配的文档,聚合结果中也没有返回任何匹配的列值。聚合中的正则表达式匹配似乎区分大小写,因此没有返回任何值。为了实现不区分大小写的搜索,我们改用下面的备用查询来查找与“Drama”匹配的词。但是,该查询只使用了单词的一部分(用“rama”代替“Drama”),实际上并没有真正执行不区分大小写的搜索。

GET /movies/_search?pretty
{
  "size": 0,
  "_source": false,
  "query": {
    "query_string": {
      "analyze_wildcard": true,
      "query": "*drama*"
    }
  },
  "aggs": {
    "distinct_tables_1": {
      "terms": {
        "field": "_type"
      },
      "aggs": {
        "distinct_col_1": {
          "terms": {
            "field": "genres.keyword",
            "include" : ".*rama.*"
          }
        }
      }
    },
    "distinct_tables_2": {
      "terms": {
        "field": "_type"
      },
      "aggs": {
        "distinct_col_2": {
          "terms": {
            "field": "director.keyword",
            "include" : ".*rama.*"
          }
        }
      }
    },
     "distinct_tables_3": {
      "terms": {
        "field": "_type"
      },
      "aggs": {
        "distinct_col_3": {
          "terms": {
            "field": "theatre.keyword",
            "include" : ".*rama.*"
          }
        }
      }
    }
  }
}

上面给出的查询的响应:

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 4,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "distinct_tables_1": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "movie_intrnl",
          "doc_count": 2,
          "distinct_col_1": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "BiographyDrama",
                "doc_count": 1
              },
              {
                "key": "Drama",
                "doc_count": 1
              }
            ]
          }
        },
        {
          "key": "movie_shows",
          "doc_count": 2,
          "distinct_col_1": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "BiographyDrama",
                "doc_count": 1
              },
              {
                "key": "Drama",
                "doc_count": 1
              }
            ]
          }
        }
      ]
    },
    "distinct_tables_2": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "movie_intrnl",
          "doc_count": 2,
          "distinct_col_2": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "Drama1",
                "doc_count": 1
              },
              {
                "key": "Drama4",
                "doc_count": 1
              }
            ]
          }
        },
        {
          "key": "movie_shows",
          "doc_count": 2,
          "distinct_col_2": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        }
      ]
    },
    "distinct_tables_3": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "movie_intrnl",
          "doc_count": 2,
          "distinct_col_3": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        },
        {
          "key": "movie_shows",
          "doc_count": 2,
          "distinct_col_3": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "Drama4",
                "doc_count": 1
              }
            ]
          }
        }
      ]
    }
  }
}

1 个答案:

答案 0 :(得分:0)

您需要的是为该字段配置分析器(analyzer)。具体来说,您需要更新映射,使用带有小写分词器(lowercase tokenizer)的分析器(例如 Simple Analyzer)来分析 Genre 字段,或者您也可以自己定义一个自定义分析器。

之后,您就可以根据需要以不区分大小写的方式搜索数据。