有关Elasticsearch搜索结果相关性的问题

时间:2017-04-10 20:03:19

标签: search elasticsearch

我正在尝试使用 Elasticsearch 实现一个简单的中文搜索演示,但搜索结果的相关性存在一些问题。

我使用映射创建了一个新索引:

{
    "tag": {
        "mappings": {
            "tag": {
                "properties": {
                    "name": {
                        "type": "text",
                        "analyzer": "standard"
                    },
                    "note": {
                        "type": "text",
                        "analyzer": "standard"
                    },
                    "status": {
                        "type": "integer"
                    },
                    "synonyms": {
                        "type": "text",
                        "analyzer": "standard"
                    }
                }
            }
        }
    }
}

使用以下请求正文查询"美国":

{
    "query" : {
         "bool" : {
             "must" : {
                 "multi_match" : {
                     "query" : "美国",
                     "fields" : [ "name", "synonyms" ]
                 }
             },
             "filter" : {
                 "term" : {
                     "status" : 2
                 }
             }
         }
     }
 }

有两条记录"中国"和"美国"匹配该查询,但记录"中国"反而获得了更高的分数。响应 JSON 如下:

{
    "took": 2,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 0.7373906,
        "hits": [ {
            "_index": "tag",
            "_type": "tag",
            "_id": "5482361185636870",
            "_score": 0.7373906,
            "_source": {
                "status": 2,
                "name": "中国",
                "note": "",
                "synonyms": []
            }
        }, {
            "_index": "tag",
            "_type": "tag",
            "_id": "5474649504748034",
            "_score": 0.53484553,
            "_source": {
                "status": 2,
                "name": "美国",
                "note": "",
                "synonyms": []
            }
        } ]
    }
}

记录"中国"获得了0.7373906分,而记录"美国"只有0.53484553分。

结果解释:

{
  "hits": [
    {
      "_shard": "[tag][0]",
      "_node": "Wh9qH0bcTAaVNrsP1Aiyxg",
      "_index": "tag",
      "_type": "tag",
      "_id": "5482361185636870",
      "_score": 0.7373906,
      "_source": {
        "status": 2,
        "name": "中国",
        "note": "",
        "synonyms": []
      },
      "_explanation": {
        "value": 0.73739064,
        "description": "sum of:",
        "details": [
          {
            "value": 0.73739064,
            "description": "sum of:",
            "details": [
              {
                "value": 0.73739064,
                "description": "max of:",
                "details": [
                  {
                    "value": 0.73739064,
                    "description": "sum of:",
                    "details": [
                      {
                        "value": 0.73739064,
                        "description": "weight(name:国 in 0) [PerFieldSimilarity], result of:",
                        "details": [
                          {
                            "value": 0.73739064,
                            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                            "details": [
                              {
                                "value": 0.6931472,
                                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "docFreq",
                                    "details": []
                                  },
                                  {
                                    "value": 2,
                                    "description": "docCount",
                                    "details": []
                                  }
                                ]
                              },
                              {
                                "value": 1.0638298,
                                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "termFreq=1.0",
                                    "details": []
                                  },
                                  {
                                    "value": 1.2,
                                    "description": "parameter k1",
                                    "details": []
                                  },
                                  {
                                    "value": 0.75,
                                    "description": "parameter b",
                                    "details": []
                                  },
                                  {
                                    "value": 3,
                                    "description": "avgFieldLength",
                                    "details": []
                                  },
                                  {
                                    "value": 2.56,
                                    "description": "fieldLength",
                                    "details": []
                                  }
                                ]
                              }
                            ]
                          }
                        ]
                      }
                    ]
                  }
                ]
              },
              {
                "value": 0,
                "description": "match on required clause, product of:",
                "details": [
                  {
                    "value": 0,
                    "description": "# clause",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "status:[2 TO 2], product of:",
                    "details": [
                      {
                        "value": 1,
                        "description": "boost",
                        "details": []
                      },
                      {
                        "value": 1,
                        "description": "queryNorm",
                        "details": []
                      }
                    ]
                  }
                ]
              }
            ]
          },
          {
            "value": 0,
            "description": "match on required clause, product of:",
            "details": [
              {
                "value": 0,
                "description": "# clause",
                "details": []
              },
              {
                "value": 1,
                "description": "*:*, product of:",
                "details": [
                  {
                    "value": 1,
                    "description": "boost",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "queryNorm",
                    "details": []
                  }
                ]
              }
            ]
          }
        ]
      }
    },
    {
      "_shard": "[tag][4]",
      "_node": "Wh9qH0bcTAaVNrsP1Aiyxg",
      "_index": "tag",
      "_type": "tag",
      "_id": "5474649504748034",
      "_score": 0.51623213,
      "_source": {
        "status": 2,
        "name": "美国",
        "note": "",
        "synonyms": []
      },
      "_explanation": {
        "value": 0.51623213,
        "description": "sum of:",
        "details": [
          {
            "value": 0.51623213,
            "description": "sum of:",
            "details": [
              {
                "value": 0.51623213,
                "description": "max of:",
                "details": [
                  {
                    "value": 0.51623213,
                    "description": "sum of:",
                    "details": [
                      {
                        "value": 0.25811607,
                        "description": "weight(name:美 in 0) [PerFieldSimilarity], result of:",
                        "details": [
                          {
                            "value": 0.25811607,
                            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                            "details": [
                              {
                                "value": 0.2876821,
                                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "docFreq",
                                    "details": []
                                  },
                                  {
                                    "value": 1,
                                    "description": "docCount",
                                    "details": []
                                  }
                                ]
                              },
                              {
                                "value": 0.89722675,
                                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "termFreq=1.0",
                                    "details": []
                                  },
                                  {
                                    "value": 1.2,
                                    "description": "parameter k1",
                                    "details": []
                                  },
                                  {
                                    "value": 0.75,
                                    "description": "parameter b",
                                    "details": []
                                  },
                                  {
                                    "value": 2,
                                    "description": "avgFieldLength",
                                    "details": []
                                  },
                                  {
                                    "value": 2.56,
                                    "description": "fieldLength",
                                    "details": []
                                  }
                                ]
                              }
                            ]
                          }
                        ]
                      },
                      {
                        "value": 0.25811607,
                        "description": "weight(name:国 in 0) [PerFieldSimilarity], result of:",
                        "details": [
                          {
                            "value": 0.25811607,
                            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                            "details": [
                              {
                                "value": 0.2876821,
                                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "docFreq",
                                    "details": []
                                  },
                                  {
                                    "value": 1,
                                    "description": "docCount",
                                    "details": []
                                  }
                                ]
                              },
                              {
                                "value": 0.89722675,
                                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "termFreq=1.0",
                                    "details": []
                                  },
                                  {
                                    "value": 1.2,
                                    "description": "parameter k1",
                                    "details": []
                                  },
                                  {
                                    "value": 0.75,
                                    "description": "parameter b",
                                    "details": []
                                  },
                                  {
                                    "value": 2,
                                    "description": "avgFieldLength",
                                    "details": []
                                  },
                                  {
                                    "value": 2.56,
                                    "description": "fieldLength",
                                    "details": []
                                  }
                                ]
                              }
                            ]
                          }
                        ]
                      }
                    ]
                  }
                ]
              },
              {
                "value": 0,
                "description": "match on required clause, product of:",
                "details": [
                  {
                    "value": 0,
                    "description": "# clause",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "status:[2 TO 2], product of:",
                    "details": [
                      {
                        "value": 1,
                        "description": "boost",
                        "details": []
                      },
                      {
                        "value": 1,
                        "description": "queryNorm",
                        "details": []
                      }
                    ]
                  }
                ]
              }
            ]
          },
          {
            "value": 0,
            "description": "match on required clause, product of:",
            "details": [
              {
                "value": 0,
                "description": "# clause",
                "details": []
              },
              {
                "value": 1,
                "description": "*:*, product of:",
                "details": [
                  {
                    "value": 1,
                    "description": "boost",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "queryNorm",
                    "details": []
                  }
                ]
              }
            ]
          }
        ]
      }
    }
  ]
}

1 个答案:

答案 0 :(得分:3)

您的索引似乎只包含少量文档,但它们分布在不同的分片上。每个分片(shard)都维护自己的词项频率统计(从上面的解释结果可以看到,两条记录分别位于分片0和分片4,二者的docCount和avgFieldLength各不相同)。默认情况下,Elasticsearch 使用这些分片本地的统计值来计算得分。不过,您可以通过指定 search_type=dfs_query_then_fetch 查询字符串参数,或在请求正文中添加相应的字段来改变这一行为:

{
    "search_type": "dfs_query_then_fetch",
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": "美国",
                    "fields": [
                        "name",
                        "synonyms"
                    ]
                }
            },
            "filter": {
                "term": {
                    "status": 2
                }
            }
        }
    }
}

可以参考这篇文章:https://www.elastic.co/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch