Elasticsearch queryNorm across shards

Date: 2017-04-16 23:04:47

Tags: elasticsearch lucene

I'm new to ES and have been looking into scoring in ES, trying to improve the quality of my search results. I've run into a situation where the queryNorm differs wildly across shards (5 times larger on one than on another). I can see the dependence on the idf of the terms in the query, which can differ between shards. However, in my case I have a single search term, and the idf values across the shards are close to each other (definitely not close enough to explain a 5x difference). I'll briefly describe my setup below, including my query and the results from the explain endpoint.

Setup: I have an index of ~6500 documents distributed across 5 shards. I should mention that the fields appearing in the query below have no index-time boosts, and that my setup uses ES 2.4 and "query_then_fetch". My query:

{
  "query" : {
    "bool" : {
      "must" : [ {
        "bool" : {
          "must" : [ ],
          "must_not" : [ ],
          "should" : [ {
                "multi_match" : {
                  "query" : "pds",
                  "fields" : [ "field1" ],
                  "lenient" : true,
                  "fuzziness" : "0"
                }
          }, {
                "multi_match" : {
                  "query" : "pds",
                  "fields" : [ "field2" ],
                  "lenient" : true,
                  "fuzziness" : "0",
                  "boost" : 1000.0
                }
          }, {
                "multi_match" : {
                  "query" : "pds",
                  "fields" : [ "field3" ],
                  "lenient" : true,
                  "fuzziness" : "0",
                  "boost" : 500.0
                }
          }, {
                "multi_match" : {
                  "query" : "pds",
                  "fields" : [ "field4" ],
                  "lenient" : true,
                  "fuzziness" : "0",
                  "boost": 100.0
                }
          } ]
        }
      } ],
      "must_not" : [ ],
      "should" : [ ],
      "filter" : [ ]
    }
  },
  "size" : 1000,
  "min_score" : 0.0
}
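
(For reference, per-hit scoring explanations like the ones shown further below can be requested by adding "explain": true to the search body. The following is a minimal sketch using the Python requests library; the host, index name and the trimmed-down query body are assumptions for illustration, not taken from the original post.)

import requests

# Hypothetical host and index name; adjust to your cluster.
URL = "http://localhost:9200/my_index/_search"

body = {
    "query": {"multi_match": {"query": "pds", "fields": ["field1", "field4"]}},
    "explain": True,  # attach a per-shard scoring explanation to every hit
    "size": 10
}

resp = requests.post(URL, json=body)
for hit in resp.json()["hits"]["hits"]:
    # Each hit then carries _shard and _explanation, as in the output below.
    print(hit["_shard"], hit["_explanation"]["value"])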

Explain output for two of the documents (the queryNorm of one is 5 times that of the other):

{
  "_shard" : 4,
  "_explanation" : {
    "value" : 2.046937,
    "description" : "product of:",
    "details" : [ {
      "value" : 4.093874,
      "description" : "sum of:",
      "details" : [ {
        "value" : 0.112607226,
        "description" : "weight(field1:pds in 93) [PerFieldSimilarity], result of:",
        "details" : [ {
          "value" : 0.112607226,
          "description" : "score(doc=93,freq=1.0), product of:",
          "details" : [ {
            "value" : 0.019996,
            "description" : "queryWeight, product of:",
            "details" : [ {
              "value" : 2.0,
              "description" : "boost",
              "details" : [ ]
            }, {
              "value" : 5.6314874,
              "description" : "idf(docFreq=11, maxDocs=1232)",
              "details" : [ ]
            }, {
              "value" : 0.0017753748,
              "description" : "queryNorm",
              "details" : [ ]
            } ]
          }, {
            "value" : 5.6314874,
            "description" : "fieldWeight in 93, product of:",
            "details" : [ {
              "value" : 1.0,
              "description" : "tf(freq=1.0), with freq of:",
              "details" : [ {
                "value" : 1.0,
                "description" : "termFreq=1.0",
                "details" : [ ]
              } ]
            }, {
              "value" : 5.6314874,
              "description" : "idf(docFreq=11, maxDocs=1232)",
              "details" : [ ]
            }, {
              "value" : 1.0,
              "description" : "fieldNorm(doc=93)",
              "details" : [ ]
            } ]
          } ]
        } ]
      }, {
        "value" : 3.9812667,
        "description" : "weight(field4:pds in 93) [PerFieldSimilarity], result of:",
        "details" : [ {
          "value" : 3.9812667,
          "description" : "score(doc=93,freq=2.0), product of:",
          "details" : [ {
            "value" : 0.9998001,
            "description" : "queryWeight, product of:",
            "details" : [ {
              "value" : 100.0,
              "description" : "boost",
              "details" : [ ]
            }, {
              "value" : 5.6314874,
              "description" : "idf(docFreq=11, maxDocs=1232)",
              "details" : [ ]
            }, {
              "value" : 0.0017753748,
              "description" : "queryNorm",
              "details" : [ ]
            } ]
          }, {
            "value" : 3.9820628,
            "description" : "fieldWeight in 93, product of:",
            "details" : [ {
              "value" : 1.4142135,
              "description" : "tf(freq=2.0), with freq of:",
              "details" : [ {
                "value" : 2.0,
                "description" : "termFreq=2.0",
                "details" : [ ]
              } ]
            }, {
              "value" : 5.6314874,
              "description" : "idf(docFreq=11, maxDocs=1232)",
              "details" : [ ]
            }, {
              "value" : 0.5,
              "description" : "fieldNorm(doc=93)",
              "details" : [ ]
            } ]
          } ]
        } ]
      } ]
    }, {
      "value" : 0.5,
      "description" : "coord(2/4)",
      "details" : [ ]
    } ]
  }
},
{
  "_shard" : 2,
  "_explanation" : {
    "value" : 0.4143453,
    "description" : "product of:",
    "details" : [ {
      "value" : 0.8286906,
      "description" : "sum of:",
      "details" : [ {
        "value" : 0.018336227,
        "description" : "weight(field1:pds in 58) [PerFieldSimilarity], result of:",
        "details" : [ {
          "value" : 0.018336227,
          "description" : "score(doc=58,freq=1.0), product of:",
          "details" : [ {
            "value" : 0.0030464241,
            "description" : "queryWeight, product of:",
            "details" : [ {
              "value" : 2.0,
              "description" : "boost",
              "details" : [ ]
            }, {
              "value" : 6.0189342,
              "description" : "idf(docFreq=11, maxDocs=1815)",
              "details" : [ ]
            }, {
              "value" : 2.5307006E-4,
              "description" : "queryNorm",
              "details" : [ ]
            } ]
          }, {
            "value" : 6.0189342,
            "description" : "fieldWeight in 58, product of:",
            "details" : [ {
              "value" : 1.0,
              "description" : "tf(freq=1.0), with freq of:",
              "details" : [ {
                "value" : 1.0,
                "description" : "termFreq=1.0",
                "details" : [ ]
              } ]
            }, {
              "value" : 6.0189342,
              "description" : "idf(docFreq=11, maxDocs=1815)",
              "details" : [ ]
            }, {
              "value" : 1.0,
              "description" : "fieldNorm(doc=58)",
              "details" : [ ]
            } ]
          } ]
        } ]
      }, {
        "value" : 0.81035435,
        "description" : "weight(field4:pds in 58) [PerFieldSimilarity], result of:",
        "details" : [ {
          "value" : 0.81035435,
          "description" : "score(doc=58,freq=2.0), product of:",
          "details" : [ {
            "value" : 0.1523212,
            "description" : "queryWeight, product of:",
            "details" : [ {
              "value" : 100.0,
              "description" : "boost",
              "details" : [ ]
            }, {
              "value" : 6.0189342,
              "description" : "idf(docFreq=11, maxDocs=1815)",
              "details" : [ ]
            }, {
              "value" : 2.5307006E-4,
              "description" : "queryNorm",
              "details" : [ ]
            } ]
          }, {
            "value" : 5.3200364,
            "description" : "fieldWeight in 58, product of:",
            "details" : [ {
              "value" : 1.4142135,
              "description" : "tf(freq=2.0), with freq of:",
              "details" : [ {
                "value" : 2.0,
                "description" : "termFreq=2.0",
                "details" : [ ]
              } ]
            }, {
              "value" : 6.0189342,
              "description" : "idf(docFreq=11, maxDocs=1815)",
              "details" : [ ]
            }, {
              "value" : 0.625,
              "description" : "fieldNorm(doc=58)",
              "details" : [ ]
            } ]
          } ]
        } ]
      } ]
    }, {
      "value" : 0.5,
      "description" : "coord(2/4)",
      "details" : [ ]
    } ]
  }
}

Note that the queryNorm on field1 for the document in shard 4 is 0.0017753748 (with idf 5.6314874), while the queryNorm for the same field for the document in shard 2 is 2.5307006E-4 (idf 6.0189342). I tried to trace the queryNorm computation by hand using the formula at http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html, but failed to arrive at the same values.

I haven't seen many threads/posts about how queryNorm is computed; the one I found useful is http://www.openjems.com/tag/querynorm/ (it's actually about Solr, but since the query uses "query_then_fetch" the Lucene computation should be the only thing that matters, so I'd expect them to behave similarly). However, I wasn't able to arrive at the correct queryNorm values using the same approach (as far as I can tell, t.getBoost() should be 1 in my case, since there is no index-time field boosting and no special field boosting in the query above).
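
For what it's worth, in Lucene's classic TF/IDF similarity queryNorm = 1 / sqrt(sumOfSquaredWeights), where the sum runs over every clause of the query, roughly (boost x idf)^2 per clause. Two things follow from that: the per-clause boosts (1000/500/100) do enter the sum (the explain output itself prints the boost of 100.0 inside queryWeight for field4), and the sum also covers the field2 and field3 clauses, whose per-shard idf values never appear above because the document only matches two of the four clauses (coord(2/4)). So the value cannot be reconstructed from the explanation alone. The sketch below only illustrates the arithmetic; the field2/field3 idf values are made-up placeholders, not taken from the post.

import math

# queryNorm = 1 / sqrt(sumOfSquaredWeights), summed over ALL query clauses.
# The (boost, idf) pairs for field1 and field4 are the shard-4 values shown in
# the explain output above; the pairs for field2 and field3 are HYPOTHETICAL,
# since their idfs are not visible in the output.
clauses = [
    (2.0,    5.6314874),  # field1 (boost as printed in the explain output)
    (1000.0, 5.0),        # field2 -- hypothetical idf
    (500.0,  5.0),        # field3 -- hypothetical idf
    (100.0,  5.6314874),  # field4
]

sum_of_squared_weights = sum((boost * idf) ** 2 for boost, idf in clauses)
query_norm = 1.0 / math.sqrt(sum_of_squared_weights)
print(query_norm)  # depends heavily on the unseen field2/field3 idfs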

Does anyone have any suggestions about what is going on here?

1 answer:

Answer 0 (score: 0):

You can set the search_type to dfs_query_then_fetch:

{
    "search_type": "dfs_query_then_fetch",
    "query": {
        "bool": {
            "must": [
                {
                    "bool": {
                        "must": [],
                        "must_not": [],
                        "should": [
                            {
                                "multi_match": {
                                    "query": "pds",
                                    "fields": [
                                        "field1"
                                    ],
                                    "lenient": true,
                                    "fuzziness": "0"
                                }
                            },
                            {
                                "multi_match": {
                                    "query": "pds",
                                    "fields": [
                                        "field2"
                                    ],
                                    "lenient": true,
                                    "fuzziness": "0",
                                    "boost": 1000.0
                                }
                            }
                        ]
                    }
                },
                {
                    "multi_match": {
                        "query": "pds",
                        "fields": [
                            "field3"
                        ],
                        "lenient": true,
                        "fuzziness": "0",
                        "boost": 500.0
                    }
                },
                {
                    "multi_match": {
                        "query": "pds",
                        "fields": [
                            "field4"
                        ],
                        "lenient": true,
                        "fuzziness": "0",
                        "boost": 100.0
                    }
                }
            ],
            "must_not": [],
            "should": [],
            "filter": []
        }
    },
    "size": 1000,
    "min_score": 0.0
}

In that case, all the norm values are global. It can affect query performance, though. If the index is small, you could also create the index with a single shard. But with more documents, these values shouldn't differ that much between shards anyway.
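
A minimal sketch of both options using the Python requests library; the host and index names are hypothetical, the query body is trimmed down for brevity, and search_type is passed here as a URL parameter to the _search endpoint:

import requests

HOST = "http://localhost:9200"  # hypothetical cluster address

body = {
    "query": {"multi_match": {"query": "pds", "fields": ["field1", "field4"]}},
    "size": 1000
}

# Option 1: request global term statistics with dfs_query_then_fetch.
resp = requests.post(HOST + "/my_index/_search",
                     params={"search_type": "dfs_query_then_fetch"},
                     json=body)
print(resp.json()["hits"]["total"])

# Option 2: for a small index, recreate it with a single shard so the
# statistics are global by construction (reindexing the data is a separate step).
requests.put(HOST + "/my_index_single_shard",
             json={"settings": {"number_of_shards": 1}})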