ElasticSearch不一致的相关性

时间:2016-01-28 18:08:55

标签: elasticsearch

我正在使用 Elasticsearch 按参演演员来搜索电影。例如,当我搜索“Leonardo DiCaprio”(莱昂纳多·迪卡普里奥)时,会返回大约 10 部电影,但它们的得分各不相同。既然这些电影都包含同一位演员,我原本期望它们的得分相同。有谁能解释一下为什么会出现这种情况,以及该如何避免?

Elasticsearch版本1.7.2

映射:

    {
  "programs": {
    "mappings": {
      "program_doc_type": {
        "properties": {
          "cast": {
            "type": "string",
            "analyzer": "keyword_analyzer",
            "fields": {
              "name": {
                "type": "string",
                "analyzer": "name_analyzer"
              }
            }
          },
          "django_id": {
            "type": "integer"
          },
          "has_poster": {
            "type": "boolean"
          },
          "imdb_id": {
            "type": "string",
            "index": "not_analyzed"
          },
          "kind": {
            "type": "string",
            "index": "not_analyzed"
          },
          "record_url_count": {
            "type": "integer"
          },
          "release_date": {
            "type": "date",
            "format": "dateOptionalTime"
          },
          "release_year": {
            "type": "integer"
          },
          "title": {
            "type": "string",
            "analyzer": "pattern"
          },
          "tms_id": {
            "type": "string",
            "index": "not_analyzed"
          }
        }
      }
    }
  }
}

分析器:

"analysis": {
    "analyzer": {
        "keyword_analyzer": {
            "type": "custom",
                "filter": [
                    "lowercase"
                ],
                "tokenizer": "keyword"
        },
        "name_analyzer": {
            "type": "custom",
            "filter": [
                "lowercase"
            ],
            "tokenizer": "whitespace"
        }
    }
}

查询:

{
    "query": {
        "match": {"cast.name": "leonardo dicaprio"}
    }
}

第一页结果:

    {
  "took": 12,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 62,
    "max_score": 12.046804,
    "hits": [
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "1077511",
        "_score": 12.046804,
        "_source": {
          "imdb_id": "tt4007278",
          "tms_id": "",
          "record_url_count": 0,
          "release_date": "2014-08-20",
          "title": "Carbon",
          "has_poster": false,
          "release_year": 2014,
          "django_id": 1077511,
          "kind": "movie",
          "cast": [
            "Leonardo DiCaprio"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "812919",
        "_score": 11.906615,
        "_source": {
          "imdb_id": "tt2076929",
          "tms_id": "",
          "record_url_count": 0,
          "title": "Satori",
          "has_poster": false,
          "release_year": 2014,
          "django_id": 812919,
          "kind": "N/A",
          "cast": [
            "Leonardo DiCaprio"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "376792",
        "_score": 11.886408,
        "_source": {
          "imdb_id": "tt0402538",
          "tms_id": "",
          "record_url_count": 0,
          "title": "Titanic: The Premiere",
          "has_poster": true,
          "release_year": 2000,
          "django_id": 376792,
          "kind": "movie",
          "cast": [
            "Leonardo DiCaprio"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "306106",
        "_score": 11.69776,
        "_source": {
          "imdb_id": "tt0325727",
          "tms_id": "",
          "record_url_count": 0,
          "release_date": "1998-08-16",
          "title": "Leo Mania",
          "has_poster": true,
          "release_year": 1998,
          "django_id": 306106,
          "kind": "movie",
          "cast": [
            "Leonardo DiCaprio"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "269743",
        "_score": 9.637444,
        "_source": {
          "imdb_id": "tt0286234",
          "tms_id": "",
          "record_url_count": 0,
          "title": "Total Eclipse",
          "has_poster": false,
          "release_year": 1995,
          "django_id": 269743,
          "kind": "movie",
          "cast": [
            "Leonardo DiCaprio",
            "Agnieszka Holland"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "840945",
        "_score": 9.358208,
        "_source": {
          "imdb_id": "tt2195237",
          "tms_id": "",
          "record_url_count": 0,
          "release_date": "2004-12-01",
          "title": "MovieReal: The Aviator",
          "has_poster": false,
          "release_year": 2004,
          "django_id": 840945,
          "kind": "series",
          "cast": [
            "Leonardo DiCaprio",
            "Martin Scorsese"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "382168",
        "_score": 9.358208,
        "_source": {
          "imdb_id": "tt0408269",
          "tms_id": "",
          "record_url_count": 0,
          "release_date": "1998-09-29",
          "title": "To Leo with Love",
          "has_poster": true,
          "release_year": 1998,
          "django_id": 382168,
          "kind": "movie",
          "cast": [
            "Jo Wyatt",
            "Leonardo DiCaprio"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "846212",
        "_score": 7.2280827,
        "_source": {
          "imdb_id": "tt2218442",
          "tms_id": "",
          "record_url_count": 0,
          "title": "Legacy of Secrecy",
          "has_poster": false,
          "release_year": 1947,
          "django_id": 846212,
          "kind": "N/A",
          "cast": [
            "Leonardo DiCaprio",
            "Robert De Niro",
            "D'Anthony Palms"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "595027",
        "_score": 7.1439695,
        "_source": {
          "imdb_id": "tt1294988",
          "tms_id": "",
          "record_url_count": 0,
          "release_date": "2006-09-27",
          "title": "Emporio Armani 'Red' One Night Only",
          "has_poster": false,
          "release_year": 2006,
          "django_id": 595027,
          "kind": "movie",
          "cast": [
            "Kim Cattrall",
            "Leonardo DiCaprio",
            "Beyoncé Knowles"
          ]
        }
      },
      {
        "_index": "programs",
        "_type": "program_doc_type",
        "_id": "752646",
        "_score": 7.1439695,
        "_source": {
          "imdb_id": "tt1826731",
          "tms_id": "",
          "record_url_count": 0,
          "release_date": "2009-06-02",
          "title": "Lives of Quiet Desperation: The Making of Revolutionary Road",
          "has_poster": false,
          "release_year": 2009,
          "django_id": 752646,
          "kind": "movie",
          "cast": [
            "Kathy Bates",
            "Leonardo DiCaprio",
            "Kate Winslet"
          ]
        }
      }
    ]
  }
}

更新:

我禁用了字段长度归一化(field-length norm),结果似乎改善了很多,但得分仍然不完全相同。我还是很困惑。根据我所读到的资料,相关性由以下三个因素决定:

  1. 词频(term frequency)
  2. 逆文档频率(inverse document frequency)
  3. 字段长度归一化(field-length norm,已禁用)

由于 Leonardo DiCaprio 在每个节目中都只出现一次,在我看来它们的得分应该相同,但事实并非如此。也许是我理解有误。以下是禁用字段长度归一化之后的更新设置:

    映射:

    {
      "programs": {
        "mappings": {
          "program_doc_type": {
            "properties": {
              "cast": {
                "type": "string",
                "norms": {
                  "enabled": false
                },
                "analyzer": "keyword_analyzer",
                "fields": {
                  "name": {
                    "type": "string",
                    "norms": {
                      "enabled": false
                    },
                    "analyzer": "name_analyzer"
                  }
                }
              },
              "django_id": {
                "type": "integer"
              },
              "has_poster": {
                "type": "boolean"
              },
              "imdb_id": {
                "type": "string",
                "index": "not_analyzed"
              },
              "kind": {
                "type": "string",
                "index": "not_analyzed"
              },
              "record_url_count": {
                "type": "integer"
              },
              "release_date": {
                "type": "date",
                "format": "dateOptionalTime"
              },
              "release_year": {
                "type": "integer"
              },
              "title": {
                "type": "string",
                "analyzer": "pattern"
              },
              "tms_id": {
                "type": "string",
                "index": "not_analyzed"
              }
            }
          }
        }
      }
    }
    

    第一页结果:

    {
      "took": 20,
      "timed_out": false,
      "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
      },
      "hits": {
        "total": 836,
        "max_score": 13.778852,
        "hits": [
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "421026",
            "_score": 13.778852,
            "_source": {
              "tms_id": "",
              "django_id": 421026,
              "imdb_id": "tt0449557",
              "has_poster": false,
              "release_date": "2005-05-24",
              "kind": "movie",
              "cast": [
                "Leonardo DiCaprio",
                "Jeffrey M. Schwartz",
                "Donald L. Barlett",
                "James B. Steele"
              ],
              "release_year": 2005,
              "record_url_count": 0,
              "title": "The Affliction of Howard Hughes: Obsessive-Compulsive Disorder"
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "555015",
            "_score": 13.778852,
            "_source": {
              "tms_id": "MV002510340000",
              "django_id": 555015,
              "imdb_id": "tt1130884",
              "has_poster": true,
              "release_date": "2010-02-19",
              "kind": "movie",
              "cast": [
                "Leonardo DiCaprio",
                "Mark Ruffalo",
                "Ben Kingsley",
                "Max von Sydow"
              ],
              "release_year": 2010,
              "record_url_count": 2,
              "title": "Shutter Island"
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "104669",
            "_score": 13.778852,
            "_source": {
              "tms_id": "",
              "django_id": 104669,
              "imdb_id": "tt0108330",
              "has_poster": true,
              "release_date": "1993-04-23",
              "kind": "movie",
              "cast": [
                "Robert De Niro",
                "Ellen Barkin",
                "Leonardo DiCaprio",
                "Jonah Blechman"
              ],
              "release_year": 1993,
              "record_url_count": 1,
              "title": "This Boy's Life"
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "846212",
            "_score": 13.778852,
            "_source": {
              "django_id": 846212,
              "title": "Legacy of Secrecy",
              "imdb_id": "tt2218442",
              "has_poster": false,
              "kind": "N/A",
              "cast": [
                "Leonardo DiCaprio",
                "Robert De Niro",
                "D'Anthony Palms"
              ],
              "release_year": 1947,
              "record_url_count": 0,
              "tms_id": ""
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "256632",
            "_score": 13.778852,
            "_source": {
              "django_id": 256632,
              "title": "The Movie Show",
              "imdb_id": "tt0271918",
              "has_poster": false,
              "kind": "series",
              "cast": [
                "Ray Brady",
                "Russell Crowe",
                "Larry Day",
                "Leonardo DiCaprio"
              ],
              "release_year": 1986,
              "record_url_count": 0,
              "tms_id": ""
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "269743",
            "_score": 13.778852,
            "_source": {
              "django_id": 269743,
              "title": "Total Eclipse",
              "imdb_id": "tt0286234",
              "has_poster": false,
              "kind": "movie",
              "cast": [
                "Leonardo DiCaprio",
                "Agnieszka Holland"
              ],
              "release_year": 1995,
              "record_url_count": 0,
              "tms_id": ""
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "1007190",
            "_score": 13.778852,
            "_source": {
              "tms_id": "",
              "django_id": 1007190,
              "imdb_id": "tt3391950",
              "has_poster": false,
              "release_date": "2013-12-29",
              "kind": "series",
              "cast": [
                "Leonardo DiCaprio",
                "Jonah Hill",
                "Martin Scorsese",
                "Terence Winter"
              ],
              "release_year": 2013,
              "record_url_count": 0,
              "title": "The Hollywood Reporter in Focus"
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "1077511",
            "_score": 13.778852,
            "_source": {
              "tms_id": "",
              "django_id": 1077511,
              "imdb_id": "tt4007278",
              "has_poster": false,
              "release_date": "2014-08-20",
              "kind": "movie",
              "cast": [
                "Leonardo DiCaprio"
              ],
              "release_year": 2014,
              "record_url_count": 0,
              "title": "Carbon"
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "302615",
            "_score": 13.57246,
            "_source": {
              "django_id": 302615,
              "title": "Directors: James Cameron",
              "imdb_id": "tt0322031",
              "has_poster": true,
              "kind": "movie",
              "cast": [
                "Michael Biehn",
                "James Cameron",
                "Jamie Lee Curtis",
                "Leonardo DiCaprio"
              ],
              "release_year": 1997,
              "record_url_count": 0,
              "tms_id": ""
            }
          },
          {
            "_index": "programs",
            "_type": "program_doc_type",
            "_id": "509785",
            "_score": 13.57246,
            "_source": {
              "tms_id": "",
              "django_id": 509785,
              "imdb_id": "tt0923573",
              "has_poster": false,
              "release_date": "2003-05-06",
              "kind": "movie",
              "cast": [
                "Frank Abagnale Jr.",
                "Amy Adams",
                "Nathalie Baye",
                "Leonardo DiCaprio"
              ],
              "release_year": 2003,
              "record_url_count": 0,
              "title": "'Catch Me If You Can': The Casting of the Film"
            }
          }
        ]
      }
    }
    

    结果得到了很大改善,但最后2个得分与其他结果不同。

1 个答案:

答案 0 :(得分:0)

Elasticsearch 默认的相关性模型称为 TF/IDF,您可以通过原回答中的链接阅读更多相关信息。您在搜索命中结果中看到的 _score 就是由该模型计算得出的。

基本上,得分是由以下三个因素计算得出的(更多信息见原回答中的链接):

  1. 词频(TF)——该词项在特定文档中出现的频率有多高?
  2. 逆文档频率(IDF)——该词项在集合的所有文档中出现的频率有多高?
  3. 字段长度归一化(field-length norm)——字段有多长?

正如您可以从上面推断的那样,由于每个包含 leonardo dicaprio 的文档中匹配词项的数量、字段长度以及匹配词项在整个索引中的出现次数各不相同,它们的相关性得分也会不同。

    尽管如此,包含 leonardo dicaprio 的文档所获得的得分仍然高于不包含该词项的文档。

    希望它有所帮助。