我从 Elasticsearch(ES)查询中得到了不正确的聚合计数。我从 ES 文档中了解到,cardinality(基数)聚合和 terms(词项)聚合的结果并不完全精确,但我得到的结果偏差实在太大。
我的索引映射(mapping)如下:
{
"dynamic_templates": [{
"template_action": {
"mapping": {
"type": "string",
"index": "not_analyzed"
},
"match": "*",
"match_mapping_type": "*"
}
}],
"_parent": {
"type": "users"
},
"date_detection": false,
"properties": {
"traits": {
"type": "object"
},
"cl_utm_params": {
"type": "object"
},
"cl_other_params": {
"type": "object"
},
"cl_triggered_ts": {
"type": "date"
}
}
}
示例文档:
{
"client_id": "cl58vivh8w7t",
"user_id": "CL.1122029143.1904488380.1218174474.2049762488",
"session_id": "CL.1886305621.906039613",
"source": "Google",
"action": "pageview",
"cl_triggered_ts": "2016-09-09T00:13:33.818Z",
"browser": "Microsoft Edge 13",
"platform": "Windows 10",
"screen_size": "1920 x 1080",
"device": "Desktop",
"ip_address": "98.236.246.165",
"country": "United States",
"city": "Weirton",
"postal_code": "26062",
"location": "40.4224, -80.5739",
"timezone": "America/New_York",
"state": "West Virginia",
"continent": "North America",
"isp": "Comcast Cable",
"browser_language": "",
"traits": {},
"cl_utm_params": {},
"cl_other_params": {}
}
我使用下面的查询,通过桶(bucket)聚合和指标(metric)聚合,获取每个来源(source)的唯一会话数,以及每个来源下每种设备(device)的唯一会话数:
{
"query": {
"bool": {
"must": [
{"match": {"client_id": "cl58vivh8w7t"}}
]
}
},
"aggs": {
"top_source": {
"terms": {
"field": "source"
},
"aggs": {
"total_unique_sessions": {"cardinality": {"field": "session_id"}},
"per_device": {
"terms": {"field": "device"},
"aggs": {"device_session": {"cardinality": {"field": "session_id"}}}
}
}
}
},
"size": 0
}
作为参考,下面给出了返回结果中的一个桶。按理说,各设备的唯一会话数(device_session)之和应该等于 total_unique_sessions 的值;但在这个桶中,各设备之和为 2413 + 3222 + 636 + 9 = 6280,而 total_unique_sessions 为 9058。
是我的查询或计算方式有问题吗?
{
"key": "www.google.com",
"doc_count": 68947,
"per_device": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Desktop",
"doc_count": 49254,
"device_session": {
"value": 2413
}
},
{
"key": "Mobile",
"doc_count": 16317,
"device_session": {
"value": 3222
}
},
{
"key": "Tablet",
"doc_count": 3343,
"device_session": {
"value": 636
}
},
{
"key": "TV",
"doc_count": 33,
"device_session": {
"value": 9
}
}
]
},
"total_unique_sessions": {
"value": 9058
}
}
答案 0 :(得分:0)
我注意到你使用的是 match 查询。
在做聚合时,通常使用 term 查询进行过滤。我认为是 match 查询导致了这个问题。
{
"query": {
"bool": {
"must": [
{"term": {"client_id": "cl58vivh8w7t"}}
]
}
},
"aggs": {
"top_source": {
"terms": {
"field": "source"
},
"aggs": {
"total_unique_sessions": {"cardinality": {"field": "session_id"}},
"per_device": {
"terms": {"field": "device"},
"aggs": {"device_session": {"cardinality": {"field": "session_id"}}}
}
}
}
},
"size": 0
}