PS: This query works fine for smaller datasets; the problems start with huge datasets. So I am trying to see what other scaling options can be used.
Assuming you have as many resources as needed in terms of CPU, memory, and so on, what is the maximum number of documents an Elasticsearch query using a terms aggregation + top_hits can handle? I have a dataset of more than a million documents.
My dataset contains over a million documents. Once the database is populated with roughly 60,000+ documents, reads from Elasticsearch start failing with readTimeout errors. My query uses a terms aggregation + top_hits because I need to fetch the list of unique documents across the entire dataset. I tried removing top_hits from the query and it works fine without it, but since I need access to the _source of each unique document, I don't know of any way to get it other than top_hits.
ES query:
query = {
    "size": 0,
    "query": {
        "bool": {
            "must": [],
            "filter": {
                "range": {
                    "time": {
                        "lte": timestamp,
                        "format": "date_optional_time"
                    }
                }
            }
        }
    },
    "aggs": {
        "group_by": {
            "terms": {
                "size": 450000,
                "field": group_by_field
            },
            "aggs": {
                "resource": {
                    "top_hits": {
                        "from": 0,
                        "size": 1,
                        "sort": {
                            "time": "desc"
                        },
                        "_source": {}
                    }
                }
            }
        }
    }
}
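The traceback below is what the Python Elasticsearch client raises when this query runs against the larger dataset; the "read timeout=60" in the error is the client-side request timeout, not a server limit. For reference, a minimal sketch of the call with a longer per-request timeout as a stopgap (the client setup and index name are placeholders, not the actual application code):

from elasticsearch import Elasticsearch

# Placeholder client setup; the real deployment talks HTTPS to
# elasticsearch.service.apic.local:9200, as seen in the traceback.
es = Elasticsearch(["https://elasticsearch.service.apic.local:9200"], timeout=60)

# Raising the per-request timeout only buys time while the aggregation itself
# stays this heavy (450,000 terms buckets, each with a top_hits sub-aggregation).
response = es.search(index="my-index", body=query, request_timeout=300)
buckets = response["aggregations"]["group_by"]["buckets"]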
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 384, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 380, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib64/python3.7/http/client.py", line 1321, in getresponse
    response.begin()
  File "/usr/lib64/python3.7/http/client.py", line 296, in begin
    version, status, reason = self._read_status()
  File "/usr/lib64/python3.7/http/client.py", line 257, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/lib64/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
  File "/usr/local/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py", line 298, in recv_into
    raise timeout('The read operation timed out')
socket.timeout: The read operation timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/elasticsearch/connection/http_urllib3.py", line 172, in perform_request
    response = self.pool.urlopen(method, url, body, retries=Retry(False), headers=request_headers, **kw)
  File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 638, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/usr/local/lib/python3.7/site-packages/urllib3/util/retry.py", line 343, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "/usr/local/lib/python3.7/site-packages/urllib3/packages/six.py", line 686, in reraise
    raise value
  File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 386, in _make_request
    self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
  File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 306, in _raise_timeout
    raise ReadTimeoutError(self, url, "Read timed out. (read timeout=%s)" % timeout_value)
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='elasticsearch.service.apic.local', port=9200): Read timed out. (read timeout=60)
ConnectionTimeout caused by - ReadTimeoutError(HTTPSConnectionPool(host='elasticsearch.service.apic.local', port=9200): Read timed out. (read timeout=60))
Traceback (most recent call last):
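Regarding the PS above about other scaling options: one approach that would keep access to each unique document's _source is to page through the unique values with a composite aggregation instead of a single 450,000-bucket terms aggregation, keeping the same top_hits sub-aggregation. A rough sketch under that assumption (the function name, page size, and termination check are illustrative, and this has not been tested at this scale):

def iter_unique_docs(es, index, group_by_field, timestamp, page_size=1000):
    """Yield the newest _source for each unique value of group_by_field."""
    after_key = None
    while True:
        composite = {
            "size": page_size,
            "sources": [{"group": {"terms": {"field": group_by_field}}}],
        }
        if after_key is not None:
            composite["after"] = after_key  # resume from the previous page
        body = {
            "size": 0,
            "query": {
                "bool": {
                    "filter": {
                        "range": {
                            "time": {"lte": timestamp, "format": "date_optional_time"}
                        }
                    }
                }
            },
            "aggs": {
                "group_by": {
                    "composite": composite,
                    "aggs": {
                        "resource": {
                            "top_hits": {
                                "size": 1,
                                "sort": [{"time": "desc"}],
                                "_source": True,
                            }
                        }
                    },
                }
            },
        }
        response = es.search(index=index, body=body, request_timeout=120)
        agg = response["aggregations"]["group_by"]
        for bucket in agg["buckets"]:
            # newest document for this unique value
            yield bucket["resource"]["hits"]["hits"][0]["_source"]
        after_key = agg.get("after_key")
        if after_key is None:  # no buckets left, pagination is done
            break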