We are running Python 2.7 with the elasticsearch==5.0.1 client. While running our application we noticed that memory keeps growing, and after a while we hit an OOM.
I investigated with memory_profiler and noticed that whenever we call the bulk function from the elasticsearch helpers, memory increases and is never released.
We tried forcing a GC collection of unused data, and I also added a time.sleep (my first thought was that elasticsearch was still streaming data).
Has anyone else run into this memory problem with the elasticsearch bulk helper and found a solution?
Here is my code:
import gc
import logging
import os

import psutil
from elasticsearch import Elasticsearch, serializer, compat, exceptions, RequestsHttpConnection
from elasticsearch.helpers import bulk
from elasticsearch_dsl import Search, Q, MultiSearch
# the decorators below come from the memory_profiler and retrying packages
from memory_profiler import profile
from retrying import retry

logger = logging.getLogger(__name__)


class JSONSerializerPython2(serializer.JSONSerializer):
    def dumps(self, data):
        import json
        # don't serialize strings
        if isinstance(data, compat.string_types):
            return data
        try:
            return json.dumps(data, default=self.default, ensure_ascii=True)
        except (ValueError, TypeError) as e:
            logger.error(e)
            raise exceptions.SerializationError(data, e)

class ElasticSearchDriver(object):
    def __init__(self, index_name, host, doc_type=None, max_size_connections=100):
        """
        :param index_name: (String)
        :param host: (String)
        :param doc_type: (String)
        :param max_size_connections: (Int)
        """
        self.es = None
        self.index_name = index_name
        self.es_search = None
        self.doc_type = doc_type
        es_connection_dict = {"hosts": host, "serializer": JSONSerializerPython2(),
                              "maxsize": max_size_connections}
        try:
            # create the connection object
            self.es = Elasticsearch(**es_connection_dict)
        except Exception as msg:
            logger.error(msg)
            raise ValueError("Error: ElasticSearchDriver.__init__-{0}".format(msg))

    # STOP_MAX_ATTEMPT_NUMBER, WAIT_EXPONENTIAL_MULTIPLIER, WAIT_EXPONENTIAL_MAX and
    # exception_elasticsearch are constants/helpers defined elsewhere in our code
    @retry(stop_max_attempt_number=STOP_MAX_ATTEMPT_NUMBER, wait_exponential_multiplier=WAIT_EXPONENTIAL_MULTIPLIER,
           wait_exponential_max=WAIT_EXPONENTIAL_MAX, retry_on_exception=exception_elasticsearch)
    @profile
    def upsert_data_bulk(self, key, data_json_list, inline_script, retry_on_conflict=5):
        """
        :param key: (String) used as the document _type
        :param data_json_list: (List of Dict)
        :param inline_script: (String)
        :param retry_on_conflict: (Int)
        :return: answer (String/Dict)
        """
        from datetime import datetime
        actions = list()
        for data_json in data_json_list:
            # build one "update" action (script + upsert fallback) per document
            data_dict = {
                "_op_type": "update",
                "_index": self.index_name,
                "_type": key,
                "retry_on_conflict": retry_on_conflict,
                "_source": {
                    "script": {
                        "lang": "painless",
                        "inline": inline_script,
                        "params": data_json
                    },
                    "upsert": data_json
                }
            }
            if "system_write_at" not in data_json:
                system_write_at = datetime.utcnow()
                data_dict["_source"]["script"]["params"]["system_write_at"] = system_write_at
                data_dict["_source"]["upsert"]["system_write_at"] = system_write_at
            if "es_id" in data_json:
                data_dict["_id"] = data_json["es_id"]
            actions.append(data_dict)

        process = psutil.Process(os.getpid())
        print "------------"
        print process.memory_info()[0]
        x = bulk(self.es, actions, request_timeout=120)
        print process.memory_info()[0]
        # time.sleep(120)
        print process.memory_info()[0]
        # drop references before forcing a collection (note: this also makes the return value None)
        actions = None
        x = None
        gc.collect()
        print process.memory_info()[0]
        print "------------"
        return x
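
For context, this is roughly how the driver is called; the host, index name, document type and script below are placeholders, not our real values:

# rough usage sketch, placeholder values only
driver = ElasticSearchDriver(index_name="test-index", host="localhost:9200")
docs = [{"es_id": str(i), "counter": i} for i in xrange(50000)]
driver.upsert_data_bulk(
    key="doc",                                          # becomes the document _type
    data_json_list=docs,
    inline_script="ctx._source.counter = params.counter",
)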
The printed memory_info from one run of upsert_data_bulk:
------------
122753024   # before bulk
192577536   # after bulk
192577536
192577536   # after actions = None, x = None, gc.collect()
------------
At line 279 (the bulk call) we can see that bulk adds about 67M to memory, and it never comes back down. Are we using it incorrectly? We are really stuck here.
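
A variant we still plan to test (sketched below; the host, index name and script are placeholders, not our real values) feeds the actions to elasticsearch.helpers.streaming_bulk from a generator instead of building the full actions list, so that at most one chunk of actions is held in memory at a time. Whether that actually keeps the process flat is exactly what we are unsure about:

import os

import psutil
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch(hosts="localhost:9200")   # placeholder host
process = psutil.Process(os.getpid())


def generate_actions(n):
    # yield one update action (script + upsert) at a time instead of building a list
    for i in xrange(n):
        yield {
            "_op_type": "update",
            "_index": "test-index",
            "_type": "doc",
            "_id": str(i),
            "retry_on_conflict": 5,
            "_source": {
                "script": {"lang": "painless",
                           "inline": "ctx._source.counter = params.counter",
                           "params": {"counter": i}},
                "upsert": {"counter": i},
            },
        }


print "before:", process.memory_info()[0]
# streaming_bulk sends the actions in chunks (500 by default) and yields one result per action
for ok, result in streaming_bulk(es, generate_actions(50000), chunk_size=500, request_timeout=120):
    if not ok:
        print "failed:", result
print "after: ", process.memory_info()[0]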