Let me first outline my setup, and then explain the situation.
I have a fairly large set of .jpeg images (1,639, to be exact) that I am running inference on. Currently I use a minibatch size of 210, i.e. I split the original set into smaller subsets of 210 images each (the last minibatch contains fewer images) and run inference on each subset in turn. The inference is also an ensemble inference: within every minibatch I run inference with 3 models, one after the other.
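To make the batching concrete, here is a minimal illustrative sketch of the split described above (the variable names are placeholders, not the ones used in the actual code further below):

image_count = 1639
batch_size = 210
image_list = ['frame_%d.jpeg' % i for i in range(image_count)]  # placeholder file names
minibatches = [image_list[i:i + batch_size] for i in range(0, image_count, batch_size)]
print(len(minibatches))      # 8 minibatches
print(len(minibatches[-1]))  # the last one holds 1639 - 7 * 210 = 169 images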
The problem I am facing:
When I SSH into the AWS instance and run the code with a minibatch size greater than 210, I get the following error:
Using Amazon Elastic Inference Client Library Version: 1.2.12
Number of Elastic Inference Accelerators Available: 1
Elastic Inference Accelerator ID: eia-cb0a16c893874b43855dc0e0d2aacab7
Elastic Inference Accelerator Type: eia1.large
[Mon Apr 22 06:16:17 2019, 275419us] [Execution Engine][MXNet][6] Failed - Last Error:
EI Error Code: [51, 8, 31]
EI Error Description: Accelerator out of memory. Consider using a larger accelerator.
EI Request ID: MX-A19B0DE6-7999-4580-8C49-8EA7EBD5EACB -- EI Accelerator ID: eia-cb0a16c893874b43855dc0e0d2aacab7
EI Client Version: 1.2.12
Traceback (most recent call last):
File "Ensemble_Inferrer_2.py", line 65, in <module>
batched_prediction_list = inferrer.batch_infer('', current_image_list_batch, current_image_list_batch_index)
File "/home/ubuntu/Ensembled_Inference/Lib/Inferrer.py", line 78, in batch_infer
current_predictions = current_output.asnumpy().tolist()
File "/home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/ndarray/ndarray.py", line 1972, in asnumpy
ctypes.c_size_t(data.size)))
File "/home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/base.py", line 252, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [06:16:17] src/operator/subgraph/eia/eia_subgraph_op.cc:206: Last Error:
EI Error Code: [51, 8, 31]
EI Error Description: Accelerator out of memory. Consider using a larger accelerator.
EI Request ID: MX-A19B0DE6-7999-4580-8C49-8EA7EBD5EACB -- EI Accelerator ID: eia-cb0a16c893874b43855dc0e0d2aacab7
EI Client Version: 1.2.12
Stack trace returned 10 entries:
[bt] (0) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/libmxnet.so(dmlc::StackTrace()+0x44) [0x7f5658cae394]
[bt] (1) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x21) [0x7f5658cae771]
[bt] (2) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/libmxnet.so(mxnet::op::EiaSubgraphOperator::Forward(mxnet::OpContext const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&)+0x147) [0x7f5658cfe637]
[bt] (3) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/libmxnet.so(+0x2fc52f2) [0x7f565b6602f2]
[bt] (4) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*)+0x564) [0x7f565b641a34]
[bt] (5) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/libmxnet.so(std::_Function_handler<void (std::shared_ptr<dmlc::ManualEvent>), mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock*, bool)::{lambda()#1}::operator()() const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data const&, std::shared_ptr<dmlc::ManualEvent>)+0x92) [0x7f565b645902]
[bt] (6) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/mxnet/libmxnet.so(std::thread::_Impl<std::_Bind_simple<std::function<void (std::shared_ptr<dmlc::ManualEvent>)> (std::shared_ptr<dmlc::ManualEvent>)> >::_M_run()+0x44) [0x7f565b642154]
[bt] (7) /home/ubuntu/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/scipy/sparse/../../../../libstdc++.so.6(+0xb86d4) [0x7f56627ce6d4]
[bt] (8) /lib/x86_64-linux-gnu/libpthread.so.0(+0x76ba) [0x7f5670b426ba]
[bt] (9) /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d) [0x7f567016841d]
In addition, when I keep the minibatch size small (say 120) but spawn multiple instances of the process, EI also runs out of memory.
I would like to know whether there is any way to optimize the code below so that I can use a larger minibatch size, or run several instances of the process concurrently, without running out of EI memory.
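For reference, the raw input tensor alone grows linearly with the minibatch size. A rough back-of-the-envelope calculation, assuming float32 inputs of shape 3 x 227 x 227 as in the code below:

channel_count, image_width, image_height = 3, 227, 227
bytes_per_float32 = 4
for candidate_batch_size in (120, 210, 240):
    input_bytes = candidate_batch_size * channel_count * image_width * image_height * bytes_per_float32
    print('batch %d -> input tensor ~%.0f MB' % (candidate_batch_size, input_bytes / 1e6))
# batch 120 -> ~74 MB, batch 210 -> ~130 MB, batch 240 -> ~148 MB

On top of that, the three models' parameters and their per-batch activations also have to fit in the accelerator's memory, which is presumably what tips it over once the minibatch grows.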
Here is the code I wrote:
Ensemble_Inferrer_2.py
import sys
import traceback
import os
import time
import logging
from logging.handlers import RotatingFileHandler
from scipy.stats import mode
from Lib.Inferrer import Inferrer
logger = logging.getLogger('')
logger.setLevel(logging.ERROR)
handler = RotatingFileHandler('Logs/error_log.log', maxBytes=100*1024*1024, backupCount=10)
formatter = logging.Formatter('%(asctime)-12s [%(levelname)s] %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
def handle_exception(exception_type, exception_value, exception_traceback):
    # Log the full traceback and exit with a non-zero status.
    logger.error(
        'An internal error occurred. Exception message: {}'.format(
            traceback.format_exception(exception_type, exception_value, exception_traceback)))
    sys.exit(1)
output_directory = './Video_Frames'
models = [
'./Models/facenet1154o7.onnx',
'./Models/facenet1302u7.onnx',
'./Models/facenet10387.onnx'
]
emotion_labels = ['Anger', 'Confusion', 'Contempt', 'Disgust', 'Fear', 'Happiness', 'LookAway', 'Neutral', 'NoFace',
'Sadness', 'Surprise']
batch_size = 240
channel_count = 3
image_width = 227
image_height = 227
final_prediction_list = list()
inferrer = Inferrer(output_directory, models, emotion_labels, batch_size, channel_count, image_width, image_height,
False)
image_list = os.listdir(output_directory)
# Sort the frames by the number embedded in the file name
image_list.sort(key=lambda f: int(''.join(filter(str.isdigit, str(f)))))
step_size = 0
if len(image_list) < batch_size:
    step_size = len(image_list)
else:
    step_size = batch_size
batched_image_list = [image_list[i:i + int(batch_size)] for i in range(0, len(image_list), step_size)]
started_at = time.time()
print('Process started at: ' + str(started_at))
for current_image_list_batch_index, current_image_list_batch in enumerate(batched_image_list):
    batched_prediction_list = inferrer.batch_infer('', current_image_list_batch, current_image_list_batch_index)
    for this_batched_prediction_list in batched_prediction_list:
        # Majority vote across the three models' predictions for this image
        final_prediction_list.append(mode(this_batched_prediction_list)[0][0])
print(final_prediction_list)
ended_at = time.time()
print('Process ended at: ' + str(ended_at))
print('Total time taken (in seconds): ' + str(ended_at - started_at))
Inferrer.py
import mxnet as mx
import mxnet.contrib.onnx as onnx_mxnet
import numpy as np
from collections import namedtuple
from PIL import Image
class Inferrer:
    def __init__(self, output_directory, models, emotion_labels, batch_size, channel_count, image_width,
                 image_height, resize_required):
        self.__output_directory = output_directory
        self.__models = models
        self.__emotion_labels = emotion_labels
        self.__batch_size = batch_size
        self.__channel_count = channel_count
        self.__image_width = image_width
        self.__image_height = image_height
        self.__resize_required = resize_required
        self.__batched_prediction_list = list()

    '''Method to extract and process pixel data from the target image'''
    def __get_image_datum(self, target_image_path):
        image = Image.open(target_image_path)
        if self.__resize_required is True:
            image = image.resize((self.__image_width, self.__image_height))
        # Convert HWC pixel data to a CHW float32 array with a leading batch axis
        image_data = np.asarray(image, dtype=np.float32)
        image_data = np.ascontiguousarray(np.rollaxis(image_data, 2))
        image_data = image_data[np.newaxis, :, :, :].astype(np.float32)
        return image_data
    '''Method to get processed pixel data for a batch of images'''
    def __get_image_data(self, image_sub_directory, image_batch):
        # One row per image and one column per model, holding the predicted label strings
        self.__batched_prediction_list = np.empty((len(image_batch), len(self.__models))).astype(np.str)
        batched_image_data = np.zeros(
            (len(image_batch), int(self.__channel_count), int(self.__image_width), int(self.__image_height)))
        for current_image_index, current_image_path in enumerate(image_batch):
            if '' == image_sub_directory:
                batched_image_data[current_image_index, :, :, :] = self.__get_image_datum(
                    self.__output_directory + '/' + current_image_path)
            else:
                batched_image_data[current_image_index, :, :, :] = self.__get_image_datum(
                    self.__output_directory + '/' + image_sub_directory + '/' + current_image_path)
        return batched_image_data
    '''Method to infer facial emotions from the target image'''
    def batch_infer(self, image_sub_directory, image_batch, image_batch_index):
        batched_image_data = self.__get_image_data(image_sub_directory, image_batch)
        if 0 == image_batch_index:
            # ctx = mx.cpu()  # For local development
            ctx = mx.eia()
        for this_model_index, this_model in enumerate(self.__models):
            if 0 == image_batch_index:
                # Import and bind each ONNX model only for the first minibatch
                model_metadata = onnx_mxnet.get_model_metadata(this_model)
                data_names = [inputs[0] for inputs in model_metadata.get('input_tensor_data')]
                Batch = namedtuple('Batch', 'data')
                sym, arg, aux = onnx_mxnet.import_model(this_model)
                mod = mx.mod.Module(symbol=sym, data_names=data_names, context=ctx, label_names=None)
                mod.bind(data_shapes=[(data_names[0], batched_image_data.shape)], label_shapes=None,
                         for_training=False)
                mod.set_params(arg_params=arg, aux_params=aux, allow_missing=True, allow_extra=True)
            # Forward pass over the whole minibatch, then read back each image's output vector
            mod.forward(Batch([mx.nd.array(batched_image_data)]))
            outputs = mod.get_outputs()[0]
            for current_output_index, current_output in enumerate(outputs):
                current_predictions = current_output.asnumpy().tolist()
                zipb_object = zip(self.__emotion_labels, current_predictions)
                current_prediction_dictionary = dict(zipb_object)
                most_probable_prediction = max(current_prediction_dictionary, key=current_prediction_dictionary.get)
                self.__batched_prediction_list[current_output_index, this_model_index] = most_probable_prediction
        print(self.__batched_prediction_list)
        return self.__batched_prediction_list