I have a frozen model and 4 GPUs. I want to run inference over as much data as possible. I basically want to perform data parallelism, where the same model runs inference on 4 batches: one batch per GPU.

Here is what I'm trying to do:
import tensorflow as tf

def return_ops():
    # load the frozen graph (model_path is assumed to be defined and to
    # point at the frozen .pb file)
    with tf.Graph().as_default() as graph:
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(model_path, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

    # collect one input/output tensor per GPU
    inputs = []
    outputs = []
    with graph.as_default() as g:
        for gpu in ['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3']:
            with tf.device(gpu):
                image_tensor = g.get_tensor_by_name('input:0')
                get_embeddings = g.get_tensor_by_name('embeddings:0')
                inputs.append(image_tensor)
                outputs.append(get_embeddings)
    return inputs, outputs, g
However, when I run
import numpy as np

# sample batch
x = np.ones((100, 160, 160, 3))

# get ops (return_ops returns three values: inputs, outputs, and the graph)
image_tensor_list, emb_list, graph = return_ops()

# construct feed dict
feed_dict = {it: x for it in image_tensor_list}

# run the ops
with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    inf = sess.run(emb_list, feed_dict=feed_dict)
and check with nvidia-smi, everything runs on /gpu:0.
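Besides nvidia-smi, placement can also be confirmed from inside TensorFlow by enabling device-placement logging; a minimal sketch, reusing graph, emb_list, and feed_dict from the snippet above:

# log_device_placement=True makes the session print the device assigned
# to every op it runs, so you can see directly where each op landed
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
with tf.Session(graph=graph, config=config) as sess:
    inf = sess.run(emb_list, feed_dict=feed_dict)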
However, I can run
with tf.device("/gpu:1"):
    t = tf.range(1000)

with tf.Session() as sess:
    sess.run(t)
and see activity on the second GPU...
How do I perform this data-parallelism task correctly?
Answer (score: 0):
I learned that the tensors need to be placed on the GPUs while the graph_def is being imported. The code below returns the ops, which I can then run with sess.run([output1, ..., outputk], feed_dict). It places all operations on the GPU, which is not ideal, so I pass allow_soft_placement=True in the session config.
class MultiGPUNet(object):
    def __init__(self, model_path, n_gpu):
        self.model_path = model_path
        self.n_gpu = n_gpu
        self.graph = tf.Graph()

        # specify a device for each of the n_gpu copies of the model
        # during graphdef parsing
        for i in range(self.n_gpu):
            self._init_models(i, self.graph)

    def _init_models(self, i, graph):
        with self.graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(self.model_path, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
            with tf.device('/device:GPU:{}'.format(i)):
                # the name scope '<i>/' keeps the i-th copy's tensors distinct
                tf.import_graph_def(od_graph_def, name='{}'.format(i))

    def get_tensors(self):
        output_tensors = []
        input_tensors = []
        train_tensors = []

        for i in range(self.n_gpu):
            input_tensors.append(
                self.graph.get_tensor_by_name('{}/<input_name>:0'.format(i)))
            output_tensors.append(
                self.graph.get_tensor_by_name('{}/<out_name>:0'.format(i)))
            train_tensors.append(
                self.graph.get_tensor_by_name('{}/<train_name>:0'.format(i)))

        def make_feed_dict(x):
            """x will be a list of batches, one per GPU"""
            assert len(x) == len(input_tensors)
            input_data = list(zip(input_tensors, x))
            train_bool = list(zip(train_tensors, [False] * len(train_tensors)))
            return dict(input_data + train_bool)

        return output_tensors, make_feed_dict
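For completeness, a hypothetical usage sketch of the class above; the file name 'frozen_model.pb', the batch shapes, and the <input_name>/<out_name>/<train_name> placeholders in get_tensors() are assumptions to be replaced with your model's actual values:

import numpy as np
import tensorflow as tf

n_gpu = 4
# 'frozen_model.pb' is an assumed path; substitute your frozen graph
net = MultiGPUNet('frozen_model.pb', n_gpu)
output_tensors, make_feed_dict = net.get_tensors()

# one batch per GPU (shape is an assumption for illustration)
batches = [np.ones((25, 160, 160, 3), dtype=np.float32) for _ in range(n_gpu)]

config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(graph=net.graph, config=config) as sess:
    # each element of the result comes from a different GPU copy of the model
    embeddings = sess.run(output_tensors, feed_dict=make_feed_dict(batches))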