Deploying a larger (RNN, embedding) network on Google Cloud AI Platform: out of memory

Asked: 2019-05-28 08:12:37

Tags: tensorflow google-cloud-platform google-cloud-ml

I am trying to deploy a fairly large model to Google AI Platform. It was trained and saved with TF 1.12.

Initially, my model directory looked like this:

96B May 16 15:45 assets
142K May 16 15:45 saved_model.pb
160B May 17 12:14 variables

./assets:
8.1M May 16 15:45 tokens.txt

./variables:
8B May 16 15:45 variables.data-00000-of-00002
951M May 16 15:45 variables.data-00001-of-00002
559B May 16 15:45 variables.index

That is far too big (the limit is 256 MB), so I then went through the somewhat painful process of freezing, optimising and re-saving it with the script below.

"""
A set of functions that freeze, optimise and re-save a TF model.
Purely to decrease the model size for Google Cloud.

Model trained with: TF 1.12.0
This script can only be run with: TF 1.14.1-dev20190516 (a nightly)


mkdir -p frozen/1
mkdir -p optimised/1
mkdir -p final_model/1

python reduce_model_size.py
"""

import os

import tensorflow as tf
from tensorflow.python.framework import ops  # framework.ops provides Graph
from tensorflow.python.tools import freeze_graph
from tensorflow.tools.graph_transforms import TransformGraph


def get_graph_def_from_saved_model(saved_model_dir):
  """Load a SavedModel and return its GraphDef."""
  with tf.Session() as session:
    meta_graph_def = tf.saved_model.loader.load(
        session,
        tags=[tf.saved_model.tag_constants.SERVING],
        export_dir=saved_model_dir)
  return meta_graph_def.graph_def


def describe_graph(graph_def, show_nodes=False):
  print('Input Feature Nodes: {}'.format(
      [node.name for node in graph_def.node if node.op=='Placeholder']))
  print('')
  print('Unused Nodes: {}'.format(
      [node.name for node in graph_def.node if 'unused' in node.name]))
  print('')
  print('Output Nodes: {}'.format( 
      [node.name for node in graph_def.node if (
          'predictions' in node.name or 'softmax' in node.name)]))
  print('')
  print('Quantization Nodes: {}'.format(
      [node.name for node in graph_def.node if 'quant' in node.name]))
  print('')
  print('Constant Count: {}'.format(
      len([node for node in graph_def.node if node.op=='Const'])))
  print('')
  print('Variable Count: {}'.format(
      len([node for node in graph_def.node if 'Variable' in node.op])))
  print('')
  print('Identity Count: {}'.format(
      len([node for node in graph_def.node if node.op=='Identity'])))
  print('', 'Total nodes: {}'.format(len(graph_def.node)), '')

  if show_nodes:
    for node in graph_def.node:
      print('Op:{} - Name: {}'.format(node.op, node.name))

def get_size(model_dir, model_file='saved_model.pb'):
  """Print the size of the graph file plus any variable shards."""
  model_file_path = os.path.join(model_dir, model_file)
  print(model_file_path, '')
  pb_size = os.path.getsize(model_file_path)
  variables_size = 0
  variables_dir = os.path.join(model_dir, 'variables')
  if os.path.exists(variables_dir):
    # Sum every shard plus the index file rather than hard-coding shard names.
    for filename in os.listdir(variables_dir):
      variables_size += os.path.getsize(os.path.join(variables_dir, filename))
  print('Model size: {} MB'.format(round(pb_size/(1.049e+6), 3)))
  print('Variables size: {} MB'.format(round(variables_size/(1.049e+6), 3)))
  print('Total Size: {} MB'.format(round((pb_size + variables_size)/(1.049e+6), 3)))


def freeze_model(saved_model_dir, output_node_names, output_filename):
  """Freeze the SavedModel's variables into constants in a single GraphDef."""
  freeze_graph.freeze_graph(
      input_saved_model_dir=saved_model_dir,
      output_graph=output_filename,
      saved_model_tags=tf.saved_model.tag_constants.SERVING,
      output_node_names=output_node_names,
      initializer_nodes='',
      input_graph=None,
      input_saver=False,
      input_binary=False,
      input_checkpoint=None,
      restore_op_name=False,
      filename_tensor_name=None,
      clear_devices=True,
      input_meta_graph=False,
  )
  print('graph frozen!')


def get_graph_def_from_file(graph_filepath):
  with ops.Graph().as_default():
    with tf.gfile.GFile(graph_filepath, 'rb') as f:
      graph_def = tf.GraphDef()
      graph_def.ParseFromString(f.read())
      return graph_def

def optimize_graph(model_dir, graph_filename, output_dir, optimised_file,
                   transforms, output_node):
  """Run the Graph Transform Tool over a graph and write the result."""
  input_names = []
  output_names = [output_node]
  if graph_filename is None:
    graph_def = get_graph_def_from_saved_model(model_dir)
  else:
    graph_def = get_graph_def_from_file(os.path.join(model_dir, graph_filename))
  optimized_graph_def = TransformGraph(
      graph_def,
      input_names,
      output_names,
      transforms)
  tf.train.write_graph(optimized_graph_def,
                       logdir=output_dir,
                       as_text=False,
                       name=optimised_file)
  print('Graph optimized!')


def convert_graph_def_to_saved_model(export_dir, graph_filepath):
  """Wrap the optimised GraphDef back up as a servable SavedModel."""
  if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)
  graph_def = get_graph_def_from_file(graph_filepath)

  with tf.Session(graph=tf.Graph()) as session:
    tf.import_graph_def(graph_def, name='')
    # Create the init op inside the graph being exported (not the outer
    # default graph) so the vocabulary table (tokens.txt) is initialised
    # at serving time.
    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    tf.saved_model.simple_save(
        session,
        export_dir,
        inputs={
            node.name: session.graph.get_tensor_by_name(
                '{}:0'.format(node.name))
            for node in graph_def.node if node.op == 'Placeholder'},
        outputs={'class_ids': session.graph.get_tensor_by_name(
            'dnn/head/predictions/class_ids:0')},
        legacy_init_op=legacy_init_op
    )
  print('Optimized graph converted to SavedModel!')




saved_model_dir="sentiment/1/"
print("*"*100)
print("Pre-frozen")
graph = get_graph_def_from_saved_model(saved_model_dir)
describe_graph(graph)
get_size(saved_model_dir,"saved_model.pb")
print("*"*100)

print("Frozen model: ")
frozen_dir = "frozen/1/"
frozen_file="saved_model.pb"
freeze_model(saved_model_dir,"dnn/head/predictions/class_ids",frozen_dir+frozen_file)
describe_graph(get_graph_def_from_file(frozen_dir+frozen_file))
get_size(frozen_dir,frozen_file)
print("*"*100)

transforms = [
 'remove_nodes(op=Identity)', 
 'merge_duplicate_nodes',
 'strip_unused_nodes',
 'fold_constants(ignore_errors=true)',
 'fold_batch_norms',
 'quantize_nodes',
 'quantize_weights',
 'remove_attribute(attribute_name=_class)' # Solves the expects to be colocated with unknown node error
]
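# Of these transforms, quantize_weights is the one that actually shrinks the
# file: large float32 weight constants are stored as 8-bit values (roughly a
# 4x reduction), and quantize_nodes rewrites the graph to compute on the
# quantized tensors.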
optimised_dir = "optimised/1/"
optimised_file="saved_model.pb"

optimize_graph(frozen_dir, frozen_file ,optimised_dir,optimised_file, transforms, "dnn/head/predictions/class_ids")
describe_graph(get_graph_def_from_file(optimised_dir+optimised_file))
get_size(optimised_dir,optimised_file)
print("*"*100)

print("Convert back to saved model...")
export_dir="final_model/1/"
graph_file="optimised/1/saved_model.pb"
convert_graph_def_to_saved_model(export_dir, graph_file)
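As a sanity check, the re-saved model can be loaded and run locally before uploading, e.g. like this (the feed string is just an arbitrary example):

with tf.Session(graph=tf.Graph()) as session:
  tf.saved_model.loader.load(
      session, [tf.saved_model.tag_constants.SERVING], 'final_model/1/')
  print(session.run(
      'dnn/head/predictions/class_ids:0',
      feed_dict={'headline_placeholder:0': ['an example headline']}))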

It "works", and my model now looks like this:

MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['headline_placeholder'] tensor_info:
        dtype: DT_STRING
        shape: (-1)
        name: headline_placeholder:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['class_ids'] tensor_info:
        dtype: DT_INT64
        shape: (-1)
        name: dnn/head/predictions/class_ids:0
  Method name is: tensorflow/serving/predict
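(That listing is saved_model_cli output; with the directory layout above it was produced by something like saved_model_cli show --dir final_model/1 --all.)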

I say it "works" in scare quotes because I had to use TF 1.14.1-dev20190516; otherwise there were dtype problems, issues with the assets file, and so on. Also, without the remove_attribute(attribute_name=_class) line, I got an "expects to be colocated with unknown node" error.
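A quick way to confirm that transform did its job is to check that no node in the optimised GraphDef still carries a _class (colocation) attribute, reusing the helper above (the expected result is an empty list):

gd = get_graph_def_from_file('optimised/1/saved_model.pb')
print([n.name for n in gd.node if '_class' in n.attr])  # expect []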

OK, so now I have a 128 MB saved_model.pb and I try to upload it to AI Platform. But when I do, I get the error:

Create Version failed. Bad model detected with error: Model requires more memory than allowed. Please try to decrease the model size and re-deploy. If you continue to have error, please contact Cloud ML.
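For completeness, the version is being created with something along these lines (the model name, bucket and runtime version here are placeholders):

gcloud ml-engine versions create v1 \
  --model my_model \
  --origin gs://my-bucket/final_model/1/ \
  --runtime-version 1.13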

So at this point I have no idea how to get this model uploaded. Any ideas?

0 Answers:

No answers yet.