使用 TensorFlow Estimator 时,如何将大型模型保存到 S3?

时间:2019-07-05 16:03:09

标签: tensorflow amazon-s3 tensorflow-estimator

我尝试将 Estimator 的模型保存到 S3,但因以下异常而失败:UnknownError (see above for traceback): EntityTooLarge: Unable to parse ExceptionName: EntityTooLarge

我已经尝试使用分区器(partitioner)来切分大变量,但仍然出现相同的异常。

这是我的代码

import logging
import os
# Set before the tensorflow import below. Presumably this lowers the verbosity
# of TensorFlow's native (C++) logging -- TODO confirm what level '2' filters.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow.python.lib.io import file_io


def test_tf_estimator(bucket='tensorflow-testing'):
  """Write a synthetic TSV file to S3, then train and evaluate a LinearClassifier.

  Both the generated input file and the estimator's model_dir are placed
  under ``s3://<bucket>/``. Every feature is a hashed categorical column with
  100*1024*1024 buckets, and ``tf.fixed_size_partitioner(16, axis=0)`` is
  handed to the estimator as its partitioner.

  NOTE(review): the report around this script says the checkpoint save fails
  with EntityTooLarge; presumably the checkpoint data file is larger than
  what a single S3 upload accepts -- confirm before relying on this.

  Args:
    bucket: Name of the S3 bucket holding the input file and the model_dir.
  """
  target = 'click_label'
  feature_names = 'fea1,fea2,fea3,fea4,fea5'.split(',')
  schema = [target] + feature_names
  delimiter = '\t'
  input_paths = ['s3://%s/input_data.txt' % bucket]

  # One single-element default per column: a float for the label column,
  # an empty string for every feature column.
  record_defaults = []
  for column in schema:
    if column == target:
      record_defaults.append([0.0])
    else:
      record_defaults.append([''])

  def write_input_files(rows_per_file=2000):
    """(Re)create each input file with `rows_per_file` synthetic rows."""
    for s3_path in input_paths:
      with file_io.FileIO(s3_path, 'w') as sink:
        for row in range(rows_per_file):
          # Every tenth row is labelled 1; feature values cycle with period 107.
          fields = [int(row % 10 == 0)]
          fields.extend('%s_%s' % (name, row % 107) for name in feature_names)
          sink.write('%s\n' % delimiter.join(str(field) for field in fields))

  write_input_files()

  def parse_line(line):
    """Decode delimited text into a (features, label) pair."""
    parsed = tf.decode_csv(
        line,
        record_defaults=record_defaults,
        field_delim=delimiter,
        use_quote_delim=False)
    by_name = dict(zip(schema, parsed))
    # The label column is also left inside the feature dict.
    return by_name, by_name[target]

  def batched_input_fn(batch_size=1000):
    """Build the dataset pipeline and return one batch of (features, labels)."""
    lines = tf.data.TextLineDataset(input_paths)
    lines = lines.prefetch(buffer_size=batch_size * 10)
    batches = lines.apply(
        tf.contrib.data.map_and_batch(
            map_func=parse_line, num_parallel_batches=4, batch_size=batch_size))
    features, labels = batches.make_one_shot_iterator().get_next()
    return features, labels

  hash_bucket_size = 100 * 1024 * 1024
  columns = [
      tf.feature_column.categorical_column_with_hash_bucket(name, hash_bucket_size)
      for name in feature_names
  ]

  estimator = tf.estimator.LinearClassifier(
      columns,
      model_dir='s3://%s/model_dir' % bucket,
      partitioner=tf.fixed_size_partitioner(16, axis=0))
  estimator.train(input_fn=batched_input_fn, steps=1)
  estimator.evaluate(input_fn=batched_input_fn, steps=1)



# Script entry point: run the S3 estimator repro against the default bucket.
if __name__ == "__main__":
  test_tf_estimator()

上面的代码导致以下异常

2019-07-05 23:56:40.450188: E tensorflow/core/platform/s3/aws_logging.cc:60] Curl returned error code 28
2019-07-05 23:57:22.245940: E tensorflow/core/platform/s3/aws_logging.cc:60] Curl returned error code 28
Traceback (most recent call last):
  File "test_s3.py", line 82, in <module>
    main()
  File "test_s3.py", line 77, in main
    test_tf_estimator()
  File "test_s3.py", line 67, in test_tf_estimator
    estimator.train(input_fn=input_fn, steps=1)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 356, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1181, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1215, in _train_model_default
    saving_listeners)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1406, in _train_with_estimator_spec
    log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
    stop_grace_period_secs=stop_grace_period_secs)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
    stop_grace_period_secs=stop_grace_period_secs)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
    self._sess = _RecoverableSession(self._coordinated_creator)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
    _WrappedSession.__init__(self, self._create_session())
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
    return self._sess_creator.create_session()
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 807, in create_session
    hook.after_create_session(self.tf_sess, self.coord)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 567, in after_create_session
    self._save(session, global_step)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 598, in _save
    self._get_saver().save(session, self._save_path, global_step=step)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1433, in save
    {self.saver_def.filename_tensor_name: checkpoint_file})
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 887, in run
    run_metadata_ptr)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1110, in _run
    feed_dict_tensor, options, run_metadata)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1286, in _do_run
    run_metadata)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1308, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: EntityTooLarge: Unable to parse ExceptionName: EntityTooLarge Message: 
     [[{{node save/SaveV2_1}} = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/ShardedFilename_1, save/SaveV2_1/tensor_names, save/SaveV2_1/shape_and_slices, save/Identity_487, save/Identity_489, save/Identity_491, save/Identity_493, save/Identity_495, save/Identity_497, save/Identity_499, save/Identity_501, save/Identity_503, save/Identity_505, save/Identity_507, save/Identity_509, save/Identity_511, save/Identity_513, save/Identity_515, save/Identity_517, save/Identity_519, save/Identity_521, save/Identity_523, save/Identity_525, save/Identity_527, save/Identity_529, save/Identity_531, save/Identity_533, save/Identity_535, save/Identity_537, save/Identity_539, save/Identity_541, save/Identity_543, save/Identity_545, save/Identity_547, save/Identity_549, save/Identity_551, save/Identity_553, save/Identity_555, save/Identity_557, save/Identity_559, save/Identity_561, save/Identity_563, save/Identity_565, save/Identity_567, save/Identity_569, save/Identity_571, save/Identity_573, save/Identity_575, save/Identity_577, save/Identity_579, save/Identity_581, save/Identity_583, save/Identity_585, save/Identity_587, save/Identity_589, save/Identity_591, save/Identity_593, save/Identity_595, save/Identity_597, save/Identity_599, save/Identity_601, save/Identity_603, save/Identity_605, save/Identity_607, save/Identity_609, save/Identity_611, save/Identity_613, save/Identity_615, save/Identity_617, save/Identity_619, save/Identity_621, save/Identity_623, save/Identity_625, save/Identity_627, save/Identity_629, save/Identity_631, save/Identity_633, save/Identity_635, save/Identity_637, save/Identity_639, save/Identity_641, save/Identity_643, save/Identity_645, save/Identity_647, save/Identity_649, save/Identity_651, save/Identity_653, save/Identity_655, save/Identity_657, save/Identity_659, save/Identity_661, save/Identity_663, 
save/Identity_665, save/Identity_667, save/Identity_669, save/Identity_671, save/Identity_673, save/Identity_675, save/Identity_677, save/Identity_679, save/Identity_681, save/Identity_683, save/Identity_685, save/Identity_687, save/Identity_689, save/Identity_691, save/Identity_693, save/Identity_695, save/Identity_697, save/Identity_699, save/Identity_701, save/Identity_703, save/Identity_705, save/Identity_707, save/Identity_709, save/Identity_711, save/Identity_713, save/Identity_715, save/Identity_717, save/Identity_719, save/Identity_721, save/Identity_723, save/Identity_725, save/Identity_727, save/Identity_729, save/Identity_731, save/Identity_733, save/Identity_735, save/Identity_737, save/Identity_739, save/Identity_741, save/Identity_743, save/Identity_745, save/Identity_747, save/Identity_749, save/Identity_751, save/Identity_753, save/Identity_755, save/Identity_757, save/Identity_759, save/Identity_761, save/Identity_763, save/Identity_765, save/Identity_767, save/Identity_769, save/Identity_771, save/Identity_773, save/Identity_775, save/Identity_777, save/Identity_779, save/Identity_781, save/Identity_783, save/Identity_785, save/Identity_787, save/Identity_789, save/Identity_791, save/Identity_793, save/Identity_795, save/Identity_797, save/Identity_799, save/Identity_801, save/Identity_803, save/Identity_805, save/Identity_807, save/Identity_809, save/Identity_811, save/Identity_813, save/Identity_815, save/Identity_817, save/Identity_819, save/Identity_821, save/Identity_823, save/Identity_825, save/Identity_827, save/Identity_829, save/Identity_831, save/Identity_833, save/Identity_835, save/Identity_837, save/Identity_839, save/Identity_841, save/Identity_843, save/Identity_845, save/Identity_847, save/Identity_849, save/Identity_851, save/Identity_853, save/Identity_855, save/Identity_857, save/Identity_859, save/Identity_861, save/Identity_863, save/Identity_865, save/Identity_867, save/Identity_869, save/Identity_871, save/Identity_873, 
save/Identity_875, save/Identity_877, save/Identity_879, save/Identity_881, save/Identity_883, save/Identity_885, save/Identity_887, save/Identity_889, save/Identity_891, save/Identity_893, save/Identity_895, save/Identity_897, save/Identity_899, save/Identity_901, save/Identity_903, save/Identity_905, save/Identity_907, save/Identity_909, save/Identity_911, save/Identity_913, save/Identity_915, save/Identity_917, save/Identity_919, save/Identity_921, save/Identity_923, save/Identity_925, save/Identity_927, save/Identity_929, save/Identity_931, save/Identity_933, save/Identity_935, save/Identity_937, save/Identity_939, save/Identity_941, save/Identity_943, save/Identity_945, save/Identity_947, save/Identity_949, save/Identity_951, save/Identity_953, save/Identity_955, save/Identity_957, save/Identity_959, save/Identity_961, save/Identity_963, save/Identity_965, save/Identity_967, save/Identity_969, save/Identity_971)]]

Caused by op u'save/SaveV2_1', defined at:
  File "test_s3.py", line 82, in <module>
    main()
  File "test_s3.py", line 77, in main
    test_tf_estimator()
  File "test_s3.py", line 67, in test_tf_estimator
    estimator.train(input_fn=input_fn, steps=1)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 356, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1181, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1215, in _train_model_default
    saving_listeners)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1406, in _train_with_estimator_spec
    log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
    stop_grace_period_secs=stop_grace_period_secs)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
    stop_grace_period_secs=stop_grace_period_secs)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
    self._sess = _RecoverableSession(self._coordinated_creator)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
    _WrappedSession.__init__(self, self._create_session())
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
    return self._sess_creator.create_session()
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 800, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 557, in create_session
    self._scaffold.finalize()
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 215, in finalize
    self._saver.build()
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1106, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1143, in _build
    build_save=build_save, build_restore=build_restore)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 778, in _build_internal
    save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 369, in _AddShardedSaveOps
    return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 343, in _AddShardedSaveOpsForV2
    sharded_saves.append(self._AddSaveOps(sharded_filename, saveables))
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 284, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 202, in save_op
    tensors)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1690, in save_v2
    shape_and_slices=shape_and_slices, tensors=tensors, name=name)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3272, in create_op
    op_def=op_def)
  File "/data/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1768, in __init__
    self._traceback = tf_stack.extract_stack()

UnknownError (see above for traceback): EntityTooLarge: Unable to parse ExceptionName: EntityTooLarge Message: 
     [[{{node save/SaveV2_1}} = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/ShardedFilename_1, save/SaveV2_1/tensor_names, save/SaveV2_1/shape_and_slices, save/Identity_487, save/Identity_489, save/Identity_491, save/Identity_493, save/Identity_495, save/Identity_497, save/Identity_499, save/Identity_501, save/Identity_503, save/Identity_505, save/Identity_507, save/Identity_509, save/Identity_511, save/Identity_513, save/Identity_515, save/Identity_517, save/Identity_519, save/Identity_521, save/Identity_523, save/Identity_525, save/Identity_527, save/Identity_529, save/Identity_531, save/Identity_533, save/Identity_535, save/Identity_537, save/Identity_539, save/Identity_541, save/Identity_543, save/Identity_545, save/Identity_547, save/Identity_549, save/Identity_551, save/Identity_553, save/Identity_555, save/Identity_557, save/Identity_559, save/Identity_561, save/Identity_563, save/Identity_565, save/Identity_567, save/Identity_569, save/Identity_571, save/Identity_573, save/Identity_575, save/Identity_577, save/Identity_579, save/Identity_581, save/Identity_583, save/Identity_585, save/Identity_587, save/Identity_589, save/Identity_591, save/Identity_593, save/Identity_595, save/Identity_597, save/Identity_599, save/Identity_601, save/Identity_603, save/Identity_605, save/Identity_607, save/Identity_609, save/Identity_611, save/Identity_613, save/Identity_615, save/Identity_617, save/Identity_619, save/Identity_621, save/Identity_623, save/Identity_625, save/Identity_627, save/Identity_629, save/Identity_631, save/Identity_633, save/Identity_635, save/Identity_637, save/Identity_639, save/Identity_641, save/Identity_643, save/Identity_645, save/Identity_647, save/Identity_649, save/Identity_651, save/Identity_653, save/Identity_655, save/Identity_657, save/Identity_659, save/Identity_661, save/Identity_663, 
save/Identity_665, save/Identity_667, save/Identity_669, save/Identity_671, save/Identity_673, save/Identity_675, save/Identity_677, save/Identity_679, save/Identity_681, save/Identity_683, save/Identity_685, save/Identity_687, save/Identity_689, save/Identity_691, save/Identity_693, save/Identity_695, save/Identity_697, save/Identity_699, save/Identity_701, save/Identity_703, save/Identity_705, save/Identity_707, save/Identity_709, save/Identity_711, save/Identity_713, save/Identity_715, save/Identity_717, save/Identity_719, save/Identity_721, save/Identity_723, save/Identity_725, save/Identity_727, save/Identity_729, save/Identity_731, save/Identity_733, save/Identity_735, save/Identity_737, save/Identity_739, save/Identity_741, save/Identity_743, save/Identity_745, save/Identity_747, save/Identity_749, save/Identity_751, save/Identity_753, save/Identity_755, save/Identity_757, save/Identity_759, save/Identity_761, save/Identity_763, save/Identity_765, save/Identity_767, save/Identity_769, save/Identity_771, save/Identity_773, save/Identity_775, save/Identity_777, save/Identity_779, save/Identity_781, save/Identity_783, save/Identity_785, save/Identity_787, save/Identity_789, save/Identity_791, save/Identity_793, save/Identity_795, save/Identity_797, save/Identity_799, save/Identity_801, save/Identity_803, save/Identity_805, save/Identity_807, save/Identity_809, save/Identity_811, save/Identity_813, save/Identity_815, save/Identity_817, save/Identity_819, save/Identity_821, save/Identity_823, save/Identity_825, save/Identity_827, save/Identity_829, save/Identity_831, save/Identity_833, save/Identity_835, save/Identity_837, save/Identity_839, save/Identity_841, save/Identity_843, save/Identity_845, save/Identity_847, save/Identity_849, save/Identity_851, save/Identity_853, save/Identity_855, save/Identity_857, save/Identity_859, save/Identity_861, save/Identity_863, save/Identity_865, save/Identity_867, save/Identity_869, save/Identity_871, save/Identity_873, 
save/Identity_875, save/Identity_877, save/Identity_879, save/Identity_881, save/Identity_883, save/Identity_885, save/Identity_887, save/Identity_889, save/Identity_891, save/Identity_893, save/Identity_895, save/Identity_897, save/Identity_899, save/Identity_901, save/Identity_903, save/Identity_905, save/Identity_907, save/Identity_909, save/Identity_911, save/Identity_913, save/Identity_915, save/Identity_917, save/Identity_919, save/Identity_921, save/Identity_923, save/Identity_925, save/Identity_927, save/Identity_929, save/Identity_931, save/Identity_933, save/Identity_935, save/Identity_937, save/Identity_939, save/Identity_941, save/Identity_943, save/Identity_945, save/Identity_947, save/Identity_949, save/Identity_951, save/Identity_953, save/Identity_955, save/Identity_957, save/Identity_959, save/Identity_961, save/Identity_963, save/Identity_965, save/Identity_967, save/Identity_969, save/Identity_971)]]

0 个答案:

没有答案