我有一个tf.data.Dataset
对象,它是与tf.estimator
一起使用的input_fn的输出。下面的琐碎示例:
import os
import tensorflow as tf
import numpy as np
btr = tf.estimator.BoostedTreesRegressor(
feature_columns=[tf.feature_column.numeric_column('x', shape=(3,))]
,n_batches_per_layer=1
,model_dir=os.path.join(os.getcwd(), 'btr_classifier'))
def input_fn():
# accommodate variable length inputs
def generate_values():
for i in range(50000):
x_len = np.random.randint(1,4)
x = np.random.rand(x_len, )
y = np.sum(x)
yield (x, y)
# add a 'key' for feature_columns
def to_dict(x, y):
return {"x":x}, y
# create a dataset from generator with padded_batch
def make_dataset(batch_size=32):
output_types = ((tf.float32), tf.float32)
padded_shapes = (([None]), [])
dataset = (tf.data.Dataset
.from_generator(generate_values, output_types=output_types)
.padded_batch(batch_size, padded_shapes=padded_shapes)
.repeat()
.map(to_dict)
)
return dataset
def _input_fn():
dataset = make_dataset()
return dataset
return _input_fn
btr.train(input_fn=input_fn())
这将导致以下错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-d7ca9ca451db> in <module>
31 return _input_fn
32
---> 33 btr.train(input_fn=input_fn())
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
365
366 saving_listeners = _check_listeners_type(saving_listeners)
--> 367 loss = self._train_model(input_fn, hooks, saving_listeners)
368 logging.info('Loss for final step: %s.', loss)
369 return self
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
1156 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1157 else:
-> 1158 return self._train_model_default(input_fn, hooks, saving_listeners)
1159
1160 def _train_model_default(self, input_fn, hooks, saving_listeners):
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _train_model_default(self, input_fn, hooks, saving_listeners)
1186 worker_hooks.extend(input_hooks)
1187 estimator_spec = self._call_model_fn(
-> 1188 features, labels, ModeKeys.TRAIN, self.config)
1189 global_step_tensor = training_util.get_global_step(g)
1190 return self._train_with_estimator_spec(estimator_spec, worker_hooks,
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode, config)
1144
1145 logging.info('Calling model_fn.')
-> 1146 model_fn_results = self._model_fn(features=features, **kwargs)
1147 logging.info('Done calling model_fn.')
1148
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\canned\boosted_trees.py in _model_fn(features, labels, mode, config)
1932 config,
1933 weight_column=weight_column,
-> 1934 train_in_memory=train_in_memory)
1935
1936 super(BoostedTreesRegressor, self).__init__(
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\canned\boosted_trees.py in _bt_model_fn(features, labels, mode, head, feature_columns, tree_hparams, n_batches_per_layer, config, closed_form_grad_and_hess_fn, example_id_column_name, weight_column, train_in_memory, name)
1075 else:
1076 input_feature_list = _get_transformed_features(
-> 1077 features, sorted_feature_columns, bucket_boundaries_dict)
1078 if example_id_column_name:
1079 example_ids = features[example_id_column_name]
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\canned\boosted_trees.py in _get_transformed_features(features, sorted_feature_columns, bucket_boundaries_dict)
102 return _get_transformed_features_and_merge_with_previously_transformed(
103 features, sorted_feature_columns, sorted_feature_columns,
--> 104 bucket_boundaries_dict)
105
106
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow_estimator\python\estimator\canned\boosted_trees.py in _get_transformed_features_and_merge_with_previously_transformed(features, sorted_feature_columns, all_sorted_columns, bucket_boundaries_dict, already_transformed_features)
171 'than 2, but column `{}` got: {}'.format(
172 source_name, column.shape))
--> 173 unstacked = array_ops.unstack(tensor, axis=1)
174 if not bucket_boundaries_dict:
175 result_features.extend(unstacked)
~\AppData\Local\Continuum\anaconda3\envs\tensorflow_gpuenv\lib\site-packages\tensorflow\python\ops\array_ops.py in unstack(value, num, axis, name)
1198 num = value_shape.dims[axis].value
1199 if num is None:
-> 1200 raise ValueError("Cannot infer num from shape %s" % value_shape)
1201 return gen_array_ops.unpack(value, num=num, axis=axis, name=name)
1202
ValueError: Cannot infer num from shape (?, ?)
我能够用稍微修改的版本训练tf.keras模型,其中没有必需的feature_columns
参数的格式。我是否在variant_dataset'x'上贴标签不正确?我希望这像其他任何数据集api一样简单地传递给估算器。我不太熟悉并试图变得更熟悉的部分是使用padded_batch
,如果还有其他考虑,则在使用时可能需要考虑。