我尝试使用 TensorFlow 的估计器(tf.estimator)建立逻辑回归模型。我使用的是鸢尾花(iris)数据集,模型在我的本地计算机上可以成功运行。但是,当我尝试在集群上运行该模型(使用 train_and_evaluate 而不是 classifier.train)时,遇到了下面的问题。
python版本:3.6.8 tensorflow版本:1.13.1
以下是在本地运行的代码:
iris 数据集只包含数值型数据,因此 feature_columns 是一个由 NumericColumn 组成的列表。
# Column names used as numeric model inputs.
# NOTE(review): 'Species' is normally the iris label column; confirm it is
# really meant to be fed to the model as a feature.
FUTURES = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']

# One numeric feature column per name in FUTURES, in the same order.
feature_columns = [tf.feature_column.numeric_column(key=key) for key in FUTURES]
定义估计器(Estimator),并通过 params 参数把 feature_columns 传给 model_fn。
# Custom Estimator: the feature columns and the number of classes are handed
# to my_model_fn through `params`; checkpoints go to `models_path`.
classifier = tf.estimator.Estimator(
    model_fn=my_model_fn,
    model_dir=models_path,
    params=dict(feature_columns=feature_columns, n_classes=3),
)
定义model_fn。
def my_model_fn(features, labels, mode, params):
    """Estimator model function: one dense layer producing class logits.

    Args:
        features: Input features, consumed by `tf.feature_column.input_layer`.
        labels: Class labels used for the sparse softmax cross-entropy loss
            and the accuracy metric (not used in PREDICT mode).
        mode: One of the `tf.estimator.ModeKeys` values.
        params: Dict with 'feature_columns' (the columns fed to the input
            layer) and 'n_classes' (the number of output logits).

    Returns:
        A `tf.estimator.EstimatorSpec` for PREDICT, TRAIN or EVAL mode;
        `None` if `mode` matches none of them.
    """
    mode_keys = tf.estimator.ModeKeys

    # Forward pass: feature columns -> dense input -> linear logits.
    dense_input = tf.feature_column.input_layer(
        features, params['feature_columns'])
    logits = tf.layers.dense(dense_input, params['n_classes'], activation=None)
    predicted_classes = tf.argmax(logits, 1)

    if mode == mode_keys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode, predictions={'logits': logits})

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode == mode_keys.TRAIN:
        train_op = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    # Only reached outside PREDICT/TRAIN: build the accuracy metric and log
    # its second element as a scalar summary.
    accuracy = tf.metrics.accuracy(
        labels=labels, predictions=predicted_classes, name='acc_op')
    tf.summary.scalar('accuracy', accuracy[1])

    if mode == mode_keys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
此代码可以很好地工作并产生一些结果。
-------------------------------------------------------------
然后,我想在集群上进行训练。my_model_fn 与前面的相同,self._feature_numeric_col 仍然是由 NumericColumn 组成的列表。
class LogisticReg():
    """Softmax classifier wrapped around a custom `tf.estimator.Estimator`."""

    def __init__(self):
        """Build the numeric feature columns and the Estimator.

        NOTE(review): `x`, `self.model_path`, `self.config` and
        `self.n_class` are read here or in `my_model_fn` but are not
        defined in this snippet; presumably they are set elsewhere.
        Confirm before use.
        """
        self._feature_col = x.columns.tolist()
        # One numeric feature column per input column name.
        self._feature_numeric_col = [
            tf.feature_column.numeric_column(key=key)
            for key in self._feature_col
        ]
        self.estimator = tf.estimator.Estimator(
            model_fn=self.my_model_fn,
            model_dir=self.model_path,
            config=self.config,
            params={'feature_columns': self._feature_numeric_col})

    def my_model_fn(self, features, labels, mode, params):
        """Estimator model function: one dense layer producing class logits.

        Args:
            features: Input features, consumed by
                `tf.feature_column.input_layer`.
            labels: Class labels used for the loss and the accuracy metric
                (not used in PREDICT mode).
            mode: One of the `tf.estimator.ModeKeys` values.
            params: Dict holding 'feature_columns'.

        Returns:
            A `tf.estimator.EstimatorSpec` for PREDICT, TRAIN or EVAL mode;
            `None` if `mode` matches none of them.
        """
        # NOTE(review): the reported cluster traceback is raised from this
        # call, with the columns arriving as `collections.NumericColumn`;
        # the root cause is not visible in this snippet.
        net = tf.feature_column.input_layer(features, params['feature_columns'])
        logits = tf.layers.dense(net, self.n_class, activation=None)
        predicted_classes = tf.argmax(logits, 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'logits': logits}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
            # A stray trailing "!" on this statement was a syntax error.
            train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

        accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
        metrics = {'accuracy': accuracy}
        tf.summary.scalar('accuracy', accuracy[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
使用train_and_evaluate函数代替train / eval / predict
# input_fn
def input_fn(self, X, y, mode, batch_size):
    """Build a batched `tf.data.Dataset` of (features dict, label) pairs.

    Args:
        X: Feature table, cast to float32 and turned into a dict with
            `dict(X)` (the original comment says X and y are pandas objects).
        y: Labels, cast to int32.
        mode: When equal to 'train', the dataset is shuffled with a buffer
            of 500 and repeated indefinitely; any other value leaves it as
            a single pass.
        batch_size: Number of examples per batch.

    Returns:
        The batched dataset.
    """
    labels = y.astype(np.int32)
    feature_frame = X.astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((dict(feature_frame), labels))
    if mode == 'train':
        dataset = dataset.shuffle(500).repeat()
    return dataset.batch(batch_size)
# Input functions are zero-argument callables, so the data and the mode are
# bound here through closures.
def _train_input_fn():
    return self.input_fn(x_train, y_train, 'train', batch_size)


def _valid_input_fn():
    return self.input_fn(x_valid, y_valid, 'valid', batch_size)


# Training spec.
# NOTE(review): `max_steps` is fed from `n_epochs`; confirm that value is
# meant as a step count.
train_spec = tf.estimator.TrainSpec(
    input_fn=_train_input_fn,
    max_steps=n_epochs,
)

# Evaluation spec: steps=None, with a 30 s start delay and a 30 s throttle.
eval_spec = tf.estimator.EvalSpec(
    input_fn=_valid_input_fn,
    steps=None,
    start_delay_secs=30,
    throttle_secs=30,
)

tf.estimator.train_and_evaluate(self.estimator, train_spec, eval_spec)
我期望集群版本能产生与本地版本相似的输出,但实际运行时出现了以下错误:
Traceback (most recent call last):
File "/usr/local/bin/python3/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/usr/local/bin/python3/lib/python3.6/site-packages/tensorflow/python/distribute/mirrored_strategy.py", line 852, in run
self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
File "/usr/local/bin/python3/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1112, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/mnt/glusterfs/model-center/train/classify.py", line 51, in my_model_fn
net = tf.feature_column.input_layer(features, params['feature_columns'])
File "/usr/local/bin/python3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py", line 302, in input_layer
cols_to_output_tensors=cols_to_output_tensors)
File "/usr/local/bin/python3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py", line 181, in _internal_input_layer
feature_columns = _normalize_feature_columns(feature_columns)
File "/usr/local/bin/python3/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column.py", line 2263, in _normalize_feature_columns
'Given (type {}): {}.'.format(type(column), column))
ValueError: Items of feature_columns must be a _FeatureColumn. Given (type <class 'collections.NumericColumn'>): NumericColumn(key='sepal_length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None).