I am completely new to distributed TensorFlow and am trying to use it to distribute the training of a pretrained Keras model (VGG16). I followed the Distributed TensorFlow documentation. I am using the pretrained VGG16 Keras model with dense layers added at the end for CIFAR10 classification. I created the cluster spec and the servers as described in the documentation and tried to run everything on my local machine, starting each process from the command line as shown below.
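For reference, these are the commands I launch in separate terminals (filename.py stands in for my actual script, and the jobs match the ClusterSpec in the code at the end):
python filename.py --job_name ps --task_index 0
python filename.py --job_name worker --task_index 0
python filename.py --job_name worker --task_index 1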
The parameter server runs fine, but when I run the worker with python filename.py --job_name worker --task_index 0, I get this error:
InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'block5_conv3/bias': Operation was explicitly assigned to /job:ps/task:0 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device.
[[Node: block5_conv3/bias = VariableV2[_class=["loc:@block5_conv3/bias"], container="", dtype=DT_FLOAT, shape=[512], shared_name="", _device="/job:ps/task:0"]()]]
Here is the error traceback snippet:
2019-02-08 12:34:33.327746: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-02-08 12:34:33.333854: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:1234}
2019-02-08 12:34:33.333882: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222, 1 -> localhost:2223}
2019-02-08 12:34:33.334152: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:324] Started server with target: grpc://localhost:2222
Traceback (most recent call last):
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1323, in _do_call
return fn(*args)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1293, in _run_fn
self._extend_graph()
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1354, in _extend_graph
self._session, graph_def.SerializeToString(), status)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py", line 473, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot assign a device for operation 'block5_conv3/bias': Operation was explicitly assigned to /job:ps/task:0 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device.
[[Node: block5_conv3/bias = VariableV2[_class=["loc:@block5_conv3/bias"], container="", dtype=DT_FLOAT, shape=[512], shared_name="", _device="/job:ps/task:0"]()]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "new.py", line 118, in <module>
main()
File "new.py", line 41, in main
include_top=False)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/applications/vgg16.py", line 201, in VGG16
model.load_weights(weights_path)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/engine/topology.py", line 1099, in load_weights
load_weights_from_hdf5_group(f, self.layers)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/engine/topology.py", line 1486, in load_weights_from_hdf5_group
K.batch_set_value(weight_value_tuples)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/backend.py", line 2406, in batch_set_value
get_session().run(assign_ops, feed_dict=feed_dict)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/backend.py", line 376, in get_session
_initialize_variables(session)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/backend.py", line 554, in _initialize_variables
[variables_module.is_variable_initialized(v) for v in candidate_vars])
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 889, in run
run_metadata_ptr)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1120, in _run
feed_dict_tensor, options, run_metadata)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1317, in _do_run
options, run_metadata)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1336, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot assign a device for operation 'block5_conv3/bias': Operation was explicitly assigned to /job:ps/task:0 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device.
[[Node: block5_conv3/bias = VariableV2[_class=["loc:@block5_conv3/bias"], container="", dtype=DT_FLOAT, shape=[512], shared_name="", _device="/job:ps/task:0"]()]]
Caused by op 'block5_conv3/bias', defined at:
File "new.py", line 118, in <module>
main()
File "new.py", line 41, in main
include_top=False)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/applications/vgg16.py", line 165, in VGG16
512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/keras/_impl/keras/engine/topology.py", line 252, in __call__
output = super(Layer, self).__call__(inputs, **kwargs)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/layers/base.py", line 559, in __call__
self.build(input_shapes[0])
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/layers/convolutional.py", line 151, in build
dtype=self.dtype)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/layers/base.py", line 458, in add_variable
trainable=trainable and self.trainable)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 1203, in get_variable
constraint=constraint)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 1092, in get_variable
constraint=constraint)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 425, in get_variable
constraint=constraint)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 394, in _true_getter
use_resource=use_resource, constraint=constraint)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/variable_scope.py", line 805, in _get_single_variable
constraint=constraint)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 213, in __init__
constraint=constraint)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 309, in _init_from_args
name=name)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/state_ops.py", line 133, in variable_op_v2
shared_name=shared_name)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/gen_state_ops.py", line 927, in _variable_v2
shared_name=shared_name, name=name)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
op_def=op_def)
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'block5_conv3/bias': Operation was explicitly assigned to /job:ps/task:0 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device.
[[Node: block5_conv3/bias = VariableV2[_class=["loc:@block5_conv3/bias"], container="", dtype=DT_FLOAT, shape=[512], shared_name="", _device="/job:ps/task:0"]()]]
Exception ignored in: <bound method BaseSession.__del__ of <tensorflow.python.client.session.Session object at 0x1340bcd68>>
Traceback (most recent call last):
File "/anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 696, in __del__
TypeError: 'NoneType' object is not callable
I am not sure what I am missing here. Any guidance/suggestions would be greatly appreciated. Thanks in advance. Here is the code snippet I am using:
import argparse
import time

import tensorflow as tf
import keras

FLAGS = None


def main():
    # Session configuration
    config = tf.ConfigProto(log_device_placement=False)

    # Cluster setup: one parameter server and two workers, all on localhost
    cluster = tf.train.ClusterSpec({'ps': ['localhost:1234'],
                                    'worker': ['localhost:2222', 'localhost:2223']})

    if FLAGS.job_name == 'ps':
        server = tf.train.Server(cluster,
                                 job_name='ps',
                                 task_index=FLAGS.task_index,
                                 config=config)
        server.join()
    else:
        is_chief = FLAGS.task_index == 0
        server = tf.train.Server(cluster,
                                 job_name='worker',
                                 task_index=FLAGS.task_index,
                                 config=config)

        worker_device = '/job:{}/task:{}/cpu:0'.format(FLAGS.job_name, FLAGS.task_index)
        with tf.device(tf.train.replica_device_setter(ps_tasks=1,
                                                      worker_device=worker_device)):
            in_features = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
            targets = tf.placeholder(tf.float32, shape=[None, 10])

            # Pretrained VGG16 base plus a small dense head for CIFAR10
            base_model = tf.keras.applications.VGG16(weights='imagenet',
                                                     include_top=False)
            x = base_model.output
            x = tf.keras.layers.GlobalAveragePooling2D()(x)
            x = tf.keras.layers.Dense(1024, activation='relu')(x)
            pred = tf.keras.layers.Dense(10, activation='softmax')(x)

            model = tf.keras.models.Model(inputs=in_features, outputs=pred)
            predictions = model.output

            loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(targets, predictions))
            global_step = tf.contrib.framework.get_or_create_global_step()
            optimizer = tf.train.AdagradOptimizer(0.01)
            train_op = optimizer.minimize(loss, global_step=global_step)

            sync_replica_hook = optimizer.make_session_run_hook(is_chief)
            stop_hook = tf.train.StopAtStepHook(last_step=10)
            hooks = [sync_replica_hook, stop_hook]

        mon_sess = tf.train.MonitoredTrainingSession(master=server.target,
                                                     is_chief=is_chief,
                                                     config=config,
                                                     hooks=hooks,
                                                     stop_grace_period_secs=10)
        print('Starting Training on worker {}'.format(FLAGS.task_index))

        while not mon_sess.should_stop():
            keras.backend.set_session(mon_sess)
            for epoch in range(5):
                running_loss = 0
                batch_count = int(x_train.shape[0] / batch_size)
                for i in range(batch_count):
                    batch_x = x_train[i * batch_size:(i + 1) * batch_size, :, :, :]
                    batch_y = y_train[i * batch_size:(i + 1) * batch_size, :]
                    _, batch_loss, step = mon_sess.run([train_op, loss, global_step],
                                                       feed_dict={in_features: batch_x,
                                                                  targets: batch_y})
                    running_loss += batch_loss
                epoch_loss = running_loss / batch_count
                print('Epoch: {}, Epoch Loss: {}, Global Step: {}'.format(epoch, epoch_loss, step))

        if is_chief:
            time.sleep(1)
        print('Done {}'.format(FLAGS.task_index))
        time.sleep(10)
        mon_sess.close()
        print('Session from worker {} closed'.format(FLAGS.task_index))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--job_name',
                        type=str,
                        default='',
                        help='Either ps or worker')
    parser.add_argument('--task_index',
                        type=int,
                        default=0,
                        help='Task index within the job')
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS.task_index)

    # Load and preprocess CIFAR10
    cifar10 = tf.keras.datasets.cifar10
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    y_train = tf.keras.utils.to_categorical(y_train, 10)
    y_test = tf.keras.utils.to_categorical(y_test, 10)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    batch_size = 100
    main()
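In case it helps narrow things down: judging from the traceback, the failure happens while VGG16() loads its ImageNet weights (inside model.load_weights), i.e. before the MonitoredTrainingSession pointing at server.target is ever created. Below is a minimal sketch of what I believe the failing step boils down to, assuming the same local ClusterSpec as in my script:

import tensorflow as tf

# Same local cluster as in the full script above.
cluster = tf.train.ClusterSpec({'ps': ['localhost:1234'],
                                'worker': ['localhost:2222', 'localhost:2223']})
server = tf.train.Server(cluster, job_name='worker', task_index=0)

# Building VGG16 with weights='imagenet' under replica_device_setter pins its
# variables to /job:ps/task:0, but model.load_weights() (called inside VGG16())
# goes through Keras's get_session(), which creates a plain tf.Session() that
# knows nothing about the cluster -- so only /job:localhost devices are available,
# which matches the InvalidArgumentError in the traceback.
with tf.device(tf.train.replica_device_setter(ps_tasks=1,
                                              worker_device='/job:worker/task:0/cpu:0')):
    base_model = tf.keras.applications.VGG16(weights='imagenet',
                                             include_top=False)

If I am reading this correctly, the error is raised during this weight-loading step, but I still do not know what the proper way is to combine a pretrained tf.keras model with replica_device_setter.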