Question

我正在尝试让 LinearClassifier 在 Colab 的 TPU 上运行。 https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/estimator/LinearClassifier

TensorFlow 2.0 Estimator（LinearClassifier）支持TPUStrategy https://www.tensorflow.org/beta/guide/distribute_strategy#whats_supported_now_2

LinearClassifier在没有tpu_strategy的情况下可以正常工作。 https://www.tensorflow.org/beta/guide/distribute_strategy#tpustrategy

通过 config 把 tpu_strategy 传给 LinearClassifier 之后，出现以下错误：

InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by {{node input0}} with these attrs: [T=DT_DOUBLE, N=8]；Registered devices: [CPU, XLA_CPU]；Registered kernels: <no registered kernels>；[[input0]] https://www.tensorflow.org/beta/guide/distribute_strategy#using_tfdistributestrategy_with_estimator

我已经折腾了好几天，问题到底出在哪里？

!pip install tensorflow==2.0.0-beta0
# NOTE(review): `fc` is not referenced anywhere in the visible snippet.
import tensorflow.feature_column as fc
import tensorflow as tf
import os

# Print the TensorFlow version the notebook runtime actually imported.
print(tf.__version__)

# gRPC address of the TPU worker, built from the COLAB_TPU_ADDR environment
# variable (raises KeyError if that variable is not set).
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
# Resolver pointing at that TPU worker address.
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_WORKER)
# Connect to the resolver's master, then initialize the TPU system, before the
# distribution strategy is created from the same resolver.
tf.config.experimental_connect_to_host(cluster_resolver.master())
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
# Estimator run configuration that uses the TPU strategy for both training and
# evaluation.
config = tf.estimator.RunConfig(train_distribute=tpu_strategy, eval_distribute=tpu_strategy)

# Number of examples per batch; read by the input function each time it runs.
batch_size = 1
def make_input_fn(X, y):
  """Return a zero-argument input function for an estimator.

  Args:
    X: features; converted with ``dict(X)`` when the input function runs.
    y: labels, sliced together with the features.

  Returns:
    A callable that takes no arguments and returns a ``tf.data.Dataset`` of
    ``(features, labels)`` slices grouped into batches of ``batch_size``.
  """
  def input_fn():
    features = dict(X)
    slices = tf.data.Dataset.from_tensor_slices((features, y))
    return slices.batch(batch_size)
  return input_fn

# NOTE(review): estimator_train_attributes_dictionary, labels_train and
# attibute_columns are not defined in the visible snippet; presumably they
# come from earlier notebook cells -- confirm.
input_fn = make_input_fn(estimator_train_attributes_dictionary,labels_train)
# Linear classifier built with the TPU-strategy RunConfig (`config`).
linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns,config=config)
# This call is the one the traceback below points at when the
# InvalidArgumentError is raised.
linear_est.train(input_fn=input_fn)

Colab 中的完整异常信息：

W0618 18:08:10.280844 140506166175616 estimator.py:1811] Using temporary folder as model directory: /tmp/tmp2xc1fixj
2.0.0-beta0
W0618 18:09:00.986362 140506166175616 tpu.py:218] 3 unsupported operations found: 
  ScalarSummary (bias)
  ScalarSummary (fraction_of_zero_weights)
  ScalarSummary (loss)
W0618 18:09:43.578035 140506166175616 tpu_strategy_util.py:57] TPU system %s has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1355     try:
-> 1356       return fn(*args)
   1357     except errors.OpError as e:

20 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
   1338       # Ensure any changes to the graph are reflected in the runtime.
-> 1339       self._extend_graph()
   1340       return self._call_tf_sessionrun(

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _extend_graph(self)
   1373     with self._graph._session_run_lock():  # pylint: disable=protected-access
-> 1374       tf_session.ExtendSession(self._session)
   1375 

InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by {{node input0}}with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
  <no registered kernels>

     [[input0]]

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-23-66caf93d8677> in <module>()
     25 
     26 linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns,config=config)#feature_columns=featureNames,,config=config
---> 27 linear_est.train(input_fn=input_fn)#,max_steps=100
     28 
     29 #train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=1000)

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    365 
    366       saving_listeners = _check_listeners_type(saving_listeners)
--> 367       loss = self._train_model(input_fn, hooks, saving_listeners)
    368       logging.info('Loss for final step: %s.', loss)
    369       return self

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
   1154   def _train_model(self, input_fn, hooks, saving_listeners):
   1155     if self._train_distribution:
-> 1156       return self._train_model_distributed(input_fn, hooks, saving_listeners)
   1157     else:
   1158       return self._train_model_default(input_fn, hooks, saving_listeners)

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model_distributed(self, input_fn, hooks, saving_listeners)
   1217       self._config._train_distribute.configure(self._config.session_config)
   1218       return self._actual_train_model_distributed(
-> 1219           self._config._train_distribute, input_fn, hooks, saving_listeners)
   1220     # pylint: enable=protected-access
   1221 

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _actual_train_model_distributed(self, strategy, input_fn, hooks, saving_listeners)
   1327         return self._train_with_estimator_spec(estimator_spec, worker_hooks,
   1328                                                hooks, global_step_tensor,
-> 1329                                                saving_listeners)
   1330 
   1331   def _train_with_estimator_spec_distributed(self, estimator_spec, worker_hooks,

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
   1478         save_summaries_steps=save_summary_steps,
   1479         config=self._session_config,
-> 1480         log_step_count_steps=log_step_count_steps) as mon_sess:
   1481       loss = None
   1482       any_step_done = False

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in MonitoredTrainingSession(master, is_chief, checkpoint_dir, scaffold, hooks, chief_only_hooks, save_checkpoint_secs, save_summaries_steps, save_summaries_secs, config, stop_grace_period_secs, log_step_count_steps, max_wait_secs, save_checkpoint_steps, summary_dir)
    582       session_creator=session_creator,
    583       hooks=all_hooks,
--> 584       stop_grace_period_secs=stop_grace_period_secs)
    585 
    586 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, stop_grace_period_secs)
   1005         hooks,
   1006         should_recover=True,
-> 1007         stop_grace_period_secs=stop_grace_period_secs)
   1008 
   1009 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, should_recover, stop_grace_period_secs)
    723         stop_grace_period_secs=stop_grace_period_secs)
    724     if should_recover:
--> 725       self._sess = _RecoverableSession(self._coordinated_creator)
    726     else:
    727       self._sess = self._coordinated_creator.create_session()

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, sess_creator)
   1198     """
   1199     self._sess_creator = sess_creator
-> 1200     _WrappedSession.__init__(self, self._create_session())
   1201 
   1202   def _create_session(self):

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in _create_session(self)
   1203     while True:
   1204       try:
-> 1205         return self._sess_creator.create_session()
   1206       except _PREEMPTION_ERRORS as e:
   1207         logging.info(

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
    869       """Creates a coordinated session."""
    870       # Keep the tf_sess for unit testing.
--> 871       self.tf_sess = self._session_creator.create_session()
    872       # We don't want coordinator to suppress any exception.
    873       self.coord = coordinator.Coordinator(clean_stop_exception_types=[])

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
    645         init_op=self._scaffold.init_op,
    646         init_feed_dict=self._scaffold.init_feed_dict,
--> 647         init_fn=self._scaffold.init_fn)
    648 
    649 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/session_manager.py in prepare_session(self, master, init_op, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config, init_feed_dict, init_fn)
    294                            "init_fn or local_init_op was given")
    295       if init_op is not None:
--> 296         sess.run(init_op, feed_dict=init_feed_dict)
    297       if init_fn:
    298         init_fn(sess)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    948     try:
    949       result = self._run(None, fetches, feed_dict, options_ptr,
--> 950                          run_metadata_ptr)
    951       if run_metadata:
    952         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1171     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1172       results = self._do_run(handle, final_targets, final_fetches,
-> 1173                              feed_dict_tensor, options, run_metadata)
   1174     else:
   1175       results = []

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1348     if handle is None:
   1349       return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1350                            run_metadata)
   1351     else:
   1352       return self._do_call(_prun_fn, handle, feeds, fetches)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1368           pass
   1369       message = error_interpolation.interpolate(message, self._graph)
-> 1370       raise type(e)(node_def, op, message)
   1371 
   1372   def _extend_graph(self):


InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by node input0 (defined at <ipython-input-23-66caf93d8677>:27) with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
  <no registered kernels>

     [[input0]]

Answer 1

在 Colab 中可以用 !pip 命令安装 TF 2.0，但 TPU 本身并不会随之切换到你所安装的 TF 版本。如果要在 Colab 中使用 TPU，请使用 Colab 默认提供的 TF 版本。

Colab TPU：TensorFlow 2.0.0-beta0 中 LinearClassifier.train 报错

1 个答案: