I want to try out the XLA backend on TensorFlow 1.1.0, which I built from source with XLA compiler support, running on Ubuntu 16.04. Without the XLA backend my model runs fine: a single training step takes about 0.8 seconds on my GTX 1080. However, when I enable the XLA compiler, the first call to session.run in my model hangs, with both CPU and GPU utilization near zero.
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
sess = tf.Session(config = config)
m = model.CharacterTranslator(sess, MAX_LENGTH)
m.init_variables()
best_cost = None
m.restore('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
while True:
    #session.run is called inside of here
    m.train(random.sample(training_data, 40000), 64, False)
    c = m.train(validation_data, 64, True)[0]
    if best_cost is None or c < best_cost:
        count = 0
        best_cost = c
        print('Saving...')
        m.save('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
    else:
        count += 1
        if count == 10:
            break
...
def train(self, training_data, batch_size, validate = False, verbose = True):
    total_cost = 0
    total_acc = 0
    total_time = 0
    last_chars = 0
    total_batches = len(training_data) // batch_size
    for i, batch in enumerate(_batch(training_data, batch_size, False)):
        x, y = zip(*batch)
        x, xl = zip(*[self._vectorize_sent(s) for s in x])
        y, yl = zip(*[self._vectorize_sent(s) for s in y])
        start_time = time.time()
        c, a, g, l, _ = self.session.run((self.cost, self.accuracy, self.global_step, self.learning_rate, self.null_train_step if validate else self.train_step), {
            self.source_text: x,
            self.target_text: y,
            self.target_length: yl,
            self.teacher_forcing: True,
        })
        end_time = time.time()
        total_cost += c
        total_acc += a
        total_time += end_time - start_time
        if verbose:
            msg = '%s b(%d / %d) g(%d) c(%e) a(%0.4f) lr(%e) dt(%0.2f)' % ('Validating' if validate else 'Training', i, total_batches, g, total_cost / (i + 1), total_acc / (i + 1), l, total_time / (i + 1))
            msg += ' ' * max(0, last_chars - len(msg))
            last_chars = len(msg)
            print(msg, end = '\r')
    if verbose:
        print()
    return total_cost / (i + 1), total_acc / (i + 1)
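
Since the stall happens inside that first session.run call, one thing that can make it fail loudly instead of blocking forever is passing a RunOptions with timeout_in_ms. Below is a minimal standalone sketch of that technique; the toy graph and the 5-minute limit are placeholder assumptions, not my real model:

import tensorflow as tf

# Diagnostic sketch: bound session.run with a timeout so a stalled step raises
# an error instead of waiting indefinitely.
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

x = tf.placeholder(tf.float32, [None, 4])
y = tf.reduce_sum(tf.square(x))

with tf.Session(config=config) as sess:
    opts = tf.RunOptions(timeout_in_ms=300000)  # give up after 5 minutes
    try:
        print(sess.run(y, {x: [[1.0, 2.0, 3.0, 4.0]]}, options=opts))
    except tf.errors.DeadlineExceededError:
        print('session.run did not finish within the timeout')

Applied to the train() method above, this would just mean adding the same options= argument to the existing self.session.run call.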
When I try to run the model itself, TensorFlow produces the following output.
2017-04-26 05:15:58.421388: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2017-04-26 05:15:58.421698: I tensorflow/core/common_runtime/gpu/gpu_device.cc:887] Found device 0 with properties:
name: GeForce GTX 1080
major: 6 minor: 1 memoryClockRate (GHz) 1.7335
pciBusID 0000:01:00.0
Total memory: 7.92GiB
Free memory: 7.33GiB
2017-04-26 05:15:58.421708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:908] DMA: 0
2017-04-26 05:15:58.421711: I tensorflow/core/common_runtime/gpu/gpu_device.cc:918] 0: Y
2017-04-26 05:15:58.421719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py:93: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2017-04-26 05:17:17.107616: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.107635: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108265: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xa103840 executing computations on platform Host. Devices:
2017-04-26 05:17:17.108274: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): <undefined>, <undefined>
2017-04-26 05:17:17.108393: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.108398: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108602: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xe383100 executing computations on platform CUDA. Devices:
2017-04-26 05:17:17.108607: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): GeForce GTX 1080, Compute Capability 6.1
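
To see whether the XLA JIT is still compiling at that point or has genuinely deadlocked, one cheap check is to raise TensorFlow's native logging verbosity before the library is loaded. A small sketch, where the chosen levels are only starting-point assumptions:

import os

# These must be set before tensorflow is imported, otherwise the C++ runtime
# has already initialised its logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'   # keep INFO-level messages
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'  # also emit VLOG(1)/VLOG(2) detail

import tensorflow as tf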
I then attached gdb to the process to see what it was doing. It looks like it is just sitting in a pthread condition wait.
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f715569291c in std::condition_variable::wait(std::unique_lock<std::mutex>&) ()
from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#2 0x00007f716d85257b in tensorflow::DirectSession::WaitForNotification(tensorflow::Notification*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#3 0x00007f716d85262d in tensorflow::DirectSession::WaitForNotification(tensorflow::DirectSession::RunState*, tensorflow::CancellationManager*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#4 0x00007f716d85d287 in tensorflow::DirectSession::Run(tensorflow::RunOptions const&, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#5 0x00007f716c3259d1 in TF_Run_Helper(tensorflow::Session*, char const*, TF_Buffer const*, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Tensor**, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Buffer*, TF_Status*) [clone .constprop.554] ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#6 0x00007f716c32639a in TF_Run ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#7 0x00007f716c0ab351 in tensorflow::TF_Run_wrapper_helper(TF_DeprecatedSession*, char const*, TF_Buffer const*, _object*, tensorflow::gtl::InlinedVector<char const*, 8> const&, tensorflow::gtl::InlinedVector<char const*, 8> const&, TF_Status*, tensorflow::gtl::InlinedVector<_object*, 8>*, TF_Buffer*) ()
Does anyone have any idea why my TensorFlow model gets stuck when the XLA backend is enabled?
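
One isolation experiment that might narrow this down is to drop the session-wide global_jit_level and instead mark only a small subgraph for XLA with tf.contrib.compiler.jit.experimental_jit_scope, assuming that contrib module is present in this 1.1.0 build. The tiny dense layer below is only a stand-in for the real model:

import tensorflow as tf
from tensorflow.contrib.compiler import jit

# Sketch: compile only a chosen subgraph with XLA instead of the whole graph,
# to find out which part of the model triggers the hang.
x = tf.placeholder(tf.float32, [None, 4])

with jit.experimental_jit_scope():
    # only ops created inside this scope are marked for XLA compilation
    hidden = tf.tanh(tf.layers.dense(x, 8))

loss = tf.reduce_mean(tf.square(hidden))

with tf.Session() as sess:  # note: no global_jit_level in this session's config
    sess.run(tf.global_variables_initializer())
    print(sess.run(loss, {x: [[1.0, 2.0, 3.0, 4.0]]}))

If a scoped-down graph like this compiles and runs, growing the scope piece by piece should point to the op or cluster that makes the first session.run hang.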