I want to try out the XLA backend on TensorFlow 1.1.0, which I built from source with XLA compiler support, running on Ubuntu 16.04. Without the XLA backend my model runs fine: a single training step takes about 0.8 seconds on my GTX 1080. However, when I enable the XLA compiler, the first call to session.run in my model hangs, with both CPU and GPU utilization near zero.
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
sess = tf.Session(config = config)
m = model.CharacterTranslator(sess, MAX_LENGTH)
m.init_variables()
best_cost = None
m.restore('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
while True:
    #session.run is called inside of here
    m.train(random.sample(training_data, 40000), 64, False)
    c = m.train(validation_data, 64, True)[0]
    if best_cost is None or c < best_cost:
        count = 0
        best_cost = c
        print('Saving...')
        m.save('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
    else:
        count += 1
        if count == 10:
            break
...
def train(self, training_data, batch_size, validate = False, verbose = True):
    total_cost = 0
    total_acc = 0
    total_time = 0
    last_chars = 0
    total_batches = len(training_data) // batch_size
    for i, batch in enumerate(_batch(training_data, batch_size, False)):
        x, y = zip(*batch)
        x, xl = zip(*[self._vectorize_sent(s) for s in x])
        y, yl = zip(*[self._vectorize_sent(s) for s in y])
        start_time = time.time()
        c, a, g, l, _ = self.session.run((self.cost, self.accuracy, self.global_step, self.learning_rate, self.null_train_step if validate else self.train_step), {
            self.source_text: x,
            self.target_text: y,
            self.target_length: yl,
            self.teacher_forcing: True,
        })
        end_time = time.time()
        total_cost += c
        total_acc += a
        total_time += end_time - start_time
        if verbose:
            msg = '%s b(%d / %d) g(%d) c(%e) a(%0.4f) lr(%e) dt(%0.2f)' % ('Validating' if validate else 'Training', i, total_batches, g, total_cost / (i + 1), total_acc / (i + 1), l, total_time / (i + 1))
            msg += ' ' * max(0, last_chars - len(msg))
            last_chars = len(msg)
            print(msg, end = '\r')
    if verbose:
        print()
    return total_cost / (i + 1), total_acc / (i + 1)
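
Since the stall happens inside that first session.run call, one thing that can make it fail loudly instead of blocking forever is passing a RunOptions with timeout_in_ms. Below is a minimal standalone sketch of that technique; the toy graph and the 5-minute limit are placeholder assumptions, not my real model:

import tensorflow as tf

# Diagnostic sketch: bound session.run with a timeout so a stalled step raises
# an error instead of waiting indefinitely.
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

x = tf.placeholder(tf.float32, [None, 4])
y = tf.reduce_sum(tf.square(x))

with tf.Session(config=config) as sess:
    opts = tf.RunOptions(timeout_in_ms=300000)  # give up after 5 minutes
    try:
        print(sess.run(y, {x: [[1.0, 2.0, 3.0, 4.0]]}, options=opts))
    except tf.errors.DeadlineExceededError:
        print('session.run did not finish within the timeout')

Applied to the train() method above, this would just mean adding the same options= argument to the existing self.session.run call.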
When I try to run the model itself, TensorFlow produces the following output.
2017-04-26 05:15:58.421388: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2017-04-26 05:15:58.421698: I tensorflow/core/common_runtime/gpu/gpu_device.cc:887] Found device 0 with properties:
name: GeForce GTX 1080
major: 6 minor: 1 memoryClockRate (GHz) 1.7335
pciBusID 0000:01:00.0
Total memory: 7.92GiB
Free memory: 7.33GiB
2017-04-26 05:15:58.421708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:908] DMA: 0
2017-04-26 05:15:58.421711: I tensorflow/core/common_runtime/gpu/gpu_device.cc:918] 0: Y
2017-04-26 05:15:58.421719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py:93: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2017-04-26 05:17:17.107616: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.107635: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108265: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xa103840 executing computations on platform Host. Devices:
2017-04-26 05:17:17.108274: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): <undefined>, <undefined>
2017-04-26 05:17:17.108393: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.108398: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108602: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xe383100 executing computations on platform CUDA. Devices:
2017-04-26 05:17:17.108607: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): GeForce GTX 1080, Compute Capability 6.1
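
To see whether the XLA JIT is still compiling at that point or has genuinely deadlocked, one cheap check is to raise TensorFlow's native logging verbosity before the library is loaded. A small sketch, where the chosen levels are only starting-point assumptions:

import os

# These must be set before tensorflow is imported, otherwise the C++ runtime
# has already initialised its logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'   # keep INFO-level messages
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'  # also emit VLOG(1)/VLOG(2) detail

import tensorflow as tf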
I then attached gdb to the process to see what it was doing. It looks like it is just sitting in a pthread condition wait.
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f715569291c in std::condition_variable::wait(std::unique_lock<std::mutex>&) ()
from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#2 0x00007f716d85257b in tensorflow::DirectSession::WaitForNotification(tensorflow::Notification*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#3 0x00007f716d85262d in tensorflow::DirectSession::WaitForNotification(tensorflow::DirectSession::RunState*, tensorflow::CancellationManager*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#4 0x00007f716d85d287 in tensorflow::DirectSession::Run(tensorflow::RunOptions const&, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#5 0x00007f716c3259d1 in TF_Run_Helper(tensorflow::Session*, char const*, TF_Buffer const*, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Tensor**, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Buffer*, TF_Status*) [clone .constprop.554] ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#6 0x00007f716c32639a in TF_Run ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#7 0x00007f716c0ab351 in tensorflow::TF_Run_wrapper_helper(TF_DeprecatedSession*, char const*, TF_Buffer const*, _object*, tensorflow::gtl::InlinedVector<char const*, 8> const&, tensorflow::gtl::InlinedVector<char const*, 8> const&, TF_Status*, tensorflow::gtl::InlinedVector<_object*, 8>*, TF_Buffer*) ()
Does anyone have any idea why my TensorFlow model gets stuck when the XLA backend is enabled?
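
One isolation experiment that might narrow this down is to drop the session-wide global_jit_level and instead mark only a small subgraph for XLA with tf.contrib.compiler.jit.experimental_jit_scope, assuming that contrib module is present in this 1.1.0 build. The tiny dense layer below is only a stand-in for the real model:

import tensorflow as tf
from tensorflow.contrib.compiler import jit

# Sketch: compile only a chosen subgraph with XLA instead of the whole graph,
# to find out which part of the model triggers the hang.
x = tf.placeholder(tf.float32, [None, 4])

with jit.experimental_jit_scope():
    # only ops created inside this scope are marked for XLA compilation
    hidden = tf.tanh(tf.layers.dense(x, 8))

loss = tf.reduce_mean(tf.square(hidden))

with tf.Session() as sess:  # note: no global_jit_level in this session's config
    sess.run(tf.global_variables_initializer())
    print(sess.run(loss, {x: [[1.0, 2.0, 3.0, 4.0]]}))

If a scoped-down graph like this compiles and runs, growing the scope piece by piece should point to the op or cluster that makes the first session.run hang.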