Failed to call cudnnRNNBackwardData: CUDNN_STATUS_INTERNAL_ERROR

Date: 2018-02-14 16:52:51

Tags: python tensorflow cudnn

System information

  • Have I written custom code: yes
  • OS platform and distribution: Ubuntu 16.04
  • TensorFlow installed from: binary
  • TensorFlow version: 1.4.1
  • Python version: 3.5.2
  • Bazel version: not compiled from source
  • GCC/compiler version: not compiled from source
  • CUDA/cuDNN version: 8.0 / v6
  • GPU model and memory: GeForce GTX 1080 (8 GB x 4)

The code starts training, but after an arbitrary number of iterations over batches it aborts with the following error:

2018-02-14 23:51:31.591963: E tensorflow/stream_executor/cuda/cuda_event.cc:49] Error polling for event status: failed to query event: CUDA_ERROR_ILLEGAL_ADDRESS
2018-02-14 23:51:31.592000: F tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc:203] Unexpected Event status: 1
2018-02-14 23:51:31.592023: E tensorflow/stream_executor/cuda/cuda_dnn.cc:1679] Failed to call cudnnRNNBackwardData: CUDNN_STATUS_INTERNAL_ERROR
Aborted (core dumped)

I call cudnn_gru inside a tf.while_loop control-flow op and define the initializers outside the tf.while_loop scope, because variables are not allowed to be instantiated inside the tf.while_loop scope:

import tensorflow as tf

gru_fw = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1, num_units=150, input_size=500)
gru_fw_1 = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1, num_units=150, input_size=1800)
e = tf.random_uniform([gru_fw.params_size()], -0.1, 0.1)
f = tf.random_uniform([gru_fw.params_size()], -0.1, 0.1)
g = tf.zeros([1, 4, 150])
h = tf.zeros([1, 4, 150])
zeros_i = tf.zeros([4, 150])

class cudnn_gru:
    def __init__(self, num_layers, num_units, batch_size, input_size, keep_prob=1.0, is_train=None, scope=None):
        self.num_layers = num_layers
        self.grus = []
        self.params = []
        self.inits = []
        self.dropout_mask = []
        for layer in range(num_layers):
            input_size_ = input_size if layer == 0 else 2 * num_units
            gru_fw = tf.contrib.cudnn_rnn.CudnnGRU(
                num_layers=1, num_units=num_units, input_size=input_size_)
            gru_bw = tf.contrib.cudnn_rnn.CudnnGRU(
                num_layers=1, num_units=num_units, input_size=input_size_)
            with tf.variable_scope('CUDNN_GRU', reuse=tf.AUTO_REUSE):
                param_fw = tf.get_variable("param_fw", initializer=e, validate_shape=False)
                param_bw = tf.get_variable("param_bw", initializer=f, validate_shape=False)
                init_fw = tf.get_variable("init_fw", initializer=g)
                init_bw = tf.get_variable("init_bw", initializer=h)
            # dropout masks are time-major ([1, batch, features]) because the cudnn
            # path transposes its inputs; dropout() is a project helper (see the
            # sketch after the listing)
            mask_fw = dropout(tf.ones([1, batch_size, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=None)
            mask_bw = dropout(tf.ones([1, batch_size, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=None)
            # store the per-layer objects that __call__ indexes into
            self.grus.append((gru_fw, gru_bw, ))
            self.params.append((param_fw, param_bw, ))
            self.inits.append((init_fw, init_bw, ))
            self.dropout_mask.append((mask_fw, mask_bw, ))
    def __call__(self, inputs, seq_len, keep_prob=1.0, is_train=None, concat_layers=True):
        outputs = [tf.transpose(inputs, [1, 0, 2])]
        for layer in range(self.num_layers):
            gru_fw, gru_bw = self.grus[layer]
            param_fw, param_bw = self.params[layer]
            init_fw, init_bw = self.inits[layer]
            mask_fw, mask_bw = self.dropout_mask[layer]
            with tf.variable_scope("fw"):
                out_fw, _ = gru_fw(outputs[-1] * mask_fw, init_fw, param_fw)
            with tf.variable_scope("bw"):
                inputs_bw = tf.reverse_sequence(
                    outputs[-1] * mask_bw, seq_lengths=seq_len, seq_dim=0, batch_dim=1)
                out_bw, _ = gru_bw(inputs_bw, init_bw, param_bw)
                out_bw = tf.reverse_sequence(
                    out_bw, seq_lengths=seq_len, seq_dim=0, batch_dim=1)
            outputs.append(tf.concat([out_fw, out_bw], axis=2))
        if concat_layers:
            res = tf.concat(outputs[1:], axis=2)
        else:
            res = outputs[-1]
        res = tf.transpose(res, [1, 0, 2])
        return res

class native_gru:
    def __init__(self, num_layers, num_units, batch_size, input_size, keep_prob=1.0, is_train=None, scope="native_gru"):
        self.num_layers = num_layers
        self.grus = []
        self.inits = []
        self.dropout_mask = []
        self.scope = scope
        for layer in range(num_layers):
            input_size_ = input_size if layer == 0 else 2 * num_units
            gru_fw = tf.contrib.rnn.GRUCell(num_units)
            gru_bw = tf.contrib.rnn.GRUCell(num_units)
            with tf.variable_scope('native_GRU', reuse=tf.AUTO_REUSE):
                init_fw = tf.get_variable("init_fw", initializer=zeros_i)
                init_bw = tf.get_variable("init_bw", initializer=zeros_i)

            #init_fw = tf.Variable(tf.zeros([batch_size, num_units]))
            #init_bw = tf.Variable(tf.zeros([batch_size, num_units]))
            mask_fw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=None)
            mask_bw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=None)
            self.grus.append((gru_fw, gru_bw, ))
            self.inits.append((init_fw, init_bw, ))
            self.dropout_mask.append((mask_fw, mask_bw, ))

    def __call__(self, inputs, seq_len, keep_prob=1.0, is_train=None, concat_layers=True):
        outputs = [inputs]
        with tf.variable_scope(self.scope):
            for layer in range(self.num_layers):
                gru_fw, gru_bw = self.grus[layer]
                init_fw, init_bw = self.inits[layer]
                mask_fw, mask_bw = self.dropout_mask[layer]
                with tf.variable_scope("fw_{}".format(layer)):
                    out_fw, _ = tf.nn.dynamic_rnn(
                        gru_fw, outputs[-1] * mask_fw, seq_len, initial_state=init_fw, dtype=tf.float32)
                with tf.variable_scope("bw_{}".format(layer)):
                    inputs_bw = tf.reverse_sequence(
                        outputs[-1] * mask_bw, seq_lengths=seq_len, seq_dim=1, batch_dim=0)
                    out_bw, _ = tf.nn.dynamic_rnn(
                        gru_bw, inputs_bw, seq_len, initial_state=init_bw, dtype=tf.float32)
                    out_bw = tf.reverse_sequence(
                        out_bw, seq_lengths=seq_len, seq_dim=1, batch_dim=0)
                outputs.append(tf.concat([out_fw, out_bw], axis=2))
        if concat_layers:
            res = tf.concat(outputs[1:], axis=2)
        else:
            res = outputs[-1]
        return res
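
Both classes build their masks with a dropout helper that is not included in the snippet above. For completeness, here is a minimal sketch of such a helper, assuming it just wraps tf.nn.dropout behind an is_train switch; the real helper in the project may behave differently (e.g. use a shared noise shape):

import tensorflow as tf

def dropout(args, keep_prob, is_train, mode=None):
    # Hypothetical stand-in for the project's dropout helper. With the default
    # keep_prob=1.0 used above it is a no-op; otherwise it applies tf.nn.dropout
    # only when the boolean tensor is_train is True. The `mode` argument is
    # accepted to match the call sites but ignored in this sketch.
    if keep_prob < 1.0:
        args = tf.cond(is_train,
                       lambda: tf.nn.dropout(args, keep_prob),
                       lambda: args)
    return args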

This did not happen before, when the project structure was sequential rather than wrapped in a control-flow mechanism. Also, if I switch the implementation to the native_gru class instead of cudnn_gru, it works perfectly fine, but since the plain GRU RNN is not optimized for the GPU it is 5 to 10 times slower.
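
For reference, a minimal sketch of the control-flow constraint mentioned above, i.e. why the initializers have to live outside the loop. This is an illustration only, not the actual training code; the names (w, x0, n, body) are invented for the example:

import tensorflow as tf

with tf.variable_scope("outside_loop", reuse=tf.AUTO_REUSE):
    # Created once, outside the loop; TF 1.x does not allow variables to be
    # instantiated inside a tf.while_loop body.
    w = tf.get_variable("w", initializer=tf.random_uniform([150, 150], -0.1, 0.1))

x0 = tf.zeros([4, 150])      # hypothetical loop state
n = tf.constant(3)           # hypothetical number of iterations

def body(i, x):
    # Only reads the pre-created variable; no variables are instantiated here.
    return i + 1, tf.tanh(tf.matmul(x, w))

_, x_final = tf.while_loop(lambda i, x: i < n, body, [tf.constant(0), x0])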

0 Answers:

No answers yet