Question

我尝试用一个内部包含RNN层的RNN cell来实现嵌套RNN：在这个例子中，我把一个GRU层放在自定义的RNN cell里。模型可以正常训练并保存，但是当我使用GPU加载模型时，会报错“No node-device colocations were active during op ... creation”（完整的错误日志见下方代码中的 error log）。而如果使用CPU加载同一个文件，则可以成功加载并继续训练。也就是说，该模型只能在GPU上训练一次，之后只能使用CPU进行加载或训练。

ENV：

Tensorflow-GPU 1.15.0
CUDA V10.0.130
python 3.6.5
Keras 2.3.1
Keras-Applications 1.0.8
Keras-Preprocessing 1.1.0

这些是我尝试过的。

仅加载权重（失败）
使用gpu加载整个模型（失败）
使用cpu加载整个模型（成功）
使用gpu加载json模型+权重（失败）
使用cpu加载json模型+权重（成功）

import numpy as np
import keras
import tensorflow as tf
from keras.layers import Input, GRU, RNN
from keras.models import Model, load_model, model_from_json
import keras.backend as K
import time



# Dimensions shared by the model definition and the dummy data below.
lookback = 10  # multiplied with input_feature to form the per-step feature width
input_feature = 7  # multiplied with lookback to form the per-step feature width
latent_dim = 9  # passed to SystemCell as `latent` (unit count of its inner GRU)
number_of_point = 10  # first (time) dimension of the model's Input shape

# simple nested RNN cell
class SystemCell(keras.layers.Layer):
    """RNN cell that runs an inner GRU layer on every outer time step.

    The cell is meant to be wrapped by ``keras.layers.RNN``. At each step it
    reshapes the flat step input into a sequence, feeds it through the inner
    GRU and returns the GRU sequence as the step output. The recurrent state
    is passed through unchanged.

    Args:
        latent: number of units of the inner GRU layer.
        size: value exposed as ``state_size`` to the wrapping RNN layer.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, latent, size=7, **kwargs):
        super(SystemCell, self).__init__(**kwargs)
        self.state_size = size
        self.latent = latent
        self.gru_layer = GRU(self.latent, activation='relu', return_state=False, return_sequences=True, name='GRU_Inside')

    def call(self, inputs, states):
        """Run one outer step.

        Returns a pair ``(step_output, [state])`` where ``step_output`` is the
        full sequence produced by the inner GRU and ``state`` is the incoming
        state, returned untouched.
        """
        prev_action = states[0]

        # state representation: unflatten the step input into a sequence for
        # the inner GRU.
        # NOTE(review): the hard-coded (10, 7) presumably corresponds to
        # (lookback, input_feature) of the script -- confirm before reuse.
        step_sequence = K.reshape(inputs, (-1, 10, 7))
        gru_state = self.gru_layer(step_sequence)

        return gru_state, [prev_action]

    def get_config(self):
        """Return the constructor arguments needed to rebuild the cell."""
        config = super(SystemCell, self).get_config()
        config['latent'] = self.latent
        # Without `size` a reloaded cell silently falls back to the default
        # state_size, whatever the original was built with.
        config['size'] = self.state_size
        return config

# some random testing loss
def los(y_true, y_pred):
    """Sum `y_pred` over its last two axes and subtract `y_true`."""
    inner_sum = tf.reduce_sum(y_pred, axis=-1)
    outer_sum = tf.reduce_sum(inner_sum, -1)
    return outer_sum - y_true

# testing data: all-ones dummy targets and inputs for 2 samples.
# The time dimension reuses number_of_point so the arrays stay in sync with
# the Input shape of the model instead of repeating the literal 10.
output = np.ones(shape=(2, number_of_point))
data = np.ones(shape=(2, number_of_point, input_feature * lookback))


# '''
######################################
# create model, train, and save model
######################################
with tf.device('/gpu:0'):
    # set up model: a single RNN layer driving the nested SystemCell
    in1 = Input(shape=(number_of_point,input_feature*lookback))
    out = RNN(SystemCell(latent_dim), return_sequences=True)(in1)
    M =Model(inputs=[in1], outputs=[out])
    M.compile(keras.optimizers.Adam(learning_rate=0.001), loss=los)

    # train and report the wall-clock duration of fit()
    t = time.time()
    M.fit(x=data, y=output, batch_size=2, epochs=10)
    print(time.time()-t)

    #############################################
    # save model
    #############################################
    # full model (architecture + weights + optimizer state)
    M.save('testgru.h5')
    # architecture only, as JSON
    model_json = M.to_json()
    with open("testgru.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    M.save_weights("testgru_weight.h5")
    print("Saved model to disk")
    # The graph is added to the event file when the writer is constructed;
    # close() flushes it to disk and releases the file handle instead of
    # leaving that to interpreter shutdown.
    tf.summary.FileWriter('./log/', tf.get_default_graph()).close()
    #'''


# '''
# del M
#################################
# case1 load only weight
#################################

with tf.device('/gpu:0'):
    # Disabled repro: the snippet is wrapped in a bare string literal so it
    # does not run. It rebuilds the model and loads only the saved weights.
    '''
    in1 = Input(shape=(number_of_point,input_feature*lookback))
    out = RNN(SystemCell(latent_dim), return_sequences=True)(in1)

    M =Model(inputs=[in1], outputs=[out])
    M.compile(keras.optimizers.Adam(learning_rate=0.001), loss=los)
    M.load_weights("testgru_weight.h5")
    M.fit(x=data, y=output, batch_size=2, epochs=100)
    M.save('testgru2.h5')
    # '''
    ##############
    # error log
    ##############
    '''
    [[node rnn_2/while/GRU_Inside/while/Identity_1 (defined at D:\Program Files\Python36\lib\site-packages\tensorflow_core\python\framework\ops.py:1748) ]]Additional information about colocations:No node-device colocations were active during op 'rnn_2/while/GRU_Inside/while/Identity_1' creation.
Device assignments active during op 'rnn_2/while/GRU_Inside/while/Identity_1' creation:
  with tf.device(/gpu:0):
    '''

#################################
# case2 load whole model using gpu
#################################
with tf.device('/gpu:0'):
    # '''
    # Names Keras cannot resolve on its own when deserializing the model:
    # the custom cell class and the custom loss function.
    ob = dict(SystemCell=SystemCell, los=los)

    # Restore the full model from HDF5, train it further, and save it again.
    M = load_model('testgru.h5', custom_objects=ob)
    M.fit(x=data, y=output, batch_size=2, epochs=100)
    M.save('testgru2.h5')

    # error log ( same as case1)
    # '''

#################################
# case3 load whole model using cpu (success)
#################################
with tf.device('/cpu:0'):
    # Disabled variant: the snippet is wrapped in a bare string literal so it
    # does not run. It loads the whole saved model under the CPU device scope,
    # trains it for 10 epochs and saves it again.
    '''
    ob = {'SystemCell': SystemCell,
          'los': los}
    M = load_model('testgru.h5', ob)
    M.fit(x=data, y=output, batch_size=2, epochs=10)
    M.save('testgru2.h5')
    # '''

#################################
# case4 load json model + weight using gpu
#################################
with tf.device('/gpu:0'):
    # Disabled variant: the snippet is wrapped in a bare string literal so it
    # does not run. It rebuilds the architecture from the JSON file, compiles
    # it, loads the saved weights and trains under the GPU device scope.
    # NOTE(review): the snippet opens testgru.json without a with-block.
    '''
    ob = {'SystemCell': SystemCell}

    json_file = open('testgru.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    M = model_from_json(loaded_model_json, custom_objects=ob)
    M.compile(keras.optimizers.Adam(learning_rate=0.001), loss=los)
    M.load_weights("testgru_weight.h5")
    M.fit(x=data, y=output, batch_size=2, epochs=10)
    M.save('testgru2.h5')
    #'''

    # error log same as case 1

#################################
# case5 load json model + weight using cpu (success)
#################################
with tf.device('/cpu:0'):
    # Disabled variant: the snippet is wrapped in a bare string literal so it
    # does not run. It rebuilds the architecture from the JSON file, compiles
    # it, loads the saved weights and trains under the CPU device scope.
    # NOTE(review): the snippet opens testgru.json without a with-block.
    '''
    ob = {'SystemCell': SystemCell}

    json_file = open('testgru.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    M = model_from_json(loaded_model_json, custom_objects=ob)
    M.compile(keras.optimizers.Adam(learning_rate=0.001), loss=los)
    M.load_weights("testgru_weight.h5")
    M.fit(x=data, y=output, batch_size=2, epochs=10)
    M.save('testgru2.h5')
    # '''

Keras嵌套RNN模型只能在GPU上训练一次：保存模型并重新加载后，只能在CPU上加载或训练

0 个答案: