我一直在尝试复制alexgraves手写合成模型,并且我使用tensorflow和在带cuda的1080Ti GPU上的python做到了这一点,
我准确地复制了论文中解释的所有功能,甚至将相应的渐变值剪裁到适当的位置,但是我在训练它时确实遇到了困难。
我还按照论文中说明的方式对数据进行了预处理,包括对X和y偏移量进行归一化,但是问题是,训练通常不能将负对数可能性降低到1000以上,在本文中达到- 1000,然后我看到NaN权重。
我要做的唯一一件事是在每个笔画的条件概率上加上0.0000001,以防止NaN值出现对数似然。
有关此任务的任何提示或建议或经验吗?
这是我使用的单元格代码,
class Custom_Cell(RNNCell):
def __init__(self,forget_bias,bias,one_hot_vector, hidden_layer_nums=[700,700,700], mixture_num=10, attention_num=4):
self.bias = bias
self.lstms = []
for i in hidden_layer_nums:
self.lstms.append(LSTMCell(num_units=i, initializer=tf.truncated_normal_initializer(0.075), dtype=tf.float32, forget_bias=forget_bias))
self.attention_num = attention_num
self.mixture_num = mixture_num
self.state_size = 2*sum(hidden_layer_nums) + 3*self.attention_num
self.attention_var_num = 3*self.attention_num
self.output_size = 6*self.mixture_num + 1 + 1
self.one_hot_vector = one_hot_vector
self.lstm_num = len(hidden_layer_nums)
self.hidden_layer_nums = hidden_layer_nums
temp_shape = self.one_hot_vector.shape
self.char_num = temp_shape[2]
self.i_to_h = []
self.w_to_h = []
self.h_to_h = []
self.prev_h_to_h = []
self.lstm_bias = []
self.lstm_to_attention_weights = tf.get_variable("lstms/first_to_attention_mtrx",shape=[hidden_layer_nums[0],self.attention_var_num],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True)
self.lstm_to_attention_bias = tf.get_variable("lstms/first_to_attention_bias",shape=[self.attention_var_num],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True)
self.all_to_output_mtrx = []
for i in range(self.lstm_num):
self.all_to_output_mtrx.append( tf.get_variable("lstms/to_output_mtrx_" + str(i), shape=[hidden_layer_nums[i],self.output_size-1],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.all_to_output_bias = tf.get_variable("lstms/output_bias",shape=[self.output_size-1],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True)
for i in range(self.lstm_num):
self.i_to_h.append(tf.get_variable("lstms/i_to_h_"+str(i),shape=[3,hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.w_to_h.append(tf.get_variable("lstms/w_to_h_"+str(i),shape=[self.char_num,hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.h_to_h.append(tf.get_variable("lstms/h_to_h_"+str(i),shape=[hidden_layer_nums[i],hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.lstm_bias.append(tf.get_variable("lstms/bias_" + str(i),shape=[hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
if not i == 0:
self.prev_h_to_h.append(
tf.get_variable("lstms/prev_h_to_h_" + str(i), shape=[hidden_layer_nums[i-1], hidden_layer_nums[i]],
dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.075),
trainable=True))
def __call__(self, inputs, state, scope=None):
# Extracting previous configuration and vectors
splitarray = []
for i in self.hidden_layer_nums:
splitarray.append(i)
splitarray.append(i)
splitarray.append(3*self.attention_num)
splitted = tf.split(state,splitarray,axis=1)
prev_tuples = []
for i in range(self.lstm_num):
newtuple = LSTMStateTuple(splitted[2*i],splitted[2*i + 1])
prev_tuples.append(newtuple)
prev_attention_vec = splitted[2*self.lstm_num]
new_attention_vec = 0
next_states = []
most_attended = 0
last_output = 0
for i in range(self.lstm_num):
prev_c, prev_h = prev_tuples[i]
cell = self.lstms[i]
if i == 0:
with tf.name_scope("layer_1"):
w, most_attended = self.gaussian_attention(self.one_hot_vector,prev_attention_vec)
input_vec = tf.matmul(inputs,self.i_to_h[0]) + tf.matmul(prev_h,self.h_to_h[0]) + tf.matmul(w,self.w_to_h[0]) + self.lstm_bias[0]
_, new_state = cell(input_vec, prev_tuples[0])
new_c, new_h = new_state
next_states.append(new_c)
next_states.append(new_h)
last_output = tf.matmul(new_h,self.all_to_output_mtrx[0])
with tf.name_scope("attention_layer"):
temp_attention = tf.matmul(new_h,self.lstm_to_attention_weights) + self.lstm_to_attention_bias
new_alpha, new_beta, new_kappa = tf.split(temp_attention,[self.attention_num,self.attention_num,self.attention_num],axis=1)
old_alpha, old_beta, old_kappa = tf.split(prev_attention_vec,[self.attention_num,self.attention_num,self.attention_num], axis=1)
new_alpha = tf.exp(new_alpha)
new_beta = tf.exp(new_beta)
new_kappa = tf.exp(new_kappa) + old_kappa
new_attention_vec = tf.concat([new_alpha,new_beta,new_kappa],axis=1)
else:
with tf.name_scope("layer_" + str(i)):
w, most_attended = self.gaussian_attention(self.one_hot_vector,new_attention_vec)
input_vec = tf.matmul(inputs,self.i_to_h[i]) + tf.matmul(next_states[-1],self.prev_h_to_h[i-1]) + tf.matmul(prev_h,self.h_to_h[i]) + tf.matmul(w,self.w_to_h[i]) + self.lstm_bias[i]
_,new_state = cell(input_vec,prev_tuples[i])
new_c, new_h = new_state
next_states.append(new_c)
next_states.append(new_h)
last_output = last_output + tf.matmul(new_h, self.all_to_output_mtrx[i])
with tf.name_scope("output"):
last_output = last_output + self.all_to_output_bias
next_states.append(new_attention_vec)
state_to_return = tf.concat(next_states,axis=1)
output_split_param = [1,self.mixture_num,2*self.mixture_num,2*self.mixture_num,self.mixture_num]
binomial_param, pi, mu, sigma, rho = tf.split(last_output,output_split_param,axis=1)
binomial_param = tf.divide(1.,1.+tf.exp(binomial_param))
pi = tf.nn.softmax(tf.multiply(pi,1.+self.bias),axis=1)
mu = mu
sigma = tf.exp(sigma-self.bias)
rho = tf.tanh(rho)
output_to_return = tf.concat([most_attended, binomial_param, pi, mu, sigma, rho],axis=1)
return output_to_return, state_to_return
def state_size(self):
return self.state_size
def output_size(self):
return self.output_size
def gaussian_attention(self,sequence,params):
with tf.name_scope("attention_calculation"):
alpha, beta, kappa = tf.split(params,[self.attention_num,self.attention_num,self.attention_num],axis=1)
seq_shape = sequence.shape
seq_length = seq_shape[1]
temp_vec = 20*np.asarray(range(seq_length),dtype=float)
final_result = 0
alpha = tf.split(alpha,self.attention_num,1)
beta = tf.split(beta,self.attention_num,1)
kappa = tf.split(kappa,self.attention_num,1)
for i in range(self.attention_num):
alpha_now = alpha[i]
beta_now = beta[i]
kappa_now = kappa[i]
result = kappa_now - temp_vec
result = tf.multiply(tf.square(result),tf.negative(beta_now))
result = tf.multiply(tf.exp(result),alpha_now)
final_result = final_result+result
most_attended = tf.argmax(final_result,axis=1)
most_attended = tf.reshape(tf.cast(most_attended,dtype=tf.float32),shape=[-1,1])
final_result = tf.tile(tf.reshape(final_result,[-1,seq_shape[1],1]),[1,1,seq_shape[2]])
to_return = tf.reduce_sum(tf.multiply(final_result,sequence),axis=1)
return to_return, most_attended
这是有损失的网络:
`to_write_one_hot = tf.placeholder(dtype=tf.float32,shape=(None,line_length,dict_length))
sequence = tf.placeholder(dtype=tf.float32,shape=(None,None,3))
sequence_shift = tf.placeholder(dtype=tf.float32,shape=(None,None,3))
bias = tf.placeholder(shape=[1],dtype=tf.float32)
sequence_length = tf.placeholder(shape=(None),dtype=tf.int32)
forget_bias_placeholder = tf.placeholder(shape=(None),dtype=tf.float32)
graves_cell = Custom_Cell(forget_bias=1,one_hot_vector=to_write_one_hot,hidden_layer_nums=hidden_layer_nums,mixture_num=mixture_num,bias=bias,attention_num=attention_num)
output, state = tf.nn.dynamic_rnn(graves_cell,sequence,dtype=tf.float32,sequence_length=sequence_length)
with tf.name_scope("loss_layer"):
mask = tf.sign(tf.reduce_max(tf.abs(output), 2))
most_attended, binomial_param, pi, mu, sigma, rho = tf.split(output,[1,1,mixture_num,2*mixture_num,2*mixture_num,mixture_num], axis=2)
pi = tf.split(pi,mixture_num,axis=2)
mu = tf.split(mu,mixture_num,axis=2)
sigma = tf.split(sigma,mixture_num,axis=2)
rho = tf.split(rho,mixture_num,axis=2)
negative_log_likelihood = 0
probability = 0
x1, x2, e = tf.split(sequence_shift,3,axis=2)
for i in range(mixture_num):
pi_now = pi[i]
mu_now = tf.split(mu[i],2,axis=2)
mu_1 = mu_now[0]
mu_2 = mu_now[1]
sigma_now = tf.split(sigma[i],2,axis=2)
sigma_1 = sigma_now[0] + (1-tf.reshape(mask, [-1,max_len,1]))
sigma_2 = sigma_now[1] + (1-tf.reshape(mask, [-1,max_len,1]))
rho_now = rho[i]
Z = tf.divide(tf.square(x1-mu_1),tf.square(sigma_1)) + tf.divide(tf.square(x2-mu_2),tf.square(sigma_2)) - tf.divide(tf.multiply(tf.multiply(x1-mu_1,x2-mu_2),2*rho_now),tf.multiply(sigma_1,sigma_2))
prob = tf.exp(tf.div(tf.negative(Z),2*(1-tf.square(rho_now))))
Normalizing_factor = 2*np.pi*tf.multiply(sigma_1,sigma_2)
Normalizing_factor = tf.multiply(Normalizing_factor,tf.sqrt(1-tf.square(rho_now)))
prob = tf.divide(prob,Normalizing_factor)
prob = tf.multiply(pi_now,prob)
probability = probability + prob
binomial_likelihood = tf.multiply(binomial_param,e) + tf.multiply(1-binomial_param,1-e)
probability = tf.multiply(probability,binomial_likelihood)
probability = probability + (1-tf.reshape(mask,[-1,max_len,1]))
temp_tensor = tf.multiply(mask, tf.log(tf.reshape(probability,[-1,max_len]) + mask*0.00001))
negative_log_likelihood_0 = tf.negative(tf.reduce_sum(temp_tensor,axis=1))
negative_log_likelihood_1 = tf.divide(negative_log_likelihood_0,tf.reshape(tf.cast(sequence_length, dtype=tf.float32), shape=[-1,1]))
negative_log_likelihood_1 = tf.reduce_mean(negative_log_likelihood_1)
tf.summary.scalar("average_per_timestamp_log_likelihood", negative_log_likelihood_1)
negative_log_likelihood = tf.reduce_mean(negative_log_likelihood_0)
with tf.name_scope("train_op"):
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001,momentum=0.9, decay=0.95,epsilon=0.0001)
gvs = optimizer.compute_gradients(negative_log_likelihood)
capped_gvs = []
for grad, var in gvs:
if var.name.__contains__("rnn"):
capped_gvs.append((tf.clip_by_value(grad,-10,10),var))
else:
capped_gvs.append((tf.clip_by_value(grad,-100,100),var))
train_op = optimizer.apply_gradients(capped_gvs)
`
编辑1。我发现我以错误的方式裁剪了梯度,正确的方法是引入一个新的“ op”,如https://github.com/tensorflow/tensorflow/issues/2793所述,它仅裁剪了整个网络和lstm单元的输出渐变。
@tf.custom_gradient
def clip_gradient(x, clip):
def grad(dresult):
return [tf.clip_by_norm(dresult, clip)]
return x, grad
在代码中添加以上几行,并对要在反向传播中剪切渐变的任何变量使用该函数!
我仍然应该看到我的结果。
编辑2。 更改后的型号代码为:
from tensorflow.contrib.rnn import RNNCell
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.contrib.rnn import LSTMStateTuple
import tensorflow as tf
import numpy as np
@tf.custom_gradient
def clip_gradient_lstm(x):
def grad(dresult):
return [tf.clip_by_value(dresult,-10,10)]
return x, grad
@tf.custom_gradient
def clip_gradient_output(x):
def grad(dresult):
return [tf.clip_by_value(dresult,-100,100)]
return x, grad
def length_of(seq):
used = tf.sign(tf.reduce_max(tf.abs(seq),axis=2))
length = tf.reduce_sum(used,1)
length = tf.cast(length,tf.int32)
return length
class Custom_Cell(RNNCell):
def __init__(self,forget_bias,bias,one_hot_vector, hidden_layer_nums=[700,700,700], mixture_num=10, attention_num=4):
self.bias = bias
self.lstms = []
for i in hidden_layer_nums:
self.lstms.append(LSTMCell(num_units=i, initializer=tf.truncated_normal_initializer(0.075), dtype=tf.float32, forget_bias=forget_bias))
self.attention_num = attention_num
self.mixture_num = mixture_num
self.state_size = 2*sum(hidden_layer_nums) + 3*self.attention_num
self.attention_var_num = 3*self.attention_num
self.output_size = 6*self.mixture_num + 1 + 1
self.one_hot_vector = one_hot_vector
self.lstm_num = len(hidden_layer_nums)
self.hidden_layer_nums = hidden_layer_nums
temp_shape = self.one_hot_vector.shape
self.char_num = temp_shape[2]
self.i_to_h = []
self.w_to_h = []
self.h_to_h = []
self.prev_h_to_h = []
self.lstm_bias = []
self.lstm_to_attention_weights = tf.get_variable("lstms/first_to_attention_mtrx",shape=[hidden_layer_nums[0],self.attention_var_num],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True)
self.lstm_to_attention_bias = tf.get_variable("lstms/first_to_attention_bias",shape=[self.attention_var_num],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True)
self.all_to_output_mtrx = []
for i in range(self.lstm_num):
self.all_to_output_mtrx.append( tf.get_variable("lstms/to_output_mtrx_" + str(i), shape=[hidden_layer_nums[i],self.output_size-1],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.all_to_output_bias = tf.get_variable("lstms/output_bias",shape=[self.output_size-1],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True)
for i in range(self.lstm_num):
self.i_to_h.append(tf.get_variable("lstms/i_to_h_"+str(i),shape=[3,hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.w_to_h.append(tf.get_variable("lstms/w_to_h_"+str(i),shape=[self.char_num,hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.h_to_h.append(tf.get_variable("lstms/h_to_h_"+str(i),shape=[hidden_layer_nums[i],hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
self.lstm_bias.append(tf.get_variable("lstms/bias_" + str(i),shape=[hidden_layer_nums[i]],dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=0.075),trainable=True))
if not i == 0:
self.prev_h_to_h.append(
tf.get_variable("lstms/prev_h_to_h_" + str(i), shape=[hidden_layer_nums[i-1], hidden_layer_nums[i]],
dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.075),
trainable=True))
def __call__(self, inputs, state, scope=None):
# Extracting previous configuration and vectors
splitarray = []
for i in self.hidden_layer_nums:
splitarray.append(i)
splitarray.append(i)
splitarray.append(3*self.attention_num)
splitted = tf.split(state,splitarray,axis=1)
prev_tuples = []
for i in range(self.lstm_num):
newtuple = LSTMStateTuple(splitted[2*i],splitted[2*i + 1])
prev_tuples.append(newtuple)
prev_attention_vec = splitted[2*self.lstm_num]
new_attention_vec = 0
next_states = []
most_attended = 0
last_output = 0
for i in range(self.lstm_num):
prev_c, prev_h = prev_tuples[i]
cell = self.lstms[i]
if i == 0:
with tf.name_scope("layer_1"):
w, most_attended = self.gaussian_attention(self.one_hot_vector,prev_attention_vec)
input_vec = tf.matmul(inputs,self.i_to_h[0]) + tf.matmul(prev_h,self.h_to_h[0]) + tf.matmul(w,self.w_to_h[0]) + self.lstm_bias[0]
_, new_state = cell(input_vec, prev_tuples[0])
new_c, new_h = new_state
new_h = clip_gradient_lstm(new_h)
next_states.append(new_c)
next_states.append(new_h)
last_output = tf.matmul(new_h,self.all_to_output_mtrx[0])
with tf.name_scope("attention_layer"):
temp_attention = tf.matmul(new_h,self.lstm_to_attention_weights) + self.lstm_to_attention_bias
new_alpha, new_beta, new_kappa = tf.split(temp_attention,[self.attention_num,self.attention_num,self.attention_num],axis=1)
old_alpha, old_beta, old_kappa = tf.split(prev_attention_vec,[self.attention_num,self.attention_num,self.attention_num], axis=1)
new_alpha = tf.exp(new_alpha)
new_beta = tf.exp(new_beta)
new_kappa = tf.exp(new_kappa) + old_kappa
new_attention_vec = tf.concat([new_alpha,new_beta,new_kappa],axis=1)
else:
with tf.name_scope("layer_" + str(i)):
w, most_attended = self.gaussian_attention(self.one_hot_vector,new_attention_vec)
input_vec = tf.matmul(inputs,self.i_to_h[i]) + tf.matmul(next_states[-1],self.prev_h_to_h[i-1]) + tf.matmul(prev_h,self.h_to_h[i]) + tf.matmul(w,self.w_to_h[i]) + self.lstm_bias[i]
_,new_state = cell(input_vec,prev_tuples[i])
new_c, new_h = new_state
new_h = clip_gradient_lstm(new_h)
next_states.append(new_c)
next_states.append(new_h)
last_output = last_output + tf.matmul(new_h, self.all_to_output_mtrx[i])
with tf.name_scope("output"):
last_output = last_output + self.all_to_output_bias
last_output = clip_gradient_output(last_output)
next_states.append(new_attention_vec)
state_to_return = tf.concat(next_states,axis=1)
output_split_param = [1,self.mixture_num,2*self.mixture_num,2*self.mixture_num,self.mixture_num]
binomial_param, pi, mu, sigma, rho = tf.split(last_output,output_split_param,axis=1)
binomial_param = tf.divide(1.,1.+tf.exp(binomial_param))
pi = tf.nn.softmax(tf.multiply(pi,1.+self.bias),axis=1)
mu = mu
sigma = tf.exp(sigma-self.bias)
rho = tf.tanh(rho)
output_to_return = tf.concat([most_attended, binomial_param, pi, mu, sigma, rho],axis=1)
return output_to_return, state_to_return
def state_size(self):
return self.state_size
def output_size(self):
return self.output_size
def gaussian_attention(self,sequence,params):
with tf.name_scope("attention_calculation"):
alpha, beta, kappa = tf.split(params,[self.attention_num,self.attention_num,self.attention_num],axis=1)
seq_shape = sequence.shape
seq_length = seq_shape[1]
temp_vec = np.asarray(range(seq_length),dtype=float)
final_result = 0
alpha = tf.split(alpha,self.attention_num,1)
beta = tf.split(beta,self.attention_num,1)
kappa = tf.split(kappa,self.attention_num,1)
for i in range(self.attention_num):
alpha_now = alpha[i]
beta_now = beta[i]
kappa_now = kappa[i]
result = kappa_now - temp_vec
result = tf.multiply(tf.square(result),tf.negative(beta_now))
result = tf.multiply(tf.exp(result),alpha_now)
final_result = final_result+result
most_attended = tf.argmax(final_result,axis=1)
most_attended = tf.reshape(tf.cast(most_attended,dtype=tf.float32),shape=[-1,1])
final_result = tf.tile(tf.reshape(final_result,[-1,seq_shape[1],1]),[1,1,seq_shape[2]])
to_return = tf.reduce_sum(tf.multiply(final_result,sequence),axis=1)
return to_return, most_attended
培训由
完成 with tf.name_scope("train_op"):
optimizer =
tf.train.RMSPropOptimizer(learning_rate=0.0001,momentum=0.9, decay=0.95,epsilon=0.0001,centered=True)
train_op = optimizer.minimize(negative_log_likelihood)
现在仍在训练中,但现在低至-10。