该程序读取文本文件RNNtext.txt,为所有数据创建单热矢量表示,用数据训练LSTM并不时地显示一堆采样字符。然而,即使看cost vs iterations graph,也表明它的学习效率非常低。老实说,我的LSTM的原始代码(numpy)做得更好。它不仅速度更快,而且可以产生大多数有意义的单词。这只会产生乱码。我的错误在哪里?我真的没有想法,我似乎无法找到它在逻辑上的错误。
import numpy as np
import random
import tensorflow as tf
import os
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# Reading RNNtext.txt file
direc = os.path.dirname(os.path.realpath(__file__))
data = open(direc + "/RNNtext.txt", "r").read()
# Array of unique characters
chars = list(set(data))
num_hidden = 80
iterations = 1000
display_iteration = 100 # Sample when iteration % display_iteration == 0
sample_size = 250
batch_size = 120 # batch size or the number of time steps to unroll RNN
alpha = 0.01 # Learning rate
#Vocabulary and text file sizes
vocab_size = len(chars)
data_size = len(data)
# Bijection from a unique character to an index
char_to_ix = {}
# Bijection from an index to a unique character
ix_to_char = {}
for j in range(vocab_size):
char_to_ix[chars[j]] = j
ix_to_char[j] = chars[j]
# Transforming all characters to indices
data_ix = [char_to_ix[ch] for ch in data]
train_data = [] # This will contain one-hot vectors
for k in range(data_size):
# Representing each index/character by a one-hot vector
hot1 = np.zeros((vocab_size, 1))
hot1[data_ix[k]] = 1
train_data.append(hot1)
X = tf.placeholder(tf.float32, [None, vocab_size, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, vocab_size])
cell = tf.contrib.rnn.LSTMCell(num_hidden,state_is_tuple=True)
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
output = tf.transpose(output, [1, 0, 2])
weight = tf.Variable(tf.random_normal([num_hidden, vocab_size]))
bias = tf.Variable(tf.constant(0.0, shape=[vocab_size]))
prediction = tf.matmul(output[-1], weight) + bias
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=target))
optimizer = tf.train.ProximalGradientDescentOptimizer(alpha)
minimize = optimizer.minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
ARR = [i for i in range(vocab_size)] # for extracting index by probabilities in np.random.choice()
ITER = []
COST = []
p = 0 # p will be iterated by batch_size steps
for i in range(iterations):
if p + batch_size >= data_size:
p = 0
# sweeping through data one-hot vectors
inp, out = train_data[p:p+batch_size], train_data[p+1:p+batch_size+1]
out = np.reshape(out, [-1, vocab_size])
c = sess.run(cost, {X: inp, target: out}) # calculating cost for plotting later
COST.append(c)
ITER.append(i)
sess.run(minimize, {X: inp, target: out})
# displaying sample_size number of characters with random seed
# doesn't affect training
if i % display_iteration == 0:
seed = np.random.randint(0, vocab_size)
CHARS = []
for j in range(sample_size):
x = np.zeros((vocab_size, 1))
x[seed] = 1
x = [x]
pred = sess.run(prediction, {X: x})[0]
pred = np.exp(pred) / np.sum(np.exp(pred))
pred = pred.ravel()
seed = np.random.choice(ARR, 1, p = pred)[0]
ch = ix_to_char[seed]
CHARS.append(ch)
TXT = ''.join(CHARS)
print("-------------------------------------------------")
print(TXT)
print("Iteration: ", str(i))
p += batch_size
sess.close()
plt.plot(ITER, COST)
plt.show()
编辑:添加了numpy代码进行比较
import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('fivethirtyeight')
direc = os.path.dirname(os.path.realpath(__file__))
readFile = open(direc + "\RNNtext.txt", 'r')
data = readFile.read()
readFile.close()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(chars)
print("Vocabulary size: " + str(vocab_size))
char_to_ix = {}
ix_to_char = {}
for j in range(len(chars)):
char_to_ix[chars[j]] = j
ix_to_char[j] = chars[j]
hidden_size = 80
batch_size = 120
alpha = 0.1
sample_size = 250
iterations = 1000
display_iteration = 100
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
def sample(hid, seed, weights, sample_size):
X = np.zeros((vocab_size, 1))
X[seed] = 1
CHARS = []
ARR = [i for i in range(vocab_size)]
for t in range(sample_size):
hid = np.tanh(np.dot(Wxh, X) + np.dot(Whh, hid) + bh)
y = np.dot(Why, hid) + by
prob = np.exp(y) / np.sum(np.exp(y))
prob = prob.ravel()
ix = np.random.choice(ARR, 1, p=prob)[0]
CHARS.append(ix_to_char[ix])
X = np.zeros((vocab_size, 1))
X[ix] = 1
TXT = ''.join(CHARS)
return TXT
LOSS = []
ITER = []
p = 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*batch_size # loss at iteration 0
hprev = np.zeros((hidden_size,1))
for i in range(iterations): ## just time passing by
dWxh = np.zeros_like(Wxh)
dWhh = np.zeros_like(Whh)
dWhy = np.zeros_like(Why)
dbh = np.zeros_like(bh)
dby = np.zeros_like(by)
if p+batch_size >= len(data) or i == 0:
hprev = np.zeros((hidden_size,1))
p = 0
inputs = [char_to_ix[ch] for ch in data[p:p+batch_size]]
targets = [char_to_ix[ch] for ch in data[p+1:p+batch_size+1]]
HID = {}
X = {}
Y = {}
P = {}
HID[-1] = np.copy(hprev)
loss = 0
##======FORWARD======##
for t in range(len(inputs)):
X[t] = np.zeros((vocab_size,1))
X[t][inputs[t]] = 1
HID[t] = np.tanh(np.dot(Wxh, X[t]) + np.dot(Whh, HID[t-1]) + bh) # inp -> X
Y[t] = np.dot(Why, HID[t]) + by # tanh
P[t] = np.exp(Y[t]) / np.sum(np.exp(Y[t]))
loss += -np.log(P[t][targets[t]][0])
dhnext = np.zeros_like(HID[0])
##======BACKPROP======##
for t in reversed(range(len(inputs))):
dy = np.copy(P[t])
dy[targets[t]] -= 1
dh = (np.dot(Why.T, dy) + dhnext)*(1-HID[t]*HID[t])
dx = np.dot(Why.T, dy)*(1 - HID[t]**2)
dWhy += np.dot(dy, HID[t].T)
dWhh += np.dot(dh, HID[t-1].T)
dWxh += np.dot(dh, X[t].T)
dby += dy
dbh += dh
dhnext = np.dot(Whh.T, dh)
##=====================##
hprev = HID[-1]
smooth_loss = smooth_loss * 0.999 + loss * 0.001
for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients
for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
[dWxh, dWhh, dWhy, dbh, dby],
[mWxh, mWhh, mWhy, mbh, mby]):
mem += dparam * dparam
param += -alpha * dparam / np.sqrt(mem + 1e-8) # Adagrad
if i % display_iteration == 0:
print(str(i))
weights = [Wxh,Whh,Why,bh,by]
seed = inputs[np.random.randint(0,len(inputs))]
TXT = sample(HID[-1], seed, weights, sample_size)
print("-----------------------------------------------")
print(TXT)
print("-----------------------------------------------")
with open(direc + "\RNNout.txt", 'w') as writeFile:
writeFile.write(TXT)
ITER.append(i)
LOSS.append(loss)
p += batch_size
best_text = sample(HID[-1], inputs[0], weights, sample_size)
plt.plot(ITER, LOSS, linewidth = 1)
plt.show()
writeFile.close()
答案 0 :(得分:2)
对我而言,这看起来像一面红旗:
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
tf.nn.dynamic_rnn
的第二个输出是处理给定序列后的最新状态。看起来你明确忽略了它,而不是在sess.run(...)
的每次培训迭代中重新提供它(因此你的dynamic_rnn
没有initial_state
参数。 / p>
我强烈建议您在进一步查看之前更改代码的这一部分。
另外,我不知道你的数据是什么样的,但你的喂养和配料策略需要通过这整个状态传递练习来理解。否则,再一次,它只会产生胡言乱语。
答案 1 :(得分:0)
根据提供的信息,我建议这两个初步步骤尝试改进模型。
增加迭代次数,递归神经网络的工作方式与其他深层次结构不同,并且可能需要迭代次数的额外数量级才能解决。
玩种子:根据我的经验,为了得到有意义的序列,可能取决于所用种子的质量。