I have been running the code train.lua found here: https://github.com/karpathy/char-rnn/blob/master/train.lua
It is a character-based language prediction model built on an SRNN/LSTM. It ran perfectly well on OSX with the CPU until I tried to turn it into a word-by-word prediction model, i.e. the network predicts the next word instead of the next character. The vocabulary size (the number of possible outputs) jumps to 13320, and the number of parameters also grows to 39963. With LuaJIT I get the error message "not enough memory", and I have been looking for a solution. I found that LuaJIT's memory limit is discussed here: https://github.com/karpathy/char-rnn/issues/80
So I removed the LuaJIT-based Torch install and reinstalled it on top of plain Lua. However, Lua 5.1, 5.2 and 5.3 all fail the same way: I hit the same memory problem, and every time I run the training code it just prints "Killed: 9". Specifically, the problem appears when the "model_utils.clone_many_times" function in util/model_utils.lua creates T (sequence length, i.e. number of time steps) copies of the hidden layers (which all share the same weights).
In my case the function gets as far as cloning 7 hidden layers before the process is killed. I set rnn_size and batch_size to 1. Of course I want to run a much larger network, but the code fails even at this tiny size.
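For context, the out-of-memory happens in the cloning loop of train.lua, which looks roughly like the sketch below (paraphrased from the repository; the exact arguments may differ). Each prototype network is deep-copied opt.seq_length times, so memory grows linearly with the unrolling length, and every copy now carries the much larger 13320-way output layer:

-- Paraphrased from train.lua: one full copy of every prototype per time step,
-- so memory scales with opt.seq_length times the size of the network.
clones = {}
for name, proto in pairs(protos) do
    print('cloning ' .. name)
    clones[name] = model_utils.clone_many_times(proto, opt.seq_length)
end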
Update: here is the workaround I am working on.
The cloning step seems somewhat redundant, since it stores T copies of the hidden layers. Perhaps the function can be changed so that only the unit activations are carried through the T time steps, instead of cloning the entire layers. The only problem I see is backprop: the hidden-unit activations are carried over from batch to batch in the table init_state_global, so backpropagation needs to work across batches.
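Conceptually, the workaround boils down to two small helpers like the ones sketched below (the names are made up for illustration, and only tensor-valued outputs are handled; the full feval in the answer below inlines this logic and also handles table-valued outputs): snapshot every module's output right after a forward step, and write it back right before the matching backward step.

-- Minimal sketch of the idea, assuming `net` is the single network clone
-- (e.g. clones.rnn[1]); helper names are hypothetical.
local function snapshot_outputs(net)
    local snap = {}
    for i, m in ipairs(net.modules) do
        if torch.isTensor(m.output) then snap[i] = m.output:clone() end
    end
    return snap
end

local function restore_outputs(net, snap)
    for i, out in pairs(snap) do
        net.modules[i].output = out
    end
end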
Answer (score: 0):
Here is the workaround I found. Everything else stays the same, and I get results that are almost identical to the original ones, apart from some floating-point precision differences for some reason. It saves memory (seq_length no longer even affects memory usage). I set the number of clones in "model_utils.clone_many_times" to 1 (so we may not even need that memory-hungry function any more), and simply store the hidden-unit activations for backprop.
function feval(x)
    if x ~= params then
        params:copy(x)
    end
    grad_params:zero()
    ------------------ get minibatch -------------------
    local x, y = loader:next_batch(1)
    x, y = prepro(x, y) -- seq_length by batch_size tensor
    ------------------- forward pass -------------------
    local rnn_state = {[0] = init_state_global}
    local predictions = {} -- softmax outputs
    local loss = 0
    local hidden_units = {}
    for t = 1, opt.seq_length do
        clones.rnn[1]:training() -- make sure we are in correct mode (this is cheap, sets flag)
        local lst = clones.rnn[1]:forward{x[t], unpack(rnn_state[t-1])}
        rnn_state[t] = {}
        for i = 1, #init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
        -- snapshot the output of every module of the single clone at this time step,
        -- so it can be restored right before the matching backward step
        hidden_units[t] = {}
        local j = 1
        for k = 1, #clones.rnn[1].modules do
            if clones.rnn[1].modules[k].output then
                if not (type(clones.rnn[1].modules[k].output) == 'table') then
                    hidden_units[t][j] = clones.rnn[1].modules[k].output:clone()
                else
                    hidden_units[t][j] = {}
                    for l = 1, #clones.rnn[1].modules[k].output do
                        hidden_units[t][j][l] = clones.rnn[1].modules[k].output[l]:clone()
                    end
                end
                j = j + 1
            end
        end
        predictions[t] = lst[#lst] -- last element is the prediction
        loss = loss + clones.criterion[1]:forward(predictions[t], y[t])
    end
    loss = loss / opt.seq_length
    ------------------ backward pass -------------------
    -- initialize gradient at time t to be zeros (there's no influence from future)
    local drnn_state = {[opt.seq_length] = clone_list(init_state, true)} -- true also zeros the clones
    for t = opt.seq_length, 1, -1 do
        -- restore the activations saved at time step t, so the single clone
        -- looks exactly as it did right after its forward pass for this step
        local j = 1
        for k = 1, #clones.rnn[1].modules do
            if clones.rnn[1].modules[k].output then
                clones.rnn[1].modules[k].output = hidden_units[t][j]
                j = j + 1
            end
        end
        -- backprop through loss, and softmax/linear
        local doutput_t = clones.criterion[1]:backward(predictions[t], y[t])
        table.insert(drnn_state[t], doutput_t)
        local dlst = clones.rnn[1]:backward({x[t], unpack(rnn_state[t-1])}, drnn_state[t])
        drnn_state[t-1] = {}
        for k, v in pairs(dlst) do
            if k > 1 then -- k == 1 is gradient on x, which we dont need
                -- note we do k-1 because first item is dembeddings, and then follow the
                -- derivatives of the state, starting at index 2. I know...
                drnn_state[t-1][k-1] = v
            end
        end
    end
    ------------------------ misc ----------------------
    -- transfer final state to initial state (BPTT)
    init_state_global = rnn_state[#rnn_state] -- NOTE: I don't think this needs to be a clone, right?
    -- grad_params:div(opt.seq_length) -- this line should be here but since we use rmsprop it would have no effect. Removing for efficiency
    -- clip gradient element-wise
    -- Lets not clip gradient this time: grad_params:clamp(-opt.grad_clip, opt.grad_clip)
    return loss, grad_params
end
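Usage is unchanged: the training loop in train.lua keeps calling this feval through optim.rmsprop, along the lines of the paraphrased sketch below (option and field names taken from the original script):

-- Paraphrased from the training loop in train.lua:
local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
for i = 1, opt.max_epochs * loader.ntrain do
    local _, loss = optim.rmsprop(feval, params, optim_state)
    local train_loss = loss[1] -- loss[1] is the value feval returned for this minibatch
end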