Memory problems with RNN code under luajit / lua5.1 / lua5.2 / lua5.3

Asked: 2016-07-22 09:38:28

Tags: lua torch lstm recurrent-neural-network

I have been running a piece of code, train.lua, found here: https://github.com/karpathy/char-rnn/blob/master/train.lua

It is an RNN/LSTM-based character-level language-prediction model. It ran perfectly well on OSX using the CPU, until I tried to turn it into a word-by-word prediction model, i.e. the network predicts the next word rather than the next character. The vocabulary size (the number of possible outcomes) rose to 13,320 and the number of parameters grew to 39,963. With LuaJIT I got a "not enough memory" error and went looking for a solution; I found that LuaJIT's memory-limit problem is discussed here: https://github.com/karpathy/char-rnn/issues/80

So I removed Torch and reinstalled it with plain Lua. However, none of Lua 5.1, Lua 5.2, or Lua 5.3 works; I run into the same memory problem. Every time I run the training code it just says "Killed: 9". In particular, the problem occurs when I use the "model_utils.clone_many_times" function in util/model_utils.lua to create T (the sequence length, i.e. the number of time steps) copies of the hidden layers that share the same weights.
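
For context, this is roughly how train.lua builds the per-time-step clones (a simplified sketch; the table names follow the repository's train.lua, and the exact arguments may differ slightly):

-- simplified sketch of the cloning step in train.lua:
-- one clone of every module per time step, all sharing the same parameters
clones = {}
for name, proto in pairs(protos) do
    print('cloning ' .. name)
    clones[name] = model_utils.clone_many_times(proto, opt.seq_length)
end

With a 13,320-word softmax, each of these T clones keeps its own output buffers, which is presumably where the memory goes.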

In my case, the function gets as far as cloning 7 hidden layers and the process is killed at that point. I set rnn_size and batch_size to 1. Of course I want to run a much bigger network, but the code still fails even at this small size.

Update: this is the workaround I am working on.

The cloning step seems somewhat redundant, since it stores T full copies of the hidden layers. Perhaps we could change the function so that it only keeps the unit activations across the T time steps instead of the whole layers. The only problem I can see is the backward pass: the hidden-unit activation levels are carried from batch to batch through the table init_state_global, so we would need to work out backpropagation across multiple batches accordingly.
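
In pseudo-Lua, the idea looks roughly like this (a minimal sketch only; net stands for the single shared clone, and snapshot_module_outputs / restore_module_outputs are hypothetical helpers, not functions from the repository):

-- forward: run the one network T times, snapshotting its activations at every step
local saved = {}
for t = 1, T do
    predictions[t] = net:forward(inputs[t])
    saved[t] = snapshot_module_outputs(net)   -- hypothetical helper: clone every module's output
end

-- backward: walk time in reverse, restoring the activations of step t
-- before calling backward so the gradients are computed w.r.t. the right state
for t = T, 1, -1 do
    restore_module_outputs(net, saved[t])     -- hypothetical helper
    net:backward(inputs[t], grad_outputs[t])
end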

1 Answer:

Answer 0: (score: 0)

Here is the workaround I found. Everything else being the same, I get results almost identical to the original ones, except for some floating-point precision differences for some reason. It saves memory (seq_length does not even affect the memory footprint any more). I set the number of clones in the "model_utils.clone_many_times" function to 1 (so we may not even need this memory-hungry function any more) and simply store the hidden-unit activations for backprop.
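
The only other change is in the cloning step of train.lua, where the clone count drops from opt.seq_length to 1 (a sketch, assuming the cloning loop of the original train.lua):

-- before: clones[name] = model_utils.clone_many_times(proto, opt.seq_length)
-- after: a single clone that gets reused for every time step
clones[name] = model_utils.clone_many_times(proto, 1)

The modified feval then looks like this: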

function feval(x)
if x ~= params then
    params:copy(x)
end
grad_params:zero()

------------------ get minibatch -------------------
local x, y = loader:next_batch(1)
x,y = prepro(x,y) -- seq_length by batch_size tensor
------------------- forward pass -------------------
local rnn_state = {[0] = init_state_global}
local predictions = {}           -- softmax outputs
local loss = 0
local hidden_units = {}
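-- hidden_units[t] will hold a snapshot (clone) of every module's output at step t,
-- so that the single network clone clones.rnn[1] can be reused for all time steps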

for t=1,opt.seq_length do
    clones.rnn[1]:training() -- make sure we are in correct mode (this is cheap, sets flag)
    local lst = clones.rnn[1]:forward{x[t], unpack(rnn_state[t-1])}
    rnn_state[t] = {}
    for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
    hidden_units[t] = {}
    local j = 1
    for k = 1, #clones.rnn[1].modules do
        if clones.rnn[1].modules[k].output then
            if type(clones.rnn[1].modules[k].output) ~= 'table' then
                hidden_units[t][j] = clones.rnn[1].modules[k].output:clone()
            else
                hidden_units[t][j] = {}
                for l = 1, #clones.rnn[1].modules[k].output do
                    hidden_units[t][j][l] = clones.rnn[1].modules[k].output[l]:clone()
                end
            end
            j = j + 1
        end
    end

    predictions[t] = lst[#lst] -- last element is the prediction
    loss = loss + clones.criterion[1]:forward(predictions[t], y[t])
end
loss = loss / opt.seq_length

------------------ backward pass -------------------
-- initialize gradient at time t to be zeros (there's no influence from future)
local drnn_state = {[opt.seq_length] = clone_list(init_state, true)} -- true also zeros the clones
for t=opt.seq_length,1,-1 do
    -- backprop through loss, and softmax/linear
    local j = 1
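    -- restore the module outputs that were saved at step t during the forward pass,
    -- so the single clone's backward() sees the activations of this time step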

    for k = 1, #clones.rnn[1].modules do
         if clones.rnn[1].modules[k].output then
            clones.rnn[1].modules[k].output = hidden_units[t][j]
            j = j+1
         end
    end

    local doutput_t = clones.criterion[1]:backward(predictions[t], y[t])
    table.insert(drnn_state[t], doutput_t)
    local dlst = clones.rnn[1]:backward({x[t], unpack(rnn_state[t-1])}, drnn_state[t])
    drnn_state[t-1] = {}
    for k,v in pairs(dlst) do
        if k > 1 then -- k == 1 is gradient on x, which we don't need
            -- note we do k-1 because first item is dembeddings, and then follow the 
            -- derivatives of the state, starting at index 2. I know...
            drnn_state[t-1][k-1] = v
        end
    end
end
------------------------ misc ----------------------
-- transfer final state to initial state (BPTT)
init_state_global = rnn_state[#rnn_state] -- NOTE: I don't think this needs to be a clone, right?
-- grad_params:div(opt.seq_length) -- this line should be here but since we use rmsprop it would have no effect. Removing for efficiency
-- clip gradient element-wise
-- Let's not clip the gradient this time: grad_params:clamp(-opt.grad_clip, opt.grad_clip)
return loss, grad_params
end
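
For completeness, nothing changes on the optimizer side; feval is still handed to rmsprop as in the original train.lua (a sketch of that call, assuming the original training loop):

-- in the main training loop of train.lua
local _, loss = optim.rmsprop(feval, params, optim_state)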