Memory problems with RNN code under luajit / lua5.1 / lua5.2 / lua5.3

Asked: 2016-07-22 09:38:28

Tags: lua torch lstm recurrent-neural-network

I have been running a piece of code, train.lua, found here: https://github.com/karpathy/char-rnn/blob/master/train.lua

It is an RNN/LSTM-based character-level language-prediction model. It ran perfectly well on OSX using the CPU, until I tried to turn it into a word-by-word prediction model, i.e. the network predicts the next word rather than the next character. The vocabulary size (the number of possible outcomes) rose to 13,320 and the number of parameters grew to 39,963. With LuaJIT I got a "not enough memory" error and went looking for a solution; I found that LuaJIT's memory-limit problem is discussed here: https://github.com/karpathy/char-rnn/issues/80

So I removed Torch and reinstalled it with plain Lua. However, none of Lua 5.1, Lua 5.2, or Lua 5.3 works; I run into the same memory problem. Every time I run the training code it just says "Killed: 9". In particular, the problem occurs when I use the "model_utils.clone_many_times" function in util/model_utils.lua to create T (the sequence length, i.e. the number of time steps) copies of the hidden layers that share the same weights.
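
For context, this is roughly how train.lua builds the per-time-step clones (a simplified sketch; the table names follow the repository's train.lua, and the exact arguments may differ slightly):

-- simplified sketch of the cloning step in train.lua:
-- one clone of every module per time step, all sharing the same parameters
clones = {}
for name, proto in pairs(protos) do
    print('cloning ' .. name)
    clones[name] = model_utils.clone_many_times(proto, opt.seq_length)
end

With a 13,320-word softmax, each of these T clones keeps its own output buffers, which is presumably where the memory goes.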

In my case, the function gets as far as cloning 7 hidden layers and the process is killed at that point. I set rnn_size and batch_size to 1. Of course I want to run a much bigger network, but the code still fails even at this small size.

Update: this is the workaround I am working on.

The cloning step seems somewhat redundant, since it stores T full copies of the hidden layers. Perhaps we could change the function so that it only keeps the unit activations across the T time steps instead of the whole layers. The only problem I can see is the backward pass: the hidden-unit activation levels are carried from batch to batch through the table init_state_global, so we would need to work out backpropagation across multiple batches accordingly.
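
In pseudo-Lua, the idea looks roughly like this (a minimal sketch only; net stands for the single shared clone, and snapshot_module_outputs / restore_module_outputs are hypothetical helpers, not functions from the repository):

-- forward: run the one network T times, snapshotting its activations at every step
local saved = {}
for t = 1, T do
    predictions[t] = net:forward(inputs[t])
    saved[t] = snapshot_module_outputs(net)   -- hypothetical helper: clone every module's output
end

-- backward: walk time in reverse, restoring the activations of step t
-- before calling backward so the gradients are computed w.r.t. the right state
for t = T, 1, -1 do
    restore_module_outputs(net, saved[t])     -- hypothetical helper
    net:backward(inputs[t], grad_outputs[t])
end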

1 Answer:

Answer 0: (score: 0)

Here is the workaround I found. Everything else being the same, I get results almost identical to the original ones, except for some floating-point precision differences for some reason. It saves memory (seq_length does not even affect the memory footprint any more). I set the number of clones in the "model_utils.clone_many_times" function to 1 (so we may not even need this memory-hungry function any more) and simply store the hidden-unit activations for backprop.
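
The only other change is in the cloning step of train.lua, where the clone count drops from opt.seq_length to 1 (a sketch, assuming the cloning loop of the original train.lua):

-- before: clones[name] = model_utils.clone_many_times(proto, opt.seq_length)
-- after: a single clone that gets reused for every time step
clones[name] = model_utils.clone_many_times(proto, 1)

The modified feval then looks like this: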

function feval(x)
if x ~= params then
    params:copy(x)
end
grad_params:zero()

------------------ get minibatch -------------------
local x, y = loader:next_batch(1)
x,y = prepro(x,y) -- seq_length by batch_size tensor
------------------- forward pass -------------------
local rnn_state = {[0] = init_state_global}
local predictions = {}           -- softmax outputs
local loss = 0
local hidden_units = {}
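-- hidden_units[t] will hold a snapshot (clone) of every module's output at step t,
-- so that the single network clone clones.rnn[1] can be reused for all time steps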

for t=1,opt.seq_length do
    clones.rnn[1]:training() -- make sure we are in correct mode (this is cheap, sets flag)
    local lst = clones.rnn[1]:forward{x[t], unpack(rnn_state[t-1])}
    rnn_state[t] = {}
    for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
    hidden_units[t] = {}
    local j = 1
    for k = 1, #clones.rnn[1].modules do
        if clones.rnn[1].modules[k].output then
            if type(clones.rnn[1].modules[k].output) ~= 'table' then
                hidden_units[t][j] = clones.rnn[1].modules[k].output:clone()
            else
                hidden_units[t][j] = {}
                for l = 1, #clones.rnn[1].modules[k].output do
                    hidden_units[t][j][l] = clones.rnn[1].modules[k].output[l]:clone()
                end
            end
            j = j + 1
        end
    end

    predictions[t] = lst[#lst] -- last element is the prediction
    loss = loss + clones.criterion[1]:forward(predictions[t], y[t])
end
loss = loss / opt.seq_length

------------------ backward pass -------------------
-- initialize gradient at time t to be zeros (there's no influence from future)
local drnn_state = {[opt.seq_length] = clone_list(init_state, true)} -- true also zeros the clones
for t=opt.seq_length,1,-1 do
    -- backprop through loss, and softmax/linear
    local j = 1
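    -- restore the module outputs that were saved at step t during the forward pass,
    -- so the single clone's backward() sees the activations of this time step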

    for k = 1, #clones.rnn[1].modules do
         if clones.rnn[1].modules[k].output then
            clones.rnn[1].modules[k].output = hidden_units[t][j]
            j = j+1
         end
    end

    local doutput_t = clones.criterion[1]:backward(predictions[t], y[t])
    table.insert(drnn_state[t], doutput_t)
    local dlst = clones.rnn[1]:backward({x[t], unpack(rnn_state[t-1])}, drnn_state[t])
    drnn_state[t-1] = {}
    for k,v in pairs(dlst) do
        if k > 1 then -- k == 1 is gradient on x, which we don't need
            -- note we do k-1 because first item is dembeddings, and then follow the 
            -- derivatives of the state, starting at index 2. I know...
            drnn_state[t-1][k-1] = v
        end
    end
end
------------------------ misc ----------------------
-- transfer final state to initial state (BPTT)
init_state_global = rnn_state[#rnn_state] -- NOTE: I don't think this needs to be a clone, right?
-- grad_params:div(opt.seq_length) -- this line should be here but since we use rmsprop it would have no effect. Removing for efficiency
-- clip gradient element-wise
-- Let's not clip the gradient this time: grad_params:clamp(-opt.grad_clip, opt.grad_clip)
return loss, grad_params
end
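
For completeness, nothing changes on the optimizer side; feval is still handed to rmsprop as in the original train.lua (a sketch of that call, assuming the original training loop):

-- in the main training loop of train.lua
local _, loss = optim.rmsprop(feval, params, optim_state)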