Caffe aborts training

Date: 2018-04-27 23:34:02

Tags: caffe conv-neural-network training-data lmdb

I am trying to train a Caffe model from scratch (inside Docker).

pwd:

root@982adaaca24f:~/sharedfolder/caffe/docker/image/happyNet# 

Relevant file paths:

models/
      Custom_Model/
                  deploy.prototxt
                  solver.prototxt
                  train.prototxt
datasets/
        training_set_lmdb/
                         data.mdb (5.01 GB)
                         lock.mdb
        validation_set_lmdb/
                         data.mdb (163.8 GB)
                         lock.mdb

For this I am running:

#~/caffe/build/tools/caffe train -solver models/Custom_Model/solver.prototxt

But after the net is initialized from parameters, the mean file is loaded, the LMDB datasets are opened, and everything is set up, the process gets killed:

    I0428 22:51:03.340870    59 caffe.cpp:178] Use CPU.
    I0428 22:51:03.343197    59 solver.cpp:48] Initializing solver from parameters: 
    test_iter: 1
    test_interval: 20
    base_lr: 0.001
    display: 10
    max_iter: 3000
    lr_policy: "fixed"
    momentum: 0.9
    snapshot: 100
    snapshot_prefix: "snapshot"
    solver_mode: CPU
    net: "models/Custom_Model/train.prototxt"
    momentum2: 0.999
    type: "Adam"
    I0428 22:51:03.348469    59 solver.cpp:91] Creating training net from net file: models/Custom_Model/train.prototxt
    I0428 22:51:03.351524    59 upgrade_proto.cpp:52] Attempting to upgrade input file specified using deprecated V1LayerParameter: models/Custom_Model/train.prototxt
    I0428 22:51:03.352391    59 upgrade_proto.cpp:60] Successfully upgraded file specified using deprecated V1LayerParameter
    I0428 22:51:03.353207    59 net.cpp:313] The NetState phase (0) differed from the phase (1) specified by a rule in layer training_test
    I0428 22:51:03.353914    59 net.cpp:49] Initializing net from parameters: 
    name: "CaffeNet"
    state {
      phase: TRAIN
    }
    layer {
      name: "training_train"
      type: "Data"
      top: "data"
      top: "label"
      include {
        phase: TRAIN
      }
      transform_param {
        mean_file: "datasets/mean_training_image.binaryproto"
      }
      data_param {
        source: "datasets/training_set_lmdb"
        batch_size: 400
        backend: LMDB
      }
    }
    layer {
      name: "conv1"
      type: "Convolution"
      bottom: "data"
      top: "conv1"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 96
        kernel_size: 7
        stride: 2
      }
    }
    layer {
      name: "relu1"
      type: "ReLU"
      bottom: "conv1"
      top: "conv1"
    }
    layer {
      name: "norm1"
      type: "LRN"
      bottom: "conv1"
      top: "norm1"
      lrn_param {
        local_size: 5
        alpha: 0.0005
        beta: 0.75
      }
    }
    layer {
      name: "pool1"
      type: "Pooling"
      bottom: "norm1"
      top: "pool1"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "conv2"
      type: "Convolution"
      bottom: "pool1"
      top: "conv2"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 256
        pad: 2
        kernel_size: 5
      }
    }
    layer {
      name: "relu2"
      type: "ReLU"
      bottom: "conv2"
      top: "conv2"
    }
    layer {
      name: "pool2"
      type: "Pooling"
      bottom: "conv2"
      top: "pool2"
      pooling_param {
        pool: MAX
        kernel_size: 2
        stride: 2
      }
    }
    layer {
      name: "conv3"
      type: "Convolution"
      bottom: "pool2"
      top: "conv3"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu3"
      type: "ReLU"
      bottom: "conv3"
      top: "conv3"
    }
    layer {
      name: "conv4"
      type: "Convolution"
      bottom: "conv3"
      top: "conv4"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu4"
      type: "ReLU"
      bottom: "conv4"
      top: "conv4"
    }
    layer {
      name: "conv5"
      type: "Convolution"
      bottom: "conv4"
      top: "conv5"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu5"
      type: "ReLU"
      bottom: "conv5"
      top: "conv5"
    }
    layer {
      name: "pool5"
      type: "Pooling"
      bottom: "conv5"
      top: "pool5"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "fc6"
      type: "InnerProduct"
      bottom: "pool5"
      top: "fc6"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu6"
      type: "ReLU"
      bottom: "fc6"
      top: "fc6"
    }
    layer {
      name: "drop6"
      type: "Dropout"
      bottom: "fc6"
      top: "fc6"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc7"
      type: "InnerProduct"
      bottom: "fc6"
      top: "fc7"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu7"
      type: "ReLU"
      bottom: "fc7"
      top: "fc7"
    }
    layer {
      name: "drop7"
      type: "Dropout"
      bottom: "fc7"
      top: "fc7"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc8_cat"
      type: "InnerProduct"
      bottom: "fc7"
      top: "fc8"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 6
      }
    }
    layer {
      name: "prob"
      type: "SoftmaxWithLoss"
      bottom: "fc8"
      bottom: "label"
    }
    I0428 22:51:03.356101    59 layer_factory.hpp:77] Creating layer training_train
    I0428 22:51:03.357806    59 net.cpp:91] Creating Layer training_train
    I0428 22:51:03.357897    59 net.cpp:399] training_train -> data
    I0428 22:51:03.359665    59 net.cpp:399] training_train -> label
    I0428 22:51:03.359840    59 data_transformer.cpp:25] Loading mean file from: datasets/mean_training_image.binaryproto
    I0428 22:51:03.376284    61 db_lmdb.cpp:35] Opened lmdb datasets/training_set_lmdb
    I0428 22:51:03.380998    59 data_layer.cpp:41] output data size: 400,3,224,224
    I0428 22:51:04.102387    59 net.cpp:141] Setting up training_train
    I0428 22:51:04.102494    59 net.cpp:148] Top shape: 400 3 224 224 (60211200)
    I0428 22:51:04.102694    59 net.cpp:148] Top shape: 400 (400)
    I0428 22:51:04.104347    59 net.cpp:156] Memory required for data: 240846400
    I0428 22:51:04.105435    59 layer_factory.hpp:77] Creating layer conv1
    I0428 22:51:04.107542    59 net.cpp:91] Creating Layer conv1
    I0428 22:51:04.108368    59 net.cpp:425] conv1 <- data
    I0428 22:51:04.109095    59 net.cpp:399] conv1 -> conv1
    I0428 22:51:04.109275    59 net.cpp:141] Setting up conv1
    I0428 22:51:04.109341    59 net.cpp:148] Top shape: 400 96 109 109 (456230400)
    I0428 22:51:04.109398    59 net.cpp:156] Memory required for data: 2065768000
    I0428 22:51:04.109553    59 layer_factory.hpp:77] Creating layer relu1
    I0428 22:51:04.109599    59 net.cpp:91] Creating Layer relu1
    I0428 22:51:04.109633    59 net.cpp:425] relu1 <- conv1
    I0428 22:51:04.109670    59 net.cpp:386] relu1 -> conv1 (in-place)
    I0428 22:51:04.110841    59 net.cpp:141] Setting up relu1
    I0428 22:51:04.111608    59 net.cpp:148] Top shape: 400 96 109 109 (456230400)
    I0428 22:51:04.111649    59 net.cpp:156] Memory required for data: 3890689600
    I0428 22:51:04.111726    59 layer_factory.hpp:77] Creating layer norm1
    I0428 22:51:04.111804    59 net.cpp:91] Creating Layer norm1
    I0428 22:51:04.111929    59 net.cpp:425] norm1 <- conv1
    I0428 22:51:04.111969    59 net.cpp:399] norm1 -> norm1
    I0428 22:51:04.112043    59 net.cpp:141] Setting up norm1
    I0428 22:51:04.112100    59 net.cpp:148] Top shape: 400 96 109 109 (456230400)
    I0428 22:51:04.112149    59 net.cpp:156] Memory required for data: 5715611200
    I0428 22:51:04.112201    59 layer_factory.hpp:77] Creating layer pool1
    I0428 22:51:04.112262    59 net.cpp:91] Creating Layer pool1
    I0428 22:51:04.112313    59 net.cpp:425] pool1 <- norm1
    I0428 22:51:04.112367    59 net.cpp:399] pool1 -> pool1
    I0428 22:51:04.112658    59 net.cpp:141] Setting up pool1
    I0428 22:51:04.112794    59 net.cpp:148] Top shape: 400 96 37 37 (52569600)
    I0428 22:51:04.112848    59 net.cpp:156] Memory required for data: 5925889600
    I0428 22:51:04.112884    59 layer_factory.hpp:77] Creating layer conv2
    I0428 22:51:04.112972    59 net.cpp:91] Creating Layer conv2
    I0428 22:51:04.113026    59 net.cpp:425] conv2 <- pool1
    I0428 22:51:04.113488    59 net.cpp:399] conv2 -> conv2
    I0428 22:51:04.115536    59 net.cpp:141] Setting up conv2
    I0428 22:51:04.115640    59 net.cpp:148] Top shape: 400 256 37 37 (140185600)
    I0428 22:51:04.115696    59 net.cpp:156] Memory required for data: 6486632000
    I0428 22:51:04.115751    59 layer_factory.hpp:77] Creating layer relu2
    I0428 22:51:04.115788    59 net.cpp:91] Creating Layer relu2
    I0428 22:51:04.115888    59 net.cpp:425] relu2 <- conv2
    I0428 22:51:04.115939    59 net.cpp:386] relu2 -> conv2 (in-place)
    I0428 22:51:04.116014    59 net.cpp:141] Setting up relu2
    I0428 22:51:04.116051    59 net.cpp:148] Top shape: 400 256 37 37 (140185600)
    I0428 22:51:04.116106    59 net.cpp:156] Memory required for data: 7047374400
    I0428 22:51:04.116142    59 layer_factory.hpp:77] Creating layer pool2
    I0428 22:51:04.116181    59 net.cpp:91] Creating Layer pool2
    I0428 22:51:04.116235    59 net.cpp:425] pool2 <- conv2
    I0428 22:51:04.116294    59 net.cpp:399] pool2 -> pool2
    I0428 22:51:04.116364    59 net.cpp:141] Setting up pool2
    I0428 22:51:04.116492    59 net.cpp:148] Top shape: 400 256 19 19 (36966400)
    I0428 22:51:04.116545    59 net.cpp:156] Memory required for data: 7195240000
    I0428 22:51:04.116581    59 layer_factory.hpp:77] Creating layer conv3
    I0428 22:51:04.116639    59 net.cpp:91] Creating Layer conv3
    I0428 22:51:04.116670    59 net.cpp:425] conv3 <- pool2
    I0428 22:51:04.116727    59 net.cpp:399] conv3 -> conv3
    I0428 22:51:04.134765    59 net.cpp:141] Setting up conv3
    I0428 22:51:04.134871    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.134928    59 net.cpp:156] Memory required for data: 7490971200
    I0428 22:51:04.135994    59 layer_factory.hpp:77] Creating layer relu3
    I0428 22:51:04.136255    59 net.cpp:91] Creating Layer relu3
    I0428 22:51:04.136296    59 net.cpp:425] relu3 <- conv3
    I0428 22:51:04.136435    59 net.cpp:386] relu3 -> conv3 (in-place)
    I0428 22:51:04.137774    59 net.cpp:141] Setting up relu3
    I0428 22:51:04.139025    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.139958    59 net.cpp:156] Memory required for data: 7786702400
    I0428 22:51:04.140475    59 layer_factory.hpp:77] Creating layer conv4
    I0428 22:51:04.141017    59 net.cpp:91] Creating Layer conv4
    I0428 22:51:04.141383    59 net.cpp:425] conv4 <- conv3
    I0428 22:51:04.141641    59 net.cpp:399] conv4 -> conv4
    I0428 22:51:04.165778    59 net.cpp:141] Setting up conv4
    I0428 22:51:04.165900    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.165962    59 net.cpp:156] Memory required for data: 8082433600
    I0428 22:51:04.168637    59 layer_factory.hpp:77] Creating layer relu4
    I0428 22:51:04.171306    59 net.cpp:91] Creating Layer relu4
    I0428 22:51:04.171368    59 net.cpp:425] relu4 <- conv4
    I0428 22:51:04.171439    59 net.cpp:386] relu4 -> conv4 (in-place)
    I0428 22:51:04.175688    59 net.cpp:141] Setting up relu4
    I0428 22:51:04.175788    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.175819    59 net.cpp:156] Memory required for data: 8378164800
    I0428 22:51:04.175881    59 layer_factory.hpp:77] Creating layer conv5
    I0428 22:51:04.175940    59 net.cpp:91] Creating Layer conv5
    I0428 22:51:04.175971    59 net.cpp:425] conv5 <- conv4
    I0428 22:51:04.176026    59 net.cpp:399] conv5 -> conv5
    I0428 22:51:04.194139    59 net.cpp:141] Setting up conv5
    I0428 22:51:04.194244    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.196287    59 net.cpp:156] Memory required for data: 8673896000
    I0428 22:51:04.201050    59 layer_factory.hpp:77] Creating layer relu5
    I0428 22:51:04.201668    59 net.cpp:91] Creating Layer relu5
    I0428 22:51:04.206367    59 net.cpp:425] relu5 <- conv5
    I0428 22:51:04.206445    59 net.cpp:386] relu5 -> conv5 (in-place)
    I0428 22:51:04.208932    59 net.cpp:141] Setting up relu5
    I0428 22:51:04.209012    59 net.cpp:148] Top shape: 400 512 19 19 (73932800)
    I0428 22:51:04.209039    59 net.cpp:156] Memory required for data: 8969627200
    I0428 22:51:04.209074    59 layer_factory.hpp:77] Creating layer pool5
    I0428 22:51:04.209153    59 net.cpp:91] Creating Layer pool5
    I0428 22:51:04.209192    59 net.cpp:425] pool5 <- conv5
    I0428 22:51:04.210391    59 net.cpp:399] pool5 -> pool5
    I0428 22:51:04.211598    59 net.cpp:141] Setting up pool5
    I0428 22:51:04.216861    59 net.cpp:148] Top shape: 400 512 7 7 (10035200)
    I0428 22:51:04.217041    59 net.cpp:156] Memory required for data: 9009768000
    I0428 22:51:04.217103    59 layer_factory.hpp:77] Creating layer fc6
    I0428 22:51:04.219173    59 net.cpp:91] Creating Layer fc6
    I0428 22:51:04.219277    59 net.cpp:425] fc6 <- pool5
    I0428 22:51:04.219324    59 net.cpp:399] fc6 -> fc6
    I0428 22:51:04.773458    59 net.cpp:141] Setting up fc6
    I0428 22:51:04.777616    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.778857    59 net.cpp:156] Memory required for data: 9016244800
    I0428 22:51:04.781023    59 layer_factory.hpp:77] Creating layer relu6
    I0428 22:51:04.784178    59 net.cpp:91] Creating Layer relu6
    I0428 22:51:04.788236    59 net.cpp:425] relu6 <- fc6
    I0428 22:51:04.790361    59 net.cpp:386] relu6 -> fc6 (in-place)
    I0428 22:51:04.792532    59 net.cpp:141] Setting up relu6
    I0428 22:51:04.792620    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.792671    59 net.cpp:156] Memory required for data: 9022721600
    I0428 22:51:04.792724    59 layer_factory.hpp:77] Creating layer drop6
    I0428 22:51:04.792795    59 net.cpp:91] Creating Layer drop6
    I0428 22:51:04.793380    59 net.cpp:425] drop6 <- fc6
    I0428 22:51:04.793471    59 net.cpp:386] drop6 -> fc6 (in-place)
    I0428 22:51:04.794314    59 net.cpp:141] Setting up drop6
    I0428 22:51:04.795964    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.796800    59 net.cpp:156] Memory required for data: 9029198400
    I0428 22:51:04.797582    59 layer_factory.hpp:77] Creating layer fc7
    I0428 22:51:04.797665    59 net.cpp:91] Creating Layer fc7
    I0428 22:51:04.798545    59 net.cpp:425] fc7 <- fc6
    I0428 22:51:04.798630    59 net.cpp:399] fc7 -> fc7
    I0428 22:51:04.828491    62 blocking_queue.cpp:50] Waiting for data
    I0428 22:51:04.880416    59 net.cpp:141] Setting up fc7
    I0428 22:51:04.880659    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.880733    59 net.cpp:156] Memory required for data: 9035675200
    I0428 22:51:04.880820    59 layer_factory.hpp:77] Creating layer relu7
    I0428 22:51:04.880908    59 net.cpp:91] Creating Layer relu7
    I0428 22:51:04.880982    59 net.cpp:425] relu7 <- fc7
    I0428 22:51:04.881057    59 net.cpp:386] relu7 -> fc7 (in-place)
    I0428 22:51:04.881140    59 net.cpp:141] Setting up relu7
    I0428 22:51:04.881214    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.881286    59 net.cpp:156] Memory required for data: 9042152000
    I0428 22:51:04.881357    59 layer_factory.hpp:77] Creating layer drop7
    I0428 22:51:04.881438    59 net.cpp:91] Creating Layer drop7
    I0428 22:51:04.881507    59 net.cpp:425] drop7 <- fc7
    I0428 22:51:04.881594    59 net.cpp:386] drop7 -> fc7 (in-place)
    I0428 22:51:04.881676    59 net.cpp:141] Setting up drop7
    I0428 22:51:04.881752    59 net.cpp:148] Top shape: 400 4048 (1619200)
    I0428 22:51:04.881820    59 net.cpp:156] Memory required for data: 9048628800
    I0428 22:51:04.881891    59 layer_factory.hpp:77] Creating layer fc8_cat
    I0428 22:51:04.881965    59 net.cpp:91] Creating Layer fc8_cat
    I0428 22:51:04.882040    59 net.cpp:425] fc8_cat <- fc7
    I0428 22:51:04.882113    59 net.cpp:399] fc8_cat -> fc8
    I0428 22:51:04.882292    59 net.cpp:141] Setting up fc8_cat
    I0428 22:51:04.882369    59 net.cpp:148] Top shape: 400 6 (2400)
    I0428 22:51:04.882429    59 net.cpp:156] Memory required for data: 9048638400
    I0428 22:51:04.882500    59 layer_factory.hpp:77] Creating layer prob
    I0428 22:51:04.882591    59 net.cpp:91] Creating Layer prob
    I0428 22:51:04.882678    59 net.cpp:425] prob <- fc8
    I0428 22:51:04.886852    59 net.cpp:425] prob <- label
    I0428 22:51:04.886905    59 net.cpp:399] prob -> (automatic)
    I0428 22:51:04.887187    59 layer_factory.hpp:77] Creating layer prob
    I0428 22:51:04.888458    59 net.cpp:141] Setting up prob
    I0428 22:51:04.888552    59 net.cpp:148] Top shape: (1)
    I0428 22:51:04.888584    59 net.cpp:151]     with loss weight 1
    I0428 22:51:04.888667    59 net.cpp:156] Memory required for data: 9048638404
    I0428 22:51:04.888703    59 net.cpp:217] prob needs backward computation.
    I0428 22:51:04.888746    59 net.cpp:217] fc8_cat needs backward computation.
    I0428 22:51:04.888803    59 net.cpp:217] drop7 needs backward computation.
    I0428 22:51:04.888860    59 net.cpp:217] relu7 needs backward computation.
    I0428 22:51:04.888916    59 net.cpp:217] fc7 needs backward computation.
    I0428 22:51:04.888969    59 net.cpp:217] drop6 needs backward computation.
    I0428 22:51:04.889027    59 net.cpp:217] relu6 needs backward computation.
    I0428 22:51:04.889086    59 net.cpp:217] fc6 needs backward computation.
   (...)
    I0428 22:51:04.896559    59 net.cpp:274] Network initialization done.
    I0428 22:51:04.908800    59 upgrade_proto.cpp:52] Attempting to upgrade input file specified using deprecated V1LayerParameter: models/Custom_Model/train.prototxt
    I0428 22:51:04.909487    59 upgrade_proto.cpp:60] Successfully upgraded file specified using deprecated V1LayerParameter
    I0428 22:51:04.910534    59 solver.cpp:181] Creating test net (#0) specified by net file: models/Custom_Model/train.prototxt
    I0428 22:51:04.910686    59 net.cpp:313] The NetState phase (1) differed from the phase (0) specified by a rule in layer training_train
    I0428 22:51:04.912101    59 net.cpp:49] Initializing net from parameters: 
    name: "CaffeNet"
    state {
      phase: TEST
    }
    layer {
      name: "training_test"
      type: "Data"
      top: "data"
      top: "label"
      include {
        phase: TEST
      }
      transform_param {
        mean_file: "datasets/mean_training_image.binaryproto"
      }
      data_param {
        source: "datasets/validation_set_lmdb"
        batch_size: 14
        backend: LMDB
      }
    }
    layer {
      name: "conv1"
      type: "Convolution"
      bottom: "data"
      top: "conv1"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 96
        kernel_size: 7
        stride: 2
      }
    }
    layer {
      name: "relu1"
      type: "ReLU"
      bottom: "conv1"
      top: "conv1"
    }
    layer {
      name: "norm1"
      type: "LRN"
      bottom: "conv1"
      top: "norm1"
      lrn_param {
        local_size: 5
        alpha: 0.0005
        beta: 0.75
      }
    }
    layer {
      name: "pool1"
      type: "Pooling"
      bottom: "norm1"
      top: "pool1"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "conv2"
      type: "Convolution"
      bottom: "pool1"
      top: "conv2"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 256
        pad: 2
        kernel_size: 5
      }
    }
    layer {
      name: "relu2"
      type: "ReLU"
      bottom: "conv2"
      top: "conv2"
    }
    layer {
      name: "pool2"
      type: "Pooling"
      bottom: "conv2"
      top: "pool2"
      pooling_param {
        pool: MAX
        kernel_size: 2
        stride: 2
      }
    }
    layer {
      name: "conv3"
      type: "Convolution"
      bottom: "pool2"
      top: "conv3"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu3"
      type: "ReLU"
      bottom: "conv3"
      top: "conv3"
    }
    layer {
      name: "conv4"
      type: "Convolution"
      bottom: "conv3"
      top: "conv4"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu4"
      type: "ReLU"
      bottom: "conv4"
      top: "conv4"
    }
    layer {
      name: "conv5"
      type: "Convolution"
      bottom: "conv4"
      top: "conv5"
      param {
        lr_mult: 0
      }
      param {
        lr_mult: 0
      }
      convolution_param {
        num_output: 512
        pad: 1
        kernel_size: 3
      }
    }
    layer {
      name: "relu5"
      type: "ReLU"
      bottom: "conv5"
      top: "conv5"
    }
    layer {
      name: "pool5"
      type: "Pooling"
      bottom: "conv5"
      top: "pool5"
      pooling_param {
        pool: MAX
        kernel_size: 3
        stride: 3
      }
    }
    layer {
      name: "fc6"
      type: "InnerProduct"
      bottom: "pool5"
      top: "fc6"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu6"
      type: "ReLU"
      bottom: "fc6"
      top: "fc6"
    }
    layer {
      name: "drop6"
      type: "Dropout"
      bottom: "fc6"
      top: "fc6"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc7"
      type: "InnerProduct"
      bottom: "fc6"
      top: "fc7"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 4048
      }
    }
    layer {
      name: "relu7"
      type: "ReLU"
      bottom: "fc7"
      top: "fc7"
    }
    layer {
      name: "drop7"
      type: "Dropout"
      bottom: "fc7"
      top: "fc7"
      dropout_param {
        dropout_ratio: 0.5
      }
    }
    layer {
      name: "fc8_cat"
      type: "InnerProduct"
      bottom: "fc7"
      top: "fc8"
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 1
      }
      inner_product_param {
        num_output: 6
      }
    }
    layer {
      name: "prob"
      type: "SoftmaxWithLoss"
      bottom: "fc8"
      bottom: "label"
    }
    I0428 22:51:04.915211    59 layer_factory.hpp:77] Creating layer training_test
    I0428 22:51:04.916718    59 net.cpp:91] Creating Layer training_test
    I0428 22:51:04.916820    59 net.cpp:399] training_test -> data
    I0428 22:51:04.916895    59 net.cpp:399] training_test -> label
    I0428 22:51:04.916968    59 data_transformer.cpp:25] Loading mean file from: datasets/mean_training_image.binaryproto
    I0428 22:51:04.957635    63 db_lmdb.cpp:35] Opened lmdb datasets/validation_set_lmdb
    I0428 22:51:04.966471    59 data_layer.cpp:41] output data size: 14,3,224,224
    I0428 22:51:04.986405    59 net.cpp:141] Setting up training_test
    I0428 22:51:04.987761    59 net.cpp:148] Top shape: 14 3 224 224 (2107392)
    I0428 22:51:04.988591    59 net.cpp:148] Top shape: 14 (14)
    I0428 22:51:04.988828    59 net.cpp:156] Memory required for data: 8429624
    I0428 22:51:04.991192    59 layer_factory.hpp:77] Creating layer conv1
    I0428 22:51:04.992264    59 net.cpp:91] Creating Layer conv1
    I0428 22:51:04.992722    59 net.cpp:425] conv1 <- data
    I0428 22:51:04.993867    59 net.cpp:399] conv1 -> conv1
    I0428 22:51:04.994596    59 net.cpp:141] Setting up conv1
   (...)
    I0428 22:51:05.945319    59 net.cpp:274] Network initialization done.
    I0428 22:51:05.946696    59 solver.cpp:60] Solver scaffolding done.
    I0428 22:51:05.948148    59 caffe.cpp:219] Starting Optimization
    I0428 22:51:05.948653    59 solver.cpp:279] Solving CaffeNet
    I0428 22:51:05.949687    59 solver.cpp:280] Learning Rate Policy: fixed
    I0428 22:51:10.701836    59 solver.cpp:337] Iteration 0, Testing net (#0)
    I0428 22:51:10.705909    59 net.cpp:684] Ignoring source layer training_train
    Killed

Can anyone tell me what I am doing wrong here?

Edit: As a side note for future reference, the accepted answer solved the problem, but training then aborted at the first snapshot.

Apparently memory was still an issue; to make it work I had to add this line to solver.prototxt, probably due to the size of data.mdb (5.01 GB):

snapshot_format: HDF5

Related: https://github.com/BVLC/caffe/pull/2836

Then everything worked.
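For reference, here is a sketch of what the resulting solver.prototxt would look like, reconstructed from the solver parameters printed in the log above; only the snapshot_format line is new:

    net: "models/Custom_Model/train.prototxt"
    type: "Adam"
    base_lr: 0.001
    lr_policy: "fixed"
    momentum: 0.9
    momentum2: 0.999
    test_iter: 1
    test_interval: 20
    display: 10
    max_iter: 3000
    snapshot: 100
    snapshot_prefix: "snapshot"
    snapshot_format: HDF5    # the added line: HDF5 snapshots sidestep protobuf's 2 GB serialization limit (see the PR linked above)
    solver_mode: CPU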

1 Answer:

Answer 0 (score: 1)

You are using solver_mode: CPU — did you check the CPU memory utilization when this training started?
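A plain `Killed` in the log, as above, usually means the Linux OOM killer terminated the process. You can verify this with standard Linux/Docker tools, for example (the container ID here is the one from your prompt):

    # inside the container: check overall memory before/while the net is set up
    free -h
    # on the Docker host: live per-container memory usage
    docker stats 982adaaca24f
    # after the process dies: look for OOM-killer messages
    dmesg | grep -i -E 'killed process|out of memory'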

You are using a very high training batch_size:

layer {
          name: "training_train"
          type: "Data"
          top: "data"
          top: "label"
          include {
            phase: TRAIN
          }
          transform_param {
            mean_file: "datasets/mean_training_image.binaryproto"
          }
          data_param {
            source: "datasets/training_set_lmdb"
            batch_size: 400
            backend: LMDB
          }
        }

So the memory required for this batch_size: 400 is simply not available in your system's (CPU) RAM.

So reduce the batch_size and train again — say, batch_size: 20.

Once you know how much memory is available on your system, you can roughly calculate the batch_size you can use.
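For example, a back-of-the-envelope sketch in Python — it only scales the "Memory required for data" figure from your log and ignores weights, gradients, and prefetch buffers, so leave generous headroom:

    # From the log above: batch_size 400 required ~9048638404 bytes for data blobs.
    bytes_per_sample = 9048638404 / 400      # ~22.6 MB per sample
    available = 4 * 1024**3                  # assumed 4 GiB free; adjust to your system
    max_batch = int(available // bytes_per_sample)
    print(max_batch)                         # ~189 here; halve it for backward-pass headroom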