Question

我正在 Google Cloud 上使用 TensorFlow 训练一个卷积网络。作业一启动就出现如下错误："副本 master 0 内存不足，并以非零状态 9（SIGKILL）退出"。我把 masterType 依次改成了 large_model、complex_model_l 和 complex_model_m，但始终遇到同样的错误。也许我必须使用自定义的规模层级（scaleTier: CUSTOM），并指定以下所有参数：

# Custom scale tier: the machine type of each replica role and the number of
# workers / parameter servers are given explicitly below.
# NOTE(review): exact field semantics and the memory of each machine type are
# defined by the Cloud ML Engine TrainingInput reference — confirm there.
trainingInput:
  scaleTier: CUSTOM
  masterType: complex_model_m
  workerType: complex_model_m
  parameterServerType: large_model
  workerCount: 9
  parameterServerCount: 3

我没有找到这些参数各自含义的说明。我的模型并不复杂：前一段时间我在 basic_gpu 层级上用 tf.record 文件进行训练，一切正常；现在我只是把 tf.record 数据集增加了 100000 张图像，结果就出现了这个问题。谁能解释造成内存不足的原因，以及应如何编写 config.yaml 文件来避免此类问题，以便在云端完成训练？

这是我的代码

#Imports..........................................................
import tensorflow as tf
import numpy as np
import os
import argparse
from tensorflow.contrib.learn.python.learn.utils import (saved_model_export_utils)
from tensorflow.contrib.training.python.training import hparam


# Parameters.......................................................
# Batch sizes for training and evaluation. run() hands batch_size_train to
# input_pipeline_train(), which uses it as the shuffle_batch batch size.
# NOTE(review): the pipelines decode each example into a 224x224x3 float32
# image (602,112 bytes), so one training batch of 102455 images is roughly
# 61.7 GB and one validation batch of 25614 images roughly 15.4 GB of host
# memory — likely the source of the reported out-of-memory kill; confirm
# against the dataset size.
batch_size_train=102455
batch_size_val=25614

# Log messages at INFO verbosity
tf.logging.set_verbosity(tf.logging.INFO)

# Module-level session; run() uses it to initialize variables, drive the
# input queue runners, and closes it when training ends.
sess=tf.Session()


# Convolutional Model.......................................................................
def cnn_model(features,labels,mode):
    """Model function (tf.estimator model_fn) for a 3-stage CNN, 2-class classifier.

    Architecture: three conv + max-pool stages (32, 64 and 128 filters),
    one 2000-unit dense layer with dropout, and a 2-unit logits layer.

    Args:
        features: dict whose "x" entry holds image data that can be reshaped
            to [batch_size, 224, 224, 3].
        labels: integer class indices (one-hot encoded with depth 2 for the
            loss). Not used in PREDICT mode.
        mode: one of tf.estimator.ModeKeys.{TRAIN, EVAL, PREDICT}.

    Returns:
        A tf.estimator.EstimatorSpec for the requested mode.
    """

    # Input to the model
    # Reshape to 4-D tensor: [batch_size, height, width, channels].
    # Both spatial dimensions are 224 (the input pipelines emit 224x224x3
    # images); the previous 224x244 target did not match the data.
    input_layer = tf.reshape(features["x"], [-1, 224, 224, 3])

    # Summary to show one input image per step in TensorBoard
    tf.summary.image("input", input_layer, 1)

    # Convolutional Model...................................
    # Convolutional Layer #1
    # Computes 32 filters using a 10x10 kernel with ReLU activation.
    # Padding "same" preserves width and height.
    # Input Tensor Shape: [batch_size, 224, 224, 3]
    # Output Tensor Shape: [batch_size, 224, 224, 32]
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[10, 10],
        padding="same",
        activation=tf.nn.relu)

    # Pooling Layer #1
    # Max pooling with a 4x4 window and stride of 4
    # Input Tensor Shape: [batch_size, 224, 224, 32]
    # Output Tensor Shape: [batch_size, 56, 56, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[4, 4], strides=4)

    # Convolutional Layer #2
    # Computes 64 filters using a 5x5 kernel with ReLU activation.
    # Padding "same" preserves width and height.
    # Input Tensor Shape: [batch_size, 56, 56, 32]
    # Output Tensor Shape: [batch_size, 56, 56, 64]
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)

    # Pooling Layer #2
    # Max pooling with a 4x4 window and stride of 4
    # Input Tensor Shape: [batch_size, 56, 56, 64]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[4, 4], strides=4)

    # Convolutional Layer #3
    # Computes 128 filters using a 5x5 kernel with ReLU activation.
    # Padding "same" preserves width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 64]
    # Output Tensor Shape: [batch_size, 14, 14, 128]
    conv3 = tf.layers.conv2d(
        inputs=pool2,
        filters=128,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)

    # Pooling Layer #3
    # Max pooling with a 2x2 window and stride of 2
    # Input Tensor Shape: [batch_size, 14, 14, 128]
    # Output Tensor Shape: [batch_size, 7, 7, 128]
    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 7, 7, 128]
    # Output Tensor Shape: [batch_size, 7 * 7 * 128]
    pool3_flat = tf.reshape(pool3, [-1, 7 * 7 * 128])
    #...............................................

    # Neural Network................................
    dense = tf.layers.dense(inputs=pool3_flat, units=2000, activation=tf.nn.relu)
    # Dropout is only active while training; rate=0.4 means each element is
    # kept with probability 0.6.
    dropout = tf.layers.dropout(
        inputs=dense,
        rate=0.4,
        training=mode == tf.estimator.ModeKeys.TRAIN)
    logits = tf.layers.dense(inputs=dropout, units=2)
    #..............................................

    # Predictions: "classes" is the argmax of the logits, "probabilities" the
    # softmax of the logits. The op name "softmax_tensor" is what the logging
    # hook in run() looks up, so it must not change.
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }

    # Prediction................................................
    # No labels are available in PREDICT mode, so return before anything
    # that needs them (loss, accuracy) is built.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    #.............................................

    # Calculate the loss............................
    onehot_labels = tf.one_hot(indices=labels, depth=2)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
    #...............................................

    # Calculate the accuracy........................
    # tf.metrics.accuracy returns a (value, update_op) pair; the scalar
    # summary records the update op so the running accuracy is refreshed
    # whenever summaries are written.
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    metrics = {"accuracy": accuracy}
    tf.summary.scalar("accuracy", accuracy[1])
    #.............................................

    # Training.....................................
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    #.............................................

    # Validation....................................
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics)
    #..............................................



# Function to read the train record........................................................
def input_pipeline_train(train_files,batch_size_train):
    """Build the queue-based input ops for the training TFRecord.

    Args:
        train_files: path of the training record; it is wrapped in a
            one-element list and read for a single epoch.
        batch_size_train: number of examples per dequeued batch. It is also
            used as min_after_dequeue, with a queue capacity one larger.

    Returns:
        (images, labels): the batched float32 images (each reshaped to
        224x224x3) and the matching int32 labels.
    """
    # Single-epoch filename queue over the training record
    file_queue = tf.train.string_input_producer(
        [train_files],
        num_epochs=1,
        shuffle=True)

    # Pull the next serialized example off the queue
    record_reader = tf.TFRecordReader()
    _, raw_example = record_reader.read(file_queue)

    # Each record carries the raw image bytes and an integer label
    schema = {
        'train/image': tf.FixedLenFeature([], tf.string),
        'train/label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(raw_example, features=schema)

    # Raw bytes -> float32 values -> one 224x224x3 image
    pixels = tf.decode_raw(parsed['train/image'], tf.float32)
    image = tf.reshape(pixels, [224, 224, 3])

    # Labels are stored as int64; the model consumes int32
    label = tf.cast(parsed['train/label'], tf.int32)

    # Shuffle and batch the decoded examples
    images, labels = tf.train.shuffle_batch(
        [image, label],
        batch_size=batch_size_train,
        capacity=batch_size_train + 1,
        num_threads=5,
        min_after_dequeue=batch_size_train)

    return images, labels


# Function to read the validation record................................................
def input_pipeline_val(val_files,batch_size_val):
    """Build the queue-based input ops for the validation TFRecord.

    Args:
        val_files: path of the validation record; it is wrapped in a
            one-element list and read for a single epoch.
        batch_size_val: number of examples per dequeued batch. It is also
            used as min_after_dequeue, with a queue capacity one larger.

    Returns:
        (images, labels): the batched float32 images (each reshaped to
        224x224x3) and the matching int32 labels.
    """
    # Single-epoch filename queue over the validation record
    file_queue = tf.train.string_input_producer(
        [val_files],
        num_epochs=1,
        shuffle=True)

    # Pull the next serialized example off the queue
    record_reader = tf.TFRecordReader()
    _, raw_example = record_reader.read(file_queue)

    # Each record carries the raw image bytes and an integer label
    schema = {
        'val/image': tf.FixedLenFeature([], tf.string),
        'val/label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(raw_example, features=schema)

    # Raw bytes -> float32 values -> one 224x224x3 image
    pixels = tf.decode_raw(parsed['val/image'], tf.float32)
    image = tf.reshape(pixels, [224, 224, 3])

    # Labels are stored as int64; the model consumes int32
    label = tf.cast(parsed['val/label'], tf.int32)

    # Shuffle and batch the decoded examples (single reader thread)
    images, labels = tf.train.shuffle_batch(
        [image, label],
        batch_size=batch_size_val,
        capacity=batch_size_val + 1,
        num_threads=1,
        min_after_dequeue=batch_size_val)

    return images, labels



#Main Function............................................................................................
def run(hparams):
    """Train the CNN estimator on the training TFRecord.

    Dequeues one batch of batch_size_train examples from the training
    pipeline, wraps it in a numpy_input_fn and trains the estimator on it
    in mini-batches of 100 for 17000 steps. The loop ends when the
    single-epoch input queue is exhausted, which surfaces as
    tf.errors.OutOfRangeError.

    NOTE(review): each sess.run() below materializes batch_size_train
    decoded 224x224x3 float32 images as one NumPy array (about 61.7 GB for
    102455 images), on top of what the shuffle queue itself holds. This is
    the likely cause of the master replica being killed for running out of
    memory; feeding the estimator small batches straight from the TFRecord
    reader would avoid it, but changes the training schedule, so it is left
    for a separate change.

    Args:
        hparams: object exposing `train_files` (path of the training
            record) and `job_dir` (estimator model directory).
    """

    # Read the tf.records
    img_train, lbl_train = input_pipeline_train(hparams.train_files, batch_size_train)
    #img_val,lbl_val=input_pipeline_val(hparams.val_files,batch_size_val)

    # Define the model
    detector = tf.estimator.Estimator(model_fn=cnn_model, model_dir=hparams.job_dir)

    # Log the softmax output every 200 training iterations
    tensor_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensor_to_log, every_n_iter=200)

    # Variables initialization. Local variables are included because the
    # single-epoch filename queue keeps its epoch counter in one.
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    sess.run(init_op)

    # The coordinator tells the threads when the queue is empty
    coord = tf.train.Coordinator()
    # Start the threads (the queue runners)
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        while not coord.should_stop():

            # Obtain the whole batch of training images
            img_t, lbl_t = sess.run([img_train, lbl_train])

            # Feed the in-memory arrays to the model for training
            train_input_fn = tf.estimator.inputs.numpy_input_fn(
                x={"x": img_t},
                y=lbl_t,
                batch_size=100,
                num_epochs=None,
                shuffle=True)
            # The keyword is `input_fn`; the previous spelling `imput_fn`
            # made this call fail with a TypeError before any training ran.
            detector.train(
                input_fn=train_input_fn,
                steps=17000,
                hooks=[logging_hook])

            #Obtain the whole batch of validation images
            #img_v,lbl_v=sess.run([img_val,lbl_val])

            #Enter the inputs to the model for validation
            #val_input_fn=tf.estimator.inputs.numpy_input_fn(
            #   x={"x":img_v},
            #   y=lbl_v,
            #   num_epochs=1,
            #   shuffle=False)
            #val_results=detector.evaluate(input_fn=val_input_fn)
            #print(val_results)

    except tf.errors.OutOfRangeError:
        print("Done training -- epoch limit reached")
    finally:
        # Always stop the reader threads and release the session
        coord.request_stop()
        coord.join(threads)
        sess.close()




if __name__ == '__main__':

    # Command-line options supplied when the program is launched
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument(
        '--train-files',
        required=True,
        help='Path of the training record')

    #arg_parser.add_argument(
    #   'val_files',
    #   help='Path of the evaluation record',
    #   required=True)

    arg_parser.add_argument(
        '--job-dir',
        required=True,
        help='Location to export the model')

    # Wrap the parsed options in an HParams object and start training
    cli_args = arg_parser.parse_args()
    run(hparam.HParams(**vars(cli_args)))

Google Cloud ML：副本 master 0 内存不足，并以非零状态 9（SIGKILL）退出

0 个答案: