我正在 Google Cloud 上使用 TensorFlow 训练一个卷积网络。训练一开始就出现如下错误:「副本 master 0 内存不足,并以非零状态 9(SIGKILL)退出」。我已将 masterType 依次改为 large_model、complex_model_l、complex_model_m,但始终遇到相同的错误。也许我必须使用自定义的规模层级(scaleTier: CUSTOM),并指定以下所有配置:
trainingInput:
  scaleTier: CUSTOM
  masterType: complex_model_m
  workerType: complex_model_m
  parameterServerType: large_model
  workerCount: 9
  parameterServerCount: 3
但我没有找到这些配置项各自含义的说明。我的模型并不复杂:前一段时间我在 basic_gpu 上用 TFRecord 文件进行过训练,一切正常;现在我只是将 TFRecord 数据集增加了 100000 张图像,就遇到了这个问题。谁能解释造成内存不足的原因,以及应当如何编写 config.yaml 文件以避免此类问题,从而在云端完成训练?
以下是我的代码:
#Imports..........................................................
import tensorflow as tf
import numpy as np
import os
import argparse
from tensorflow.contrib.learn.python.learn.utils import (saved_model_export_utils)
from tensorflow.contrib.training.python.training import hparam
# Hyper-parameters and process-wide setup ...........................
# Number of examples dequeued in one go from the training and the
# validation TFRecord pipelines, respectively.
batch_size_train = 102455
batch_size_val = 25614
# Emit TensorFlow log messages of severity INFO and above.
tf.logging.set_verbosity(tf.logging.INFO)
# Module-wide session; run() uses it to initialise variables and to
# drive the input queue runners.
sess = tf.Session()
# Convolutional Model.......................................................................
def cnn_model(features,labels,mode):
    """Estimator model function: three conv/pool stages plus a dense head.

    Args:
        features: dict whose "x" entry holds the image batch; it is reshaped
            to [batch_size, 224, 224, 3].
        labels: class indices (0 or 1), one-hot encoded with depth 2 for the
            loss. Not used in PREDICT mode.
        mode: one of the tf.estimator.ModeKeys values.

    Returns:
        A tf.estimator.EstimatorSpec carrying the train op (TRAIN), the
        evaluation metrics (EVAL) or the predictions (PREDICT).

    Raises:
        ValueError: if `mode` is not a supported tf.estimator.ModeKeys value.
    """
    # Reshape to a 4-D tensor: [batch_size, height, width, channels].
    # The input pipelines produce 224x224x3 images, so both spatial
    # dimensions must be 224 here.
    input_layer = tf.reshape(features["x"], [-1, 224, 224, 3])
    # Show one input image per summary step in TensorBoard.
    tf.summary.image("input", input_layer, 1)

    # Convolutional Layer #1: 32 filters of 10x10, ReLU, "same" padding.
    # Input Tensor Shape:  [batch_size, 224, 224, 3]
    # Output Tensor Shape: [batch_size, 224, 224, 32]
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[10, 10],
        padding="same",
        activation=tf.nn.relu)
    # Pooling Layer #1: 4x4 max pooling with stride 4.
    # Output Tensor Shape: [batch_size, 56, 56, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[4, 4], strides=4)

    # Convolutional Layer #2: 64 filters of 5x5, ReLU, "same" padding.
    # Output Tensor Shape: [batch_size, 56, 56, 64]
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)
    # Pooling Layer #2: 4x4 max pooling with stride 4.
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[4, 4], strides=4)

    # Convolutional Layer #3: 128 filters of 5x5, ReLU, "same" padding.
    # Output Tensor Shape: [batch_size, 14, 14, 128]
    conv3 = tf.layers.conv2d(
        inputs=pool2,
        filters=128,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)
    # Pooling Layer #3: 2x2 max pooling with stride 2.
    # Output Tensor Shape: [batch_size, 7, 7, 128]
    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2)

    # Flatten into a batch of vectors: [batch_size, 7 * 7 * 128].
    pool3_flat = tf.reshape(pool3, [-1, 7 * 7 * 128])

    # Fully connected head.
    dense = tf.layers.dense(inputs=pool3_flat, units=2000, activation=tf.nn.relu)
    # Dropout is only active while training; rate=0.4 keeps 60% of the units.
    dropout = tf.layers.dropout(
        inputs=dense,
        rate=0.4,
        training=mode == tf.estimator.ModeKeys.TRAIN)
    logits = tf.layers.dense(inputs=dropout, units=2)

    # "classes" is the arg-max class index; "probabilities" is the softmax of
    # the logits, named so that a LoggingTensorHook can look it up.
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }

    # Inference needs neither labels nor a loss, so return before touching them.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss (TRAIN and EVAL).
    onehot_labels = tf.one_hot(indices=labels, depth=2)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

    # Streaming accuracy; element [1] is the update op, which is what gets
    # written to the summary.
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    metrics = {"accuracy": accuracy}
    tf.summary.scalar("accuracy", accuracy[1])

    # Training: plain gradient descent on the cross-entropy loss.
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Validation: report the loss together with the accuracy metric.
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics)

    raise ValueError("Unsupported mode: {}".format(mode))
# Function to read the train record........................................................
def input_pipeline_train(train_files,batch_size_train):
    """Build the queue-based ops that yield one shuffled training batch.

    Args:
        train_files: path of the training TFRecord; wrapped in a one-element
            list for the filename queue.
        batch_size_train: number of examples per dequeued batch. It is also
            used as the shuffle queue's `min_after_dequeue`, so the queue
            buffers at least this many decoded images.

    Returns:
        (images, labels): a float32 tensor of shape
        [batch_size_train, 224, 224, 3] and an int32 tensor of labels.
    """
    # Filename queue that is exhausted after a single pass (num_epochs=1).
    file_queue = tf.train.string_input_producer(
        [train_files],
        num_epochs=1,
        shuffle=True)

    # Pull the next serialized example from the queue.
    record_reader = tf.TFRecordReader()
    _, raw_record = record_reader.read(file_queue)

    # Schema of one record: raw image bytes and an integer label.
    schema = {
        'train/image': tf.FixedLenFeature([], tf.string),
        'train/label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(raw_record, features=schema)

    # Bytes -> float32 values -> original image shape.
    pixels = tf.decode_raw(parsed['train/image'], tf.float32)
    pixels = tf.reshape(pixels, [224, 224, 3])

    # Labels are stored as int64; the model consumes int32.
    class_id = tf.cast(parsed['train/label'], tf.int32)

    # Assemble the whole batch through a shuffling queue fed by 5 threads.
    images, labels = tf.train.shuffle_batch(
        [pixels, class_id],
        batch_size=batch_size_train,
        capacity=batch_size_train + 1,
        num_threads=5,
        min_after_dequeue=batch_size_train)
    return images, labels
# Function to read the validation record................................................
def input_pipeline_val(val_files,batch_size_val):
    """Build the queue-based ops that yield one shuffled validation batch.

    Args:
        val_files: path of the validation TFRecord; wrapped in a one-element
            list for the filename queue.
        batch_size_val: number of examples per dequeued batch. It is also
            used as the shuffle queue's `min_after_dequeue`.

    Returns:
        (images, labels): a float32 tensor of shape
        [batch_size_val, 224, 224, 3] and an int32 tensor of labels.
    """
    # Filename queue that is exhausted after a single pass (num_epochs=1).
    file_queue = tf.train.string_input_producer(
        [val_files],
        num_epochs=1,
        shuffle=True)

    # Pull the next serialized example from the queue.
    record_reader = tf.TFRecordReader()
    _, raw_record = record_reader.read(file_queue)

    # Schema of one record: raw image bytes and an integer label.
    schema = {
        'val/image': tf.FixedLenFeature([], tf.string),
        'val/label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(raw_record, features=schema)

    # Bytes -> float32 values -> original image shape.
    pixels = tf.decode_raw(parsed['val/image'], tf.float32)
    pixels = tf.reshape(pixels, [224, 224, 3])

    # Labels are stored as int64; the model consumes int32.
    class_id = tf.cast(parsed['val/label'], tf.int32)

    # Assemble the whole batch through a shuffling queue fed by one thread.
    images, labels = tf.train.shuffle_batch(
        [pixels, class_id],
        batch_size=batch_size_val,
        capacity=batch_size_val + 1,
        num_threads=1,
        min_after_dequeue=batch_size_val)
    return images, labels
#Main Function............................................................................................
def run(hparams):
    """Read the training record into memory and train the CNN estimator.

    Args:
        hparams: object exposing `train_files` (path of the training
            TFRecord) and `job_dir` (directory used as the estimator's
            model_dir).

    Side effects:
        Runs ops in, and finally closes, the module-level session `sess`;
        the estimator writes to `hparams.job_dir`.
    """
    # Build the TFRecord reading ops in the default graph.
    img_train, lbl_train = input_pipeline_train(hparams.train_files, batch_size_train)
    #img_val,lbl_val=input_pipeline_val(hparams.val_files,batch_size_val)

    # Define the model
    detector = tf.estimator.Estimator(model_fn=cnn_model, model_dir=hparams.job_dir)
    tensor_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensor_to_log, every_n_iter=200)

    # Local variables must be initialised too: the filename queue keeps its
    # epoch counter in one.
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    sess.run(init_op)

    # The coordinator tells the reader threads when to stop.
    coord = tf.train.Coordinator()
    # Start the threads (the queue runners)
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while not coord.should_stop():
            # NOTE(review): batch_size_train is the module-level 102455, so
            # this single dequeue materialises about 61.7 GB of float32
            # pixels (102455 x 224 x 224 x 3 x 4 bytes) as a NumPy array, in
            # addition to what the shuffle queue already buffers. This is the
            # likely cause of the out-of-memory SIGKILL; consider feeding the
            # estimator through an input_fn that yields small batches.
            img_t, lbl_t = sess.run([img_train, lbl_train])

            # Feed the in-memory arrays to the model in mini-batches of 100.
            train_input_fn = tf.estimator.inputs.numpy_input_fn(
                x={"x": img_t},
                y=lbl_t,
                batch_size=100,
                num_epochs=None,
                shuffle=True)
            # The keyword is `input_fn`; the previous spelling `imput_fn`
            # made this call raise TypeError.
            detector.train(
                input_fn=train_input_fn,
                steps=17000,
                hooks=[logging_hook])

            # Validation is currently disabled:
            #img_v,lbl_v=sess.run([img_val,lbl_val])
            #val_input_fn=tf.estimator.inputs.numpy_input_fn(
            #    x={"x":img_v},
            #    y=lbl_v,
            #    num_epochs=1,
            #    shuffle=False)
            #val_results=detector.evaluate(input_fn=val_input_fn)
            #print(val_results)
    except tf.errors.OutOfRangeError:
        # Raised by the dequeue once the single-epoch filename queue is empty.
        print("Done training -- epoch limit reached")
    finally:
        # Always stop and join the reader threads before closing the session.
        coord.request_stop()
        coord.join(threads)
        sess.close()
if __name__ == '__main__':
    # Command-line flags supplied when the program is executed.
    cli = argparse.ArgumentParser()
    cli.add_argument('--train-files', help='Path of the training record', required=True)
    # Validation input is currently disabled:
    #cli.add_argument('val_files', help='Path of the evaluation record', required=True)
    cli.add_argument('--job-dir', help='Location to export the model', required=True)
    parsed_args = cli.parse_args()
    # Expose the parsed flags as hyper-parameters (train_files, job_dir).
    hparams = hparam.HParams(**vars(parsed_args))
    run(hparams)