保存模型时Tensorflow DNNRegressor错误?

时间:2017-04-24 03:30:06

标签: python python-2.7 machine-learning tensorflow deep-learning

我使用DNNRegressor作为遗传算法的适应度函数,因此我需要执行数千次,然后在每次迭代中保存模型。这里显示了代码的轻量级版本。模拟在某个点之后很好,然后出现错误并且模拟停止。我正在使用Tensorflow 1.0.1,如果您有任何建议可以解决这个问题,我将非常感谢您的帮助。

错误:

  

tensorflow.python.framework.errors_impl.InvalidArgumentError: 不成功的TensorSliceReader构造函数: 无法获得匹配文件 /home/xxxx/workspace2/NNEvolution/4_15_136/6/model.ckpt-1500: 资源耗尽: /home/xxxx/workspace2/NNEvolution/4_15_136/6
  [[节点: save/RestoreV2_5 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2_5/tensor_names, save/RestoreV2_5/shape_and_slices)]]
  [[节点: save/RestoreV2_7/_7 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_6_save/RestoreV2_7", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]]

     

InvalidArgumentError (参见上面的回溯): 不成功的TensorSliceReader构造函数: 无法获取匹配的文件 /home/xxxx/workspace2/NNEvolution/4_15_136/6/model.ckpt-1500: 资源耗尽: /home/xxxx/workspace2/NNEvolution/4_15_136/6
  [[节点: save/RestoreV2_5 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2_5/tensor_names, save/RestoreV2_5/shape_and_slices)]]
  [[节点: save/RestoreV2_7/_7 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_6_save/RestoreV2_7", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]]

     

进程以退出代码 139 结束(被信号 11 中断: SIGSEGV)

此处的代码详情:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
import shutil

tf.logging.set_verbosity(tf.logging.INFO)
def FitnessFunction ( hhLayer, epoch, pathModel, train_set, val_set, test_set, every_n_steps, early_stopping_rounds, nnExperiment, gaExperiment, gaGeneration, globalIndCounter ):
    """GA fitness function: train ``nnExperiment`` independent DNNRegressors
    and return their average test-set loss (MSE — lower is fitter).

    Args:
        hhLayer: list of ints, units per hidden layer.
        epoch: max number of training steps per regressor.
        pathModel: root directory under which per-individual model dirs are created.
        train_set, val_set, test_set: datasets exposing ``.data`` and ``.target``.
        every_n_steps: validation-monitor evaluation period (in steps).
        early_stopping_rounds: steps without validation improvement before stopping.
        nnExperiment: number of independent training runs to average over.
        gaExperiment, gaGeneration, globalIndCounter: identifiers combined into
            a unique checkpoint directory per GA individual.

    Returns:
        float: mean evaluation loss over the ``nnExperiment`` runs.
    """
    avgMSE = 0
    # A single real-valued feature column covering all input columns.
    feature_columns = [tf.contrib.layers.real_valued_column("")]
    for nnExp in range(0, nnExperiment):
        # Early-stop on validation "loss".  The metric must be *minimized*:
        # the previous early_stopping_metric_minimize=False told the monitor
        # to stop as soon as the loss failed to increase, i.e. almost at once.
        validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
            val_set.data,
            val_set.target,
            every_n_steps=every_n_steps,
            early_stopping_metric="loss",
            early_stopping_metric_minimize=True,
            early_stopping_rounds=early_stopping_rounds)

        # Unique model dir per (GA experiment, generation, individual, run).
        strName = "%s_%s_%s" % (gaExperiment, gaGeneration, globalIndCounter)
        indPath = os.path.join( os.path.sep, pathModel, strName, str( nnExp ) )

        # keep_checkpoint_max bounds how many checkpoints each model dir keeps.
        # Thousands of fitness evaluations, each retaining the default 5
        # checkpoints, eventually fill the disk; that surfaces as the
        # "Resources exhausted" InvalidArgumentError when the Saver tries to
        # restore model.ckpt-<step>.
        mlp = tf.contrib.learn.DNNRegressor(
            feature_columns=feature_columns,
            hidden_units=hhLayer,
            model_dir=indPath,
            activation_fn=tf.nn.relu,
            enable_centered_bias=True,
            dropout=0.01,
            config=tf.contrib.learn.RunConfig(gpu_memory_fraction=0.30,
                                              save_checkpoints_secs=43200,
                                              keep_checkpoint_max=2))

        # Train, then score on the held-out test split.
        mlp.fit( x=train_set.data, y=train_set.target, steps=epoch, monitors=[validation_monitor], batch_size=50 )
        MSE = mlp.evaluate( x=test_set.data, y=test_set.target)["loss"]
        avgMSE = avgMSE + MSE

        # Drop the estimator so its graph/session can be collected between
        # runs instead of accumulating across thousands of evaluations.
        # NOTE(review): if the per-run checkpoints are not needed afterwards,
        # shutil.rmtree(indPath) here would also cap total disk usage.
        del mlp
    return (avgMSE/nnExperiment)
if __name__ == "__main__":
    # Boston-housing CSV splits located next to this script.
    IRIS_TRAINING   = os.path.join(os.path.dirname(__file__), "boston_train.csv")
    IRIS_VAL        = os.path.join(os.path.dirname(__file__), "boston_val.csv")
    IRIS_TEST       = os.path.join(os.path.dirname(__file__), "boston_test.csv")

    # np.float was a deprecated alias of the builtin float (removed in
    # NumPy >= 1.20); np.float64 is the equivalent concrete dtype.
    train_set       = tf.contrib.learn.datasets.base.load_csv_with_header( filename=IRIS_TRAINING,  target_dtype=np.float64, features_dtype=np.float64 )
    val_set         = tf.contrib.learn.datasets.base.load_csv_with_header( filename=IRIS_VAL,       target_dtype=np.float64, features_dtype=np.float64 )
    test_set        = tf.contrib.learn.datasets.base.load_csv_with_header( filename=IRIS_TEST,      target_dtype=np.float64, features_dtype=np.float64 )

    # Start every simulation from an empty model root so stale checkpoints
    # from a previous run cannot be restored by mistake.
    initialPath     = os.path.join( os.path.sep, 'home', 'xxxx', 'workspace2', 'NNEvolution' )
    if os.path.isdir( initialPath ):
        shutil.rmtree( initialPath )

    # Evaluate one fixed architecture for 50 GA experiments x 30 generations,
    # giving every individual a globally unique model-directory index.
    idxCount = 1
    for exp in range(0, 50):
        for ger in range(0, 30):
            mse = FitnessFunction( [30, 30, 30, 30], 1500, initialPath,
                                   train_set, val_set, test_set, 1, 200,
                                   nnExperiment=10, gaExperiment=exp,
                                   gaGeneration=ger, globalIndCounter=idxCount )
            idxCount = idxCount + 1
            print("MSE: {0:f}".format(mse))

GPU details

0 个答案:

没有答案