我刚开始使用tensorflow。我试图用原始图像(944,944)喂养一些神经元,我后来重塑了BINARY分类。完整代码在这里:
import tensorflow as tf
import numpy as np
import os
# import cv2
from scipy import ndimage
import PIL
tf.logging.set_verbosity(tf.logging.INFO)
file_writer =tf.summary.FileWriter('./log',tf.Session().graph)
def define_model(features, labels, mode):
"""Model function for CNN."""
# Input Layer
input_layer = tf.reshape(features["x"], [-1, 512, 512, 1])
# Convolutional Layer #1
conv1 = tf.layers.conv2d(
inputs=input_layer,
filters=32,
kernel_size=[16, 16],
padding="same",
activation=tf.nn.relu)
# Pooling Layer #1
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
# Convolutional Layer #2 and Pooling Layer #2
conv2 = tf.layers.conv2d(
inputs=pool1,
filters=64,
kernel_size=[16, 16],
padding="same",
activation=tf.nn.relu)
pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
# Dense Layer
# pool2_shape = tf.shape(pool2)
# pool2_flat = tf.reshape (pool2,
[-1,pool2_shape[1]*pool2_shape[2]*pool2_shape[3]])
pool2_flat=tf.layers.flatten(pool2)
dense = tf.layers.dense(inputs=pool2_flat, units=1024,
activation=tf.nn.relu)
dropout = tf.layers.dropout(
inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
# Logits Layer - raw predictions
logits = tf.layers.dense(inputs=dropout, units=10)
predictions = {
# Generate predictions (for PREDICT and EVAL mode)
"classes": tf.argmax(input=logits, axis=1),
# Add `softmax_tensor` to the graph. It is used for PREDICT and by the
# `logging_hook`.
"probabilities": tf.nn.softmax(logits, name="softmax_tensor")
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
# Calculate Loss (for both TRAIN and EVAL modes)
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
# loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels,
logits=tf.reshape(logits,[10,10]))
# Configure the Training Op (for TRAIN mode)
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(
loss=loss,
global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
train_op=train_op)
# Add evaluation metrics (for EVAL mode)
eval_metric_ops = {
"accuracy": tf.metrics.accuracy(
labels=labels, predictions=predictions["classes"])}
return tf.estimator.EstimatorSpec(
mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def load_images(path):
list_of_imgs = []
list_of_classes = []
for (dirpath1, dirnames1, filenames1) in os.walk(path):
for dir1 in dirnames1:
path1 = os.path.join(dirpath1, dir1)
for (dirpath2, dirnames2, filenames2) in os.walk(path1):
for dir2 in dirnames2:
path2 = os.path.join(dirpath2, dir2)
# for (dirpath3, dirnames3, filenames3) in
os.walk(dirpath2):
# for directorypath, directoryname in
zip(dirpath3,dirnames3):
for directoryname in os.listdir(path2):
if directoryname == "Mascara_Frames_Aislados":
directorypath = os.path.join(path2, directoryname)
directorypath = os.path.join(directorypath,
"crudas")
for img in os.listdir(directorypath):
img = os.path.join(directorypath, img)
if not img.endswith(".bmp"):
continue
a = ndimage.imread(img)
if a is None:
print ("Unable to read image: ", img)
continue
a = np.resize(a, [512, 512])
list_of_imgs.append(a.flatten())
list_of_classes.append(1)
elif directoryname == "FalsaAlarma":
directorypath = os.path.join(path2, directoryname)
directorypath = os.path.join(directorypath,
"crudas")
for img in os.listdir(directorypath):
img = os.path.join(directorypath, img)
if not img.endswith(".bmp"):
continue
a = ndimage.imread(img)
if a is None:
print ("Unable to read image: ", img)
continue
a = np.resize(a,[512,512])
list_of_imgs.append(a.flatten())
list_of_classes.append(0)
images = np.array(list_of_imgs, dtype="float16")
labels = np.array(list_of_classes, dtype="int32")
return images,labels
if __name__ == '__main__':
# Load training and eval data
# mnist = tf.contrib.learn.datasets.load_dataset("mnist")
# train_data = mnist.train.images # Returns np.array
# train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
train_data, train_labels =
load_images("C:\\Users\\Heads\\Desktop\\BDManchas_Semi")
eval_data = train_data.copy()
eval_labels = train_labels.copy()
# Create the Estimator
classifier = tf.estimator.Estimator(
model_fn=define_model, model_dir="/tmp/convnet_model")
# Set up logging for predictions
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
tensors=tensors_to_log, every_n_iter=50)
# Train the model
train_input_fn = tf.estimator.inputs.numpy_input_fn(
x={"x": train_data},
y=train_labels,
batch_size=10,
num_epochs=None,
shuffle=True)
classifier.train(
input_fn=train_input_fn,
steps=100, #TODO estaba a 20000
hooks=[logging_hook])
# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
x={"x": eval_data},
y=eval_labels,
num_epochs=1,
shuffle=False)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)
问题是我收到错误:训练期间NaN丢失。如下:
C:\Users\Heads\AppData\Local\Programs\Python\Python35\python.exe
C:/Users/Heads/Desktop/TensorflowTests/test.py
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_every_n_hours': 10000,
'_task_id': 0, '_cluster_spec':
<tensorflow.python.training.server_lib.ClusterSpec object at
0x0000000007A56E48>, '_save_summary_steps': 100, '_train_distribute': None,
'_num_worker_replicas': 1, '_task_type': 'worker', '_keep_checkpoint_max': 5,
'_save_checkpoints_secs': 600, '_service': None, '_is_chief': True,
'_model_dir': '/tmp/convnet_model', '_global_id_in_cluster': 0,
'_log_step_count_steps': 100, '_tf_random_seed': None,
'_save_checkpoints_steps': None, '_evaluation_master': '', '_master': '',
'_num_ps_replicas': 0, '_session_config': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/convnet_model\model.ckpt-4
2018-05-23 16:51:08.096240: W
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101]
Allocation of 2147483648 exceeds 10% of system memory.
2018-05-23 16:51:11.786245: W
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101]
Allocation of 2147483648 exceeds 10% of system memory.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
2018-05-23 16:52:51.429447: W
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101]
Allocation of 2147483648 exceeds 10% of system memory.
2018-05-23 16:55:48.539695: W
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101]
Allocation of 1073741824 exceeds 10% of system memory.
2018-05-23 16:55:48.539695: W
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101]
Allocation of 1073741824 exceeds 10% of system memory.
INFO:tensorflow:Saving checkpoints for 5 into /tmp/convnet_model\model.ckpt.
INFO:tensorflow:probabilities = [[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]]
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "C:/Users/Heads/Desktop/TensorflowTests/test.py", line 155, in
<module>
hooks=[logging_hook])
...
File "C:\Users\Heads\AppData\Local\Programs\Python\Python35\lib\site-
packages\tensorflow\python\training\monitored_session.py", line 1199, in run
run_metadata=run_metadata))
File "C:\Users\Heads\AppData\Local\Programs\Python\Python35\lib\site-
packages\tensorflow\python\training\basic_session_run_hooks.py", line 623, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks. NanLossDuringTrainingError: NaN loss during training.
Process finished with exit code 1
我认为问题来自标签,它们与输入的形状不同(批次= 10)。我没有成功地重塑标签,并且logits和我也降低了学习率。此外,我尝试使用另一种计算损失的方法来避免NaN结果发布How to choose cross-entropy loss in tensorflow?,但没有解决方案。
答案 0 :(得分:1)
首先,您的代码存在一些问题:
您似乎只有2个班级(0
和1
),但您的网络有10个输出(c.f。logits = tf.layers.dense(inputs=dropout, units=10)
)。您的上一个dense
图层可能只有2个单元。
logits=tf.reshape(logits,[10,10]))
没有效果(因为之后您没有使用logits
)。它可以简单地删除。
其次,使用下面粘贴的模拟load_images()
,我没有在训练此模型时遇到任何错误(跟踪粘贴在下面)。因此,您的问题可能会出现:
load_images()
函数或您的数据集本身; /tmp/convnet_model
(c.f。classifier = tf.estimator.Estimator(model_fn=define_model, model_dir="/tmp/convnet_model")
中的模型,因为此模型可能已因先前的训练尝试而损坏。def load_images(path):
list_of_imgs = []
list_of_classes = []
dataset_size = 100
for i in range(dataset_size):
a = np.random.rand(512, 512)
label = np.random.randint(0, 2)
list_of_imgs.append(a.flatten())
list_of_classes.append(label)
images = np.array(list_of_imgs, dtype="float16")
labels = np.array(list_of_classes, dtype="int32")
return images,labels
跟踪:
...
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/convnet_model_2/model.ckpt.
INFO:tensorflow:probabilities = [[0.5137 0.4868]
[0.507 0.493 ]
[0.5054 0.4949]
[0.501 0.4993]
[0.4924 0.508 ]
[0.513 0.4868]
[0.539 0.461 ]
[0.5186 0.481 ]
[0.493 0.507 ]
[0.5103 0.4895]]
INFO:tensorflow:loss = 0.705, step = 1
...