我有以下问题。
我正在尝试在tensorflow中训练一个3d CNN。我已将数据分为三个数据集,即训练,验证和测试。
主要问题是:当我在训练 5 个 epoch 之后用验证集进行测试时,模型对这 5 张图像的输出几乎完全相同。(下面是最后一层未经 softmax 的输出)
2018-04-17 23:30:35.134318 Prediction: [[0.8185656 2.7571523 ]
[0.8200048 2.7590456 ]
[0.8185656 2.7571523 ]
[0.8200048 2.7590458 ]
[0.7751368 2.7532804 ]
[0.82061136 2.7588618 ]
[0.8130686 2.7821052 ]
[0.83537185 2.7514493 ]
[0.8200041 2.7590454 ]
[0.81701267 2.7519925 ]
[0.8424163 2.8674953 ]
[0.82000506 2.7590454 ]
[0.81999433 2.7590487 ]
[0.81701267 2.7519925 ]
但是,如果我对训练集做同样的测试,得到的预测结果是正常的。
我已经完全检查了数据集,两者都是正确的,并且条件相同。
下面是我用来构建模型并进行训练的代码:
class Cnn3DMRI(object):
    """Two-class 3D CNN for volumetric images on the TensorFlow 1.x graph API.

    The class calls several helpers that are not part of this listing and
    must be provided elsewhere: ``log``, ``shufle_data``, ``load_dataset``
    and ``load_image``.
    """

    # Number of target classes. Shared by the label placeholder, the one-hot
    # encoding and the output layer so that the three cannot drift apart.
    NUM_CLASSES = 2
    # Number of feature maps produced by the second convolution.
    CONV2_FILTERS = 48

    def weight_variable(self, shape):
        """Return a trainable variable drawn from a truncated normal (std 0.1)."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        """Return a trainable variable of the given shape filled with 0.1."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def conv3d(self, x, W):
        """Apply a stride-1, SAME-padded 3D convolution of ``x`` with ``W``."""
        return tf.nn.conv3d(x, W, strides=[1, 1, 1, 1, 1], padding='SAME')

    def maxpool3d(self, x):
        """Apply a 2x2x2 max pooling with stride 2 and SAME padding."""
        return tf.nn.max_pool3d(x, ksize=[1, 2, 2, 2, 1], strides=[1, 2, 2, 2, 1],
                                padding='SAME')

    def dense_to_one_hot(self, labels_dense, num_classes):
        """Convert class labels from scalars to one-hot vectors.

        Args:
            labels_dense: integer NumPy array of class indices.
            num_classes: number of columns of the result.

        Returns:
            A float array of shape ``(labels_dense.shape[0], num_classes)``.
        """
        num_labels = labels_dense.shape[0]
        labels_one_hot = np.zeros((num_labels, num_classes))
        # Flat index of the first column of every row; adding the label
        # selects the column to switch on.
        row_starts = np.arange(num_labels) * num_classes
        labels_one_hot.flat[row_starts + labels_dense.ravel()] = 1
        return labels_one_hot

    def wrapper_image(self, full_image_set, full_label_set, last_batch=0, batch_size=5):
        """Slice one batch out of the image and label sets.

        Returns:
            ``(batch_img, batch_label, next_start)`` where ``next_start`` is
            ``last_batch + batch_size``.
        """
        stop = last_batch + batch_size
        batch_img = full_image_set[last_batch:stop, :, :, :]
        batch_label = full_label_set[last_batch:stop]
        return batch_img, batch_label, stop

    @staticmethod
    def _flat_feature_size(img_sz, n_slices):
        """Return the per-sample feature count after both pooling layers.

        The two SAME-padded poolings (stride 2, then stride 4) shrink every
        spatial axis to ``ceil(dim / 8)``. The division is done in floating
        point so the ceiling is also correct under Python 2 integer division.
        """
        depth = int(math.ceil(n_slices / 8.0))
        side = int(math.ceil(img_sz / 8.0))
        return depth * side * side * Cnn3DMRI.CONV2_FILTERS

    def _next_training_batch(self, data_img, labels, last_batch, batch_size):
        """Return the next training batch, wrapping around at the end of the data.

        Returns:
            ``(batch_img, batch_label, next_start, epoch_done)``. When the
            remaining samples do not fill a batch, the batch is completed with
            samples from the start of the data set and ``epoch_done`` is True.
        """
        n_samples = len(data_img)
        if n_samples > last_batch + batch_size:
            batch_img, batch_label, next_start = self.wrapper_image(
                data_img, labels, last_batch, batch_size)
            return batch_img, batch_label, next_start, False

        # Number of samples to borrow from the head of the data set.
        overflow = last_batch + batch_size - n_samples
        tail_img, tail_label, _ = self.wrapper_image(
            data_img, labels, last_batch, n_samples - last_batch)
        head_img, head_label, _ = self.wrapper_image(data_img, labels, 0, overflow)
        batch_img = np.concatenate((tail_img, head_img))
        batch_label = np.concatenate((tail_label, head_label))
        return batch_img, batch_label, overflow, True

    def convolutional_neural_network(self, x, img_sz, n_slices):
        """Build the network graph and return the pre-softmax output tensor.

        Args:
            x: input tensor, reshaped here to
                ``[-1, n_slices, img_sz, img_sz, 1]``.
            img_sz: in-plane size of the (square) slices.
            n_slices: number of slices per volume.
        """
        flat_size = self._flat_feature_size(img_sz, n_slices)

        # Variables are created in the same order as before so that the
        # automatically generated variable names do not change.
        weights = {
            'W_conv1': self.weight_variable([3, 5, 5, 1, 32]),
            'W_conv2': self.weight_variable([2, 5, 5, 32, self.CONV2_FILTERS]),
            'W_fc': self.weight_variable([flat_size, 2048]),
            'W_fc2': self.weight_variable([2048, 1024]),
            'out': self.weight_variable([1024, self.NUM_CLASSES]),
        }
        biases = {
            'b_conv1': self.bias_variable([32]),
            'b_conv2': self.bias_variable([self.CONV2_FILTERS]),
            'b_fc': self.bias_variable([2048]),
            'b_fc2': self.bias_variable([1024]),
            'out': self.bias_variable([self.NUM_CLASSES]),
        }

        self.x_im = tf.reshape(x, shape=[-1, n_slices, img_sz, img_sz, 1])

        conv1 = tf.nn.relu(self.conv3d(self.x_im, weights['W_conv1']) + biases['b_conv1'])
        conv1 = tf.Print(conv1, [conv1], 'The conv1: ')
        conv1 = self.maxpool3d(conv1)
        conv1 = tf.Print(conv1, [conv1], 'The max1: ')

        conv2 = tf.nn.relu(self.conv3d(conv1, weights['W_conv2']) + biases['b_conv2'])
        # Must be re-bound to conv2; otherwise the print op is not on the path
        # to the output and never runs.
        conv2 = tf.Print(conv2, [conv2], 'The conv2: ')
        conv2 = tf.nn.max_pool3d(conv2, ksize=[1, 4, 4, 4, 1], strides=[1, 4, 4, 4, 1],
                                 padding='SAME')
        conv2 = tf.Print(conv2, [conv2], 'The max2: ')

        fc = tf.reshape(conv2, [-1, flat_size])
        fc = tf.Print(fc, [fc], 'The reshape: ')
        fc2 = tf.nn.relu(tf.matmul(fc, weights['W_fc']) + biases['b_fc'])
        fc2 = tf.Print(fc2, [fc2], 'The fc: ')
        dp1 = tf.nn.dropout(fc2, self.keep_prob)
        fc3 = tf.nn.relu(tf.matmul(dp1, weights['W_fc2']) + biases['b_fc2'])
        fc3 = tf.Print(fc3, [fc3], 'The fc2: ')
        dp2 = tf.nn.dropout(fc3, self.keep_prob)

        output = tf.matmul(dp2, weights['out']) + biases['out']
        output = tf.Print(output, [output], 'The output: ')
        return output

    def test_validation_set(self, sess, data_validation, label_validation, valid_batch_size=60):
        """Evaluate one validation batch and log prediction, accuracy and cost.

        The batch starts at ``self.last_valid_batch``, which is then advanced,
        or reset to 0 when no further full batch is available.

        Returns:
            ``(validation_accuracy, validation_cost)`` for the batch.
        """
        batch_img, batch_label, last_batch = self.wrapper_image(
            data_validation, label_validation, self.last_valid_batch, valid_batch_size
        )
        batch_label = self.dense_to_one_hot(
            np.array(batch_label, dtype=int), self.NUM_CLASSES
        ).astype(np.float32)

        # "<=" so that a final batch that fits exactly is not skipped.
        if last_batch + valid_batch_size <= len(label_validation):
            self.last_valid_batch = last_batch
        else:
            self.last_valid_batch = 0

        # keep_prob is 1.0: dropout is disabled during evaluation.
        pred, c, validation_accuracy = sess.run(
            [self.prediction, self.cost, self.accuracy], feed_dict={
                self.x: batch_img, self.y_: batch_label, self.keep_prob: 1.0
            }
        )
        self.log("Prediction: " + str(pred))
        self.log("Label: " + str(batch_label))
        self.log("Validation accuracy: " + str(validation_accuracy))
        self.log("Validation cost: " + str(c))
        return validation_accuracy, c

    def train_neural_network(self, data_img, labels, data_validation, label_validation,
                             batch_size, img_sz, n_slices, last_batch,
                             keep_rate, model_path):
        """Build the training graph and run the training loop.

        Args:
            data_img: training images, indexable as ``[start:stop, :, :, :]``.
            labels: dense integer training labels.
            data_validation: validation images.
            label_validation: dense integer validation labels.
            batch_size: number of samples per training step.
            img_sz: in-plane size of the slices.
            n_slices: number of slices per volume.
            last_batch: index of the first training sample to use.
            keep_rate: dropout keep probability used while training.
            model_path: where the best model is saved; when falsy nothing is
                saved.
        """
        self.prediction = self.convolutional_neural_network(self.x, img_sz, n_slices)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.y_, logits=self.prediction))
        optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)
        correct_prediction = tf.equal(tf.argmax(self.prediction, 1), tf.argmax(self.y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        hm_epochs = 1000
        learning_rate = 1e-4
        # Improvement in validation cost required before the best model is
        # overwritten (same margin as the original comparison).
        min_improvement = 2
        model_path_train = 'model_train/my_model.ckpt'
        saver = tf.train.Saver(tf.trainable_variables())

        epoch_loss = 0
        # NOTE: despite its name this list stores the summed loss per epoch.
        epoch_loss_mean = []
        n_epoch = 0
        all_valid_cost = []
        self.last_valid_batch = 0
        # Start at +inf so that the first validation result is always kept;
        # starting at 0 made the "best model" branch unreachable.
        min_valid_cost = float('inf')

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # Restoring a previous checkpoint is not implemented (it was
            # disabled in the original code as well).

            while n_epoch < hm_epochs:
                batch_img, batch_label, last_batch, epoch_done = self._next_training_batch(
                    data_img, labels, last_batch, batch_size)
                print("Batch label images: " + str(batch_label))
                batch_label = self.dense_to_one_hot(
                    np.array(batch_label, dtype=int), self.NUM_CLASSES
                ).astype(np.float32)

                # Exactly two fetches, hence exactly two results.
                _, c = sess.run(
                    [optimizer, self.cost], feed_dict={
                        self.x: batch_img, self.y_: batch_label,
                        self.keep_prob: keep_rate,
                        self.learning_rate: learning_rate
                    }
                )
                epoch_loss += c
                print('epoch_loss: ' + str(c))

                if not epoch_done:
                    continue

                # ------------- end-of-epoch bookkeeping -------------
                epoch_loss_mean.append(epoch_loss)
                print("epoch loss mean: " + str(epoch_loss_mean))
                epoch_loss = 0
                n_epoch += 1
                print("n_epoch: " + str(n_epoch))
                if model_path:
                    saver.save(sess, model_path_train)

                if n_epoch % 5 == 0:
                    valid_accuracy, valid_cost = self.test_validation_set(
                        sess, data_validation, label_validation, 60)
                    if valid_cost < min_valid_cost - min_improvement:
                        min_valid_cost = valid_cost
                        if model_path:
                            saver.save(sess, model_path)
                    all_valid_cost.append(valid_cost)
                    print(all_valid_cost)
                    if self.last_valid_batch == 0:
                        # NOTE(review): the return value is ignored, so this
                        # presumably shuffles in place and keeps images and
                        # labels aligned -- confirm in shufle_data.
                        self.shufle_data(data_validation, label_validation)
                    train_accuracy = self.accuracy.eval(
                        feed_dict={self.x: batch_img, self.y_: batch_label,
                                   self.keep_prob: 1.0})
                    print("trainning accuracy: " + str(train_accuracy))

                self.shufle_data(data_img, labels)

    def main(self, data_dir, labels_dir, img_sz, n_slices, batch_size=5, last_batch=0, train=False,
             model_path=None, keep_rate=0.5):
        """Load the data, create the placeholders and optionally train.

        Args:
            data_dir(list): directories of the image to be tested
            labels_dir: (str): directory of the csv file where the image are labeled, the index
                colum is the number 2 and the labels header is 'Diag'.
            img_sz: the spatial image size the be transformed to. that is the sizes with which
                the image will be trainned. width and hight must be the same
            n_slices: the number of slices for the image to be trained
            batch_size: number of images per training step
            last_batch: the batch at which you want to start the trainning
            train: when true the network is trained; otherwise only the data is
                loaded and the placeholders are created
            model_path: the path where the model is saved, if there is no previous model you can
                set a path here to start a new one.
            keep_rate: the keep_probability of firing a node by means of dropout
        Returns:
            None
        """
        self.train = train
        (data_path_trainning, label_trainning, data_path_validation, label_validation,
         data_testing, label_testing) = self.load_dataset(data_dir, labels_dir)
        data_trainning, label_trainning_final = self.load_image(
            data_path_trainning, label_trainning, img_sz, n_slices
        )
        data_validation, label_validation_final = self.load_image(
            data_path_validation, label_validation, img_sz, n_slices
        )

        # Placeholders: images are [batch, slices, height, width]; labels are
        # one-hot with NUM_CLASSES columns, matching the network output.
        self.x = tf.placeholder(tf.float32, shape=[None, n_slices, img_sz, img_sz])
        self.y_ = tf.placeholder(tf.float32, shape=[None, self.NUM_CLASSES])
        self.learning_rate = tf.placeholder(tf.float32)
        self.keep_prob = tf.placeholder(tf.float32)

        if train:
            self.train_neural_network(data_trainning, label_trainning_final, data_validation,
                                      label_validation_final, batch_size, img_sz, n_slices,
                                      last_batch, keep_rate, model_path)
我已经尝试过 tf.set_random_seed(1),但问题依旧。
有人有任何想法吗?
非常感谢
EDITED 22/04/18:
要分类的数据是 150x150x40 像素的 3D 图像,这是一个二分类问题。我总共有 400 张图像,两个类别大约各占一半。我已将数据集划分为训练集(75%)、验证集(10%)和测试集(15%)。
EDIT2: 我简化了一下我的模型。看到了
另请注意,我们只有 2 个类别。
我还做了另一项检查:只用 20 张图像训练模型,看看损失(cost)能否降到 0。
125 个 epoch 后的结果:
2018-04-24 23:58:24.992421 epoch loss mean: [4549.9554141853,1854.6537470817566,817.4076923541704,686.8368729054928,687.7348744268759,704.946801304817,483.6952783479355,260.2293045549304,272.66821688037817,116.57515235748815,97.86094704543848,90.43152131629176,132.54018089070996,69.62595339218387,57.412255316681694,79.66184640157735,70.99515068903565,55.75798599421978,44.14403077028692,38.901107819750905,49.75594720244408,52.6321079954505,37.70595762133598,42.07099115010351,29.01994925737381,28.365123450756073,31.93120799213648,43.9855432882905,33.242121398448944,36.57513061538339,28.828659534454346,29.847569406032562,24.078316539525986,31.630925316363573,30.5430103354156,26.18060240149498,32.86780231446028,25.42889341711998,29.355055704712868,26.269534677267075,24.921810917556286,27.15281054377556,27.343381822109222,24.293660208582878,28.212179094552994,25.07626649737358,21.650991335511208,25.7527906447649,23.42476052045822,28.350880563259125,22.57907184958458,21.601420983672142,25.28128480911255,25.550641894340515,22.444457232952118,27.660063683986664,21.863914296031,25.722180172801018,24.00674758851528,21.46472266316414,26.599679857492447,23.52132275700569,26.1786640137434,24.842691332101822,25.263965144753456,22.730938494205475,22.787407517433167,23.58866274356842,25.351682364940643,23.85272353887558,23.884423837065697,24.685379207134247,22.55106496810913,25.993630707263947,21.967322662472725,22.651918083429337,21.91003155708313,23.782021015882492,21.567724645137787,22.130879193544388,21.33636975288391,25.624440014362335,23.26347705721855,22.370914071798325,22.614411562681198,24.962509214878082,22.121410965919495,20.644148647785187,24.472172617912292,21.622991144657135,21.719978988170624,21.72349101305008,21.729621797800064,22.090826153755188,21.44688707590103,22.34817299246788,22.93226248025894,22.63547444343567,22.1306095123291,22.16277289390564,22.83771103620529,24.171751350164413,22.025538682937622,21.339059710502625,22.169043481349945,24.614955246448517,22.83159503340721,21.43451902270317,21.54544973373413,22.889380514621735,24.168621599674225,21.947510302066803,22.30243694782257,22.381454586982727,22.50485634803772,22.61657750606537,22.288170099258423,21.30070123076439,22.489792048931122,21.885000944137573,21.343613982200623,23.04211688041687,24.00969059765339,21.8588485121727,22.199619591236115]
2018-04-24 23:58:24.992694 n_epoch: 125
每层的打印输出:
conv1:[[[[[[0.0981627107 0.100793235 0.0934509188]]]]]]
max1:[[[[[[[0.102978 0.107030481 0.0977560952]]]]]]
max2:[[[[[0 0 0.00116439909]]]] ...]
重塑:[[0 0 0.00116439909] ...]
fc:[[0.01167579 0.182256863 0.107154548] ...]
fc2:[[0.773868561 0.364259362 0] ...]
输出:[[0.16590938 -0.255491495] [0.16590938] ...]
conv1:[[[[[[[0.0981602222 0.100800745 0.0934513509]]]]]]
max1:[[[[[[[0.102975294 0.107038349 0.0977560282]]]]]]
max2:[[[[[0 0 0.000874094665]]]] ...]
重塑:[[0 0 0.000874094665] ...]
fc:[[0.0117974132 0.182980478 0.106876813] ...]
fc2:[[0.774896204 0.36372292 0] ...]
输出:[[0.129838273 -0.210624188] [0.129838273] ...]
125 个 epoch 难道还不足以(过)拟合 60 个样本吗?
有关正在发生的事情的任何想法?
答案 0 :(得分:0)
这更像是一条评论,只是超出了评论的字数限制。
正如我之前所说,我看不出任何明显的错误。您可能需要进行一些调试。如果 softmax 之前的输出完全相同,那么很可能是某处存在 bug;您可以逐层检查,找出不同的输入从哪一层开始产生相同的输出,从而定位问题。
如果 softmax 之前的输出接近但不完全相同,则很可能是典型的过拟合问题。你提到你只有 300 个训练样本——在不使用预训练权重的情况下,这对于训练整个网络来说太少了。您的网络只是“记住”了这 300 个训练样本,未能泛化到验证集。
编辑 04/23/18: 那么,问题不只出现在验证集上?我把你所说的“如果我对训练集做同样的事情,我会得到一个正常的预测”理解为训练图像能够被很好地分类。如果您对训练图像也得到相同的预测,那么很可能是数据、损失(loss)或预测的计算有误。我没有发现任何问题,所以猜测你需要自己调试。您可能会发现“Eager Execution”对此有用 - https://www.tensorflow.org/get_started/eager。如果您按照示例(https://github.com/tensorflow/tensorflow/tree/3f4662e7ca8724f760db4a5ea6e241c99e66e588/tensorflow/contrib/eager/python/examples)组织模型,则应该能够让同一份代码同时用于 Eager 模式和常规的 TensorFlow 图执行。