I am using a TensorFlow MLP to train on the CIFAR-100 Python dataset. Can someone help me feed batch_ys into the y placeholder and get the code to run? When I execute it, this is what I currently get (I am not sure whether there is more to it), and Windows 10 reports "Python has stopped working". Here is the code (8-3.py):
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from read import unpickle
dir = os.path.dirname(os.path.realpath(__file__))
from read_label import read_label
current_batch = 0
t1 = time.time()
# Load MNIST Data
from tensorflow.examples.tutorials.mnist import input_data
#mnist = input_data.read_data_sets(dir + "/MNIST_data/", one_hot=True)
# Learning Parameters
learning_rate = 0.001
training_epochs = 1500
batch_size = 5500
display_step = 1
# Network Parameters
n_hidden_1 = 1024 # 1st layer num features
n_hidden_2 = 1024 # 2nd layer num features
n_hidden_3 = 1024
n_hidden_4 = 1024
n_input = 3072 # CIFAR-100 data input (img shape: 32*32*3)
n_classes = 100 # CIFAR-100 total classes
# tf Graph input
x = tf.placeholder("float", [None, 3072])
y = tf.placeholder("float", [None, 100])
#weights layer 1
h = tf.Variable(tf.random_normal([n_input, n_hidden_1]))
#bias layer 1
bias_layer_1 = tf.Variable(tf.random_normal([n_hidden_1]))
#layer 1
layer_1 = tf.nn.relu(tf.add(tf.matmul(x,h),bias_layer_1))
#weights layer 2
w = tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2]))
#bias layer 2
bias_layer_2 = tf.Variable(tf.random_normal([n_hidden_2]))
#layer 2
layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1,w),bias_layer_2))
h1 = tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3]))
bias_layer_3 = tf.Variable(tf.random_normal([n_hidden_3]))
layer_3 = tf.nn.relu(tf.add(tf.matmul(layer_2, h1), bias_layer_3))
w1 = tf.Variable(tf.random_normal([n_hidden_3, n_hidden_4]))
bias_layer_4 = tf.Variable(tf.random_normal([n_hidden_4]))
layer_4 = tf.nn.relu(tf.add(tf.matmul(layer_3, w1), bias_layer_4))
#weights output layer
output = tf.Variable(tf.random_normal([n_hidden_4, n_classes]))
#bias output layer
bias_output = tf.Variable(tf.random_normal([n_classes]))
#output layer
output_layer = tf.matmul(layer_4, output) + bias_output
# cost function
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output_layer, y))
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output_layer, labels=y))
# optimizer
optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)
#Plot settings
avg_set = []
epoch_set=[]
# Initializing the variables
# init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.0
        # trchou
        total_batch = int(50000/batch_size)
        # total_batch = 2000
        # Loop over all batches
        for i in range(total_batch):
            # trchou
            #batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            batch_xs = unpickle('train')[b'data'][current_batch:batch_size, :]
            #batch_ys = read_label('train')[current_batch:batch_size]
            batch_ys = tf.one_hot(read_label('train')[current_batch:batch_size], depth=100, dtype="float").eval()
            print(x)
            print(batch_ys)
            #print(read_label('train').shape)
            # Fit training using batch data
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            # Compute average loss
            avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch
            current_batch += batch_size
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
        avg_set.append(avg_cost)
        epoch_set.append(epoch+1)
        '''
        if(cost == 0.000000000):
            print("The cost value of this training has reached 0, exit? (y/n)")
            a = input()
            if(a == 'y'):
                print("You chose to break it.")
                break
            elif(a == 'n'):
                print("Training will continue.")
        '''
    t2 = time.time()
    t_min = int((t2-t1)/60)
    t_sec = int((t2-t1)%60)
    print("Training phase finished, time elapsed {:d}min {:d}secs.".format(t_min, t_sec))
    # Plot the learning curve
    plt.plot(epoch_set,avg_set, 'o', label='MLP Training phase')
    plt.ylabel('cost')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()
    # Save the model after learning
    model_saver = tf.train.Saver()
    model_saver.save(sess, "C:/cifar-model/my_model_mlp.chkp")
    # Testing cycle
    correct_prediction = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Model accuracy:", accuracy.eval({x: unpickle('test')[b'data'], y: read_label('test')}))
'''
# Restore model & testing
with tf.Session() as sess:
    model_saver.restore(sess, "C:/model-batchsize_55000_epoch_500_4_hiddens_learningrate_0.001/my_model_mlp.chkp")
    print("Model restored.")
    print("Initialized")
    # Test model
    correct_prediction = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Model accuracy:", accuracy.eval({x: batch_xs, y: batch_ys}))
'''
Traceback:
C:\Users\Administrator\learn_tensorflow\cifar-100-python>python 8-3.py
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135]
successfully opened CUDA library cublas64_80.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135]
successfully opened CUDA library cudnn64_5.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135]
successfully opened CUDA library cufft64_80.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135]
successfully opened CUDA library nvcuda.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135]
successfully opened CUDA library curand64_80.dll locally
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "BestSplits" device_type: "CPU"') for unknown op: BestSplits
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "CountExtremelyRandomStats" device_type: "CPU"') for unknown op: CountExtremelyRandomStats
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "FinishedNodes" device_type: "CPU"') for unknown op: FinishedNodes
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "GrowTree" device_type: "CPU"') for unknown op: GrowTree
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "ReinterpretStringToFloat" device_type: "CPU"') for unknown op: ReinterpretStringToFloat
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "SampleInputs" device_type: "CPU"') for unknown op: SampleInputs
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "ScatterAddNdim" device_type: "CPU"') for unknown op: ScatterAddNdim
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel ('op: "TopNInsert" device_type: "CPU"') for unknown op: TopNInsert
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "TopNRemove" device_type: "CPU"') for unknown op: TopNRemove
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "TreePredictions" device_type: "CPU"') for unknown op: TreePredictions
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943]
OpKernel ('op: "UpdateFertileSlots" device_type: "CPU"') for unknown op: UpdateFertileSlots
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:885] Found device 0 with properties:
name: GeForce GT 730
major: 3 minor: 5 memoryClockRate (GHz) 0.9015
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.66GiB
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:906] DMA: 0
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:916] 0: Y
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 730, pci bus id: 0000:01:00.0)
<class 'list'>
Tensor("Placeholder:0", shape=(?, 3072), dtype=float32)
<class 'list'>
Tensor("Placeholder:0", shape=(?, 3072), dtype=float32)
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\bfc_allocator.cc:244] tried to allocate 0 bytes
W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\allocator_retry.cc:32] Request to allocate 0 bytes
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:104] EigenAllocator for GPU ran out of memory when allocating 0. See error logs for more detailed info.
C:\Users\Administrator\learn_tensorflow\cifar-100-python>
Answer 0 (score: 0)
Out of memory: it means you are trying to push too much data through in each step.
That is because your batch_size is too high. Try a small value (say 32) and see whether it works; for performance you can try larger values afterwards.
Batching exists to improve performance by doing operations in parallel; in other words, you load more data per step to go faster. But loading more data has a memory cost: it all has to fit in GPU RAM.
You have two scenarios:
- batch_size
- batch_size
If you reach the point where batch_size=1 still triggers OOM, then your GPU simply does not have enough RAM.
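To make the batch-size suggestion concrete, here is a minimal sketch of the inner training loop with a much smaller batch_size. It is an illustration under assumptions, not the asker's code: it assumes unpickle('train')[b'data'] and read_label('train') return the full 50000-sample training arrays, loads them once outside the loop, and slices start:start + batch_size so every step feeds a fresh, equally sized batch (the posted code's [current_batch:batch_size] slice shrinks after the first step).

# Sketch only: drop-in replacement for the inner batch loop, to be run inside
# the existing `with tf.Session() as sess:` block. Helper names (unpickle,
# read_label) are the asker's; the smaller batch_size is the suggested change.
batch_size = 32                                   # the small value suggested above
train_x = unpickle('train')[b'data']              # load the full training set once, not per batch
train_y = np.asarray(read_label('train'))
total_batch = len(train_x) // batch_size
for i in range(total_batch):
    start = i * batch_size
    batch_xs = train_x[start:start + batch_size, :]
    # one-hot encode with NumPy so no new graph ops are created inside the loop
    batch_ys = np.eye(n_classes, dtype=np.float32)[train_y[start:start + batch_size]]
    sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})

At batch_size = 32 each feed is roughly 32 * 3072 * 4 bytes (about 0.4 MB), so if a step still runs out of memory the pressure is coming from the graph itself rather than from the batch.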
Answer 1 (score: 0)
Try commenting out the model_saver.save(sess, "C:/cifar-model/my_model_mlp.chkp") line and check again. I ran into the same problem when using saver.save, but without that call the code ran fine.
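If you still want a checkpoint once training runs, a hedged variant of this workaround (my sketch, not part of either answer) is to build the Saver up front and guard the save call behind a flag, creating the target directory first so the path definitely exists:

save_model = False                      # hypothetical toggle: leave False while debugging the crash
model_saver = tf.train.Saver()          # construct the Saver once, after the variables are defined

# ... later, inside the session, after the training loop ...
if save_model:
    ckpt_dir = "C:/cifar-model"
    os.makedirs(ckpt_dir, exist_ok=True)    # create the directory up front; Saver.save will not create it
    model_saver.save(sess, ckpt_dir + "/my_model_mlp.chkp")

Flipping save_model to True only after the OOM crash is solved keeps the two problems separate while testing.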