Keep getting memory errors when running on a p2.xlarge GPU on Amazon

Date: 2017-05-10 15:42:29

Tags: python memory tensorflow

I'm trying to train a simple neural network with TensorFlow. I have run a similar network on the MNIST dataset without problems, but when I adapt the code to my own data and try to run it on a GPU machine, I keep running out of memory. I have already tried:

- reducing the batch_size
- training for fewer epochs
- commenting out parts of the class
- running the code on a small number of images (10 images instead of ~75k)

Below is my network code, along with the code (not written by me) that gets the data into the right format. Can you spot anything that might be exhausting memory?

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import random
import tensorflow as tf
import gzip
import os
import glob
import csv
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin

class DataSet(object):
  def __init__(self, images, labels, fake_data=False, one_hot=False):
    """Construct a DataSet. one_hot arg is used only if fake_data is true."""
    if fake_data:
      self._num_examples = 10000
      self.one_hot = one_hot
    else:
      assert images.shape[0] == labels.shape[0], (
          'images.shape: %s labels.shape: %s' % (images.shape,
                                                 labels.shape))
      self._num_examples = images.shape[0]
      # This part is commented out because I kept getting memory exhaustion when using the big dataset ~75k images (224,224,3)
      # Convert shape from [num examples, rows, columns, depth]
      # to [num examples, rows*columns] (assuming depth == 1)
      # assert images.shape[3] == 3
      # images = images.reshape(images.shape[0],
      #                         images.shape[1] * images.shape[2] * images.shape[3])
      # # Convert from [0, 255] -> [0.0, 1.0].
      # images = images.astype(np.float32)
      # images = np.multiply(images, 1.0 / 255.0)
    self._images = images
    self._labels = labels
    self._epochs_completed = 0
    self._index_in_epoch = 0
  @property
  def images(self):
    return self._images
  @property
  def labels(self):
    return self._labels
  @property
  def num_examples(self):
    return self._num_examples
  @property
  def epochs_completed(self):
    return self._epochs_completed
  def next_batch(self, batch_size, fake_data=False):
    """Return the next `batch_size` examples from this data set."""
    if fake_data:
      fake_image = [1] * 784
      if self.one_hot:
        fake_label = [1] + [0] * 9
      else:
        fake_label = 0
      return [fake_image for _ in xrange(batch_size)], [
          fake_label for _ in xrange(batch_size)]
    start = self._index_in_epoch
    self._index_in_epoch += batch_size
    if self._index_in_epoch > self._num_examples:
      # Finished epoch
      self._epochs_completed += 1
      # Shuffle the data
      perm = np.arange(self._num_examples)
      np.random.shuffle(perm)
      self._images = self._images[perm]
      self._labels = self._labels[perm]
      # Start next epoch
      start = 0
      self._index_in_epoch = batch_size
      assert batch_size <= self._num_examples
    end = self._index_in_epoch
    return self._images[start:end], self._labels[start:end]
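
# Editor's note: the block commented out in __init__ above exhausts host memory
# because reshape + astype(np.float32) materialises a float32 copy of the whole
# ~75k x 224 x 224 x 3 dataset (~45 GB). A hedged sketch (untested on this data)
# of doing the conversion per batch instead, assuming the images are stored as uint8:
def next_batch_normalized(dataset, batch_size):
  batch_images, batch_labels = dataset.next_batch(batch_size)
  batch_images = batch_images.reshape(batch_images.shape[0], -1).astype(np.float32)
  batch_images *= 1.0 / 255.0  # [0, 255] -> [0.0, 1.0], only on the small batch copy
  return batch_images, batch_labels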

def read_data_sets(train_data, train_labels, test_data, test_labels, fake_data=False, one_hot=False):
  class DataSets(object):
    pass
  data_sets = DataSets()
  if fake_data:
    data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
    data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
    data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
    return data_sets
  print('Training')
  print(train_data.shape)
  print(train_labels.shape)
  print('Test')
  print(test_data.shape)
  print(test_labels.shape)
  data_sets.train = DataSet(train_data, train_labels)
  data_sets.test = DataSet(test_data, test_labels)
  return data_sets

def randomize(a, b):
  assert len(a) == len(b)
  # Generate the permutation index array.
  permutation = np.random.permutation(a.shape[0])
  # Shuffle the arrays by giving the permutation in the square brackets.
  shuffled_a = a[permutation]
  shuffled_b = b[permutation]
  return shuffled_a, shuffled_b
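
# Editor's note: fancy indexing (a[permutation]) copies the whole array, so
# randomize briefly holds two copies of the image data in host RAM. If that
# matters, a common in-place alternative (a sketch, assuming both arrays share
# the same first-axis length) reuses the RNG state so both get the same permutation:
def randomize_inplace(a, b):
  rng_state = np.random.get_state()
  np.random.shuffle(a)            # shuffles a in place along the first axis
  np.random.set_state(rng_state)  # rewind so b gets the identical permutation
  np.random.shuffle(b)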

training_images = np.load('data_small/training_images.npy')
training_labels = np.load('data_small/training_labels.npy')
test_images = np.load('data_small/test_images.npy')
test_labels = np.load('data_small/test_labels.npy')

training_images, training_labels = randomize(training_images, training_labels)

avec = read_data_sets(training_images, training_labels, test_images, test_labels)

batch_size = 1 #53
print ('The batch size is: ',batch_size)

images = tf.placeholder(tf.float32, [None, 224*224*3])
# Kept getting an error when I initially set the placeholder as [-1,224,224,3]
images = tf.reshape(images, [-1,224,224,3])
labels = tf.placeholder(tf.float32, [None, 1])
keep_rate = 0.8
keep_prob = tf.placeholder(tf.float32)
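
# Editor's note (a sketch, not the original author's code): tf.placeholder rejects
# -1 in a shape (unknown dimensions are spelled None), which is likely the error
# mentioned above; and rebinding the name `images` to the tf.reshape output means
# the feed_dict below actually feeds the reshape node rather than the placeholder,
# which happens to work in TF 1.x but is fragile. Declaring the 4-D shape directly
# would avoid both, e.g.:
#   images = tf.placeholder(tf.float32, [None, 224, 224, 3])
# Also note that keep_prob is defined but never used: the dropout layers below use
# the Python constant keep_rate, so dropout stays active even at evaluation time.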

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')

def maxpool2d(x):
    #                        size of window         movement of window
    return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')

weights = {'W_conv1':tf.Variable(tf.truncated_normal([3,3,3,64], stddev=1e-4)),
           'W_conv2':tf.Variable(tf.truncated_normal([3,3,64,64], stddev=1e-4)),
           'W_conv3':tf.Variable(tf.truncated_normal([3,3,64,128], stddev=1e-4)),
           'W_conv4':tf.Variable(tf.truncated_normal([3,3,128,128], stddev=1e-4)),
           'W_conv5':tf.Variable(tf.truncated_normal([3,3,128,256], stddev=1e-4)),
           'W_conv6':tf.Variable(tf.truncated_normal([3,3,256,256], stddev=1e-4)),
           'W_conv7':tf.Variable(tf.truncated_normal([3,3,256,256], stddev=1e-4)),
           'W_fc':tf.Variable(tf.truncated_normal([28*28*256,4096], stddev=1e-4)),
           'W_fc2':tf.Variable(tf.truncated_normal([4096,2622], stddev=1e-4)),
           'reg':tf.Variable(tf.truncated_normal([2622,1], stddev=1e-4))}

biases = {'b_conv1':tf.Variable(tf.constant(0.1, shape=[64])),
           'b_conv2':tf.Variable(tf.constant(0.1, shape=[64])),
           'b_conv3':tf.Variable(tf.constant(0.1, shape=[128])),
           'b_conv4':tf.Variable(tf.constant(0.1, shape=[128])),
           'b_conv5':tf.Variable(tf.constant(0.1, shape=[256])),
           'b_conv6':tf.Variable(tf.constant(0.1, shape=[256])),
           'b_conv7':tf.Variable(tf.constant(0.1, shape=[256])),
           'b_fc':tf.Variable(tf.constant(0.1, shape=[4096])),
           'b_fc2':tf.Variable(tf.constant(0.1, shape=[2622])),
           'b_reg':tf.Variable(tf.constant(0.1, shape=[1]))}
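
# Editor's note: a quick diagnostic (a sketch, assuming float32 variables) for
# where the memory goes; W_fc alone is 28*28*256*4096 = 822,083,584 parameters:
total_params = sum(int(np.prod(w.get_shape().as_list())) for w in weights.values())
print('Total parameters: %d (~%.2f GiB as float32)'
      % (total_params, total_params * 4 / 2.0**30))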

conv1 = tf.nn.relu(conv2d(images, weights['W_conv1']) + biases['b_conv1'])

conv1 = tf.Print(conv1, [conv1], "conv1: ")
conv2 = tf.nn.relu(conv2d(conv1, weights['W_conv2']) + biases['b_conv2'])
conv2 = maxpool2d(conv2)
conv2 = tf.Print(conv2, [conv2], "conv2: ")

conv3 = tf.nn.relu(conv2d(conv2, weights['W_conv3']) + biases['b_conv3'])
conv3 = tf.Print(conv3, [conv3], "conv3: ")

conv4 = tf.nn.relu(conv2d(conv3, weights['W_conv4']) + biases['b_conv4'])
conv4 = maxpool2d(conv4)
conv4 = tf.Print(conv4, [conv4], "conv4: ")

conv5 = tf.nn.relu(conv2d(conv4, weights['W_conv5']) + biases['b_conv5'])
conv5 = tf.Print(conv5, [conv5], "conv5: ")

conv6 = tf.nn.relu(conv2d(conv5, weights['W_conv6']) + biases['b_conv6'])
conv6 = tf.Print(conv6, [conv6], "conv6: ")

conv7 = tf.nn.relu(conv2d(conv6, weights['W_conv7']) + biases['b_conv7'])
conv7 = maxpool2d(conv7)
conv7 = tf.Print(conv7, [conv7], "conv7: ")

fc = tf.reshape(conv7,[-1, 28*28*256])
fc = tf.nn.relu(tf.matmul(fc, weights['W_fc'])+biases['b_fc'])
fc = tf.nn.dropout(fc, keep_rate)

fc2 = tf.matmul(fc, weights['W_fc2'])+biases['b_fc2']
fc2 = tf.nn.dropout(fc2, keep_rate)

pred = tf.add(tf.matmul(fc2, weights['reg']), biases['b_reg'])

loss = tf.reduce_mean(tf.square(pred-labels))
opt = tf.train.RMSPropOptimizer(0.001)
train_op = opt.minimize(loss)
hm_epochs = 5
print ('Total epochs: ', hm_epochs)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
  print('Begin session')
  sess.run(init_op) # initialize all variables
  for epoch in range(hm_epochs):
    print('Begin epoch:',epoch)
    epoch_loss = 0
    for _ in range(int(avec.train.num_examples/batch_size)):
      # batch the images and labels
      np_images, np_labels = avec.train.next_batch(batch_size)
      print('np_images shape:',np_images.shape)
      print('np_labels shape:',np_labels.shape)
      # put the batch into my feed_dict
      feed = {images: np_images, labels: np_labels}
      # the training step: run loss, pred and train_op, with the data fed through the feed_dict
      np_loss, np_pred, _ = sess.run([loss, pred, train_op], feed_dict=feed)
      print('np_labels:',np_labels)
      print('np_pred:',np_pred)
      print('np_loss:',np_loss)

      epoch_loss += np_loss
    print ('Epoch', epoch+1, 'completed out of', hm_epochs, 'loss: ', epoch_loss/(avec.train.num_examples/batch_size))
  #save_path = saver.save(sess, "model1.ckpt")
  #print("Model saved in file: %s" % save_path)

Here is the error that is produced. It looks like most of the memory is allocated to three chunks (two totalling 6.12 GiB and one of 4.33 GiB):

W tensorflow/core/common_runtime/bfc_allocator.cc:274] *****************************************************************************************xxxxxxxxxxx
W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 1.0KiB.  See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:993] Resource exhausted: OOM when allocating tensor with shape[256]
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (256):       Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (512):       Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (1024):      Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (2048):      Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (4096):      Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (8192):      Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (16384):     Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (32768):     Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (65536):     Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (131072):    Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (262144):    Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (524288):    Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (1048576):   Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (2097152):   Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (4194304):   Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (8388608):   Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (16777216):  Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (33554432):  Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (67108864):  Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (134217728):         Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:643] Bin (268435456):         Total Chunks: 0, Chunks in use: 0 0B allocated for chunks. 0B client-requested for chunks. 0B in use in bin. 0B client-requested in use in bin.
I tensorflow/core/common_runtime/bfc_allocator.cc:660] Bin for 1.0KiB was 1.0KiB, Chunk State:
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e0000 of size 1280
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e0500 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e0600 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e0700 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e0800 of size 512
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e0a00 of size 1024
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e0e00 of size 16384
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e4e00 of size 10496
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e7700 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e7800 of size 6912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050e9300 of size 6912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12050eae00 of size 147456
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x120510ee00 of size 147456
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1205132e00 of size 294912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x120517ae00 of size 294912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12051c2e00 of size 589824
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1205252e00 of size 589824
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12052e2e00 of size 1179648
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1205402e00 of size 1179648
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1205522e00 of size 2359296
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1205762e00 of size 2359296
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12059a2e00 of size 3288334336
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x12c99a2e00 of size 3288334336
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x138d9a2e00 of size 42958848
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x139029ae00 of size 42958848
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b92e00 of size 10496
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b95700 of size 10496
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b98000 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b98100 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b98200 of size 512
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b98400 of size 512
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b98600 of size 1024
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b98a00 of size 1024
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b98e00 of size 16384
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392b9ce00 of size 16384
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba0e00 of size 10496
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba3700 of size 10496
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba6000 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba6100 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba6200 of size 10496
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba8b00 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba8c00 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba8d00 of size 512
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba8f00 of size 512
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba9100 of size 1024
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba9500 of size 1024
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba9900 of size 1024
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ba9d00 of size 16384
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392badd00 of size 10496
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392bb0600 of size 256
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392bb0700 of size 6912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392bb2200 of size 6912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392bb3d00 of size 147456
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392bd7d00 of size 147456
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392bfbd00 of size 294912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392c43d00 of size 294912
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392c8bd00 of size 589824
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392d1bd00 of size 589824
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392dabd00 of size 1179648
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392ecbd00 of size 1179648
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x1392febd00 of size 2359296
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x139322bd00 of size 2359296
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x139346bd00 of size 2359296
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x13936abd00 of size 2359296
I tensorflow/core/common_runtime/bfc_allocator.cc:678] Chunk at 0x13938ebd00 of size 4646899456
I tensorflow/core/common_runtime/bfc_allocator.cc:693]      Summary of in-use Chunks by size:
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 11 Chunks of size 256 totalling 2.8KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 5 Chunks of size 512 totalling 2.5KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 6 Chunks of size 1024 totalling 6.0KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 1 Chunks of size 1280 totalling 1.2KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 4 Chunks of size 6912 totalling 27.0KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 7 Chunks of size 10496 totalling 71.8KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 4 Chunks of size 16384 totalling 64.0KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 4 Chunks of size 147456 totalling 576.0KiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 4 Chunks of size 294912 totalling 1.12MiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 4 Chunks of size 589824 totalling 2.25MiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 4 Chunks of size 1179648 totalling 4.50MiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 6 Chunks of size 2359296 totalling 13.50MiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 2 Chunks of size 42958848 totalling 81.94MiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 2 Chunks of size 3288334336 totalling 6.12GiB
I tensorflow/core/common_runtime/bfc_allocator.cc:696] 1 Chunks of size 4646899456 totalling 4.33GiB
I tensorflow/core/common_runtime/bfc_allocator.cc:700] Sum Total of in-use chunks: 10.55GiB
I tensorflow/core/common_runtime/bfc_allocator.cc:702] Stats:
Limit:                 11332668621
InUse:                 11332668416
MaxInUse:              11332668416
NumAllocs:                      65
MaxAllocSize:           4646899456

W tensorflow/core/common_runtime/bfc_allocator.cc:274] *****************************************************************************************xxxxxxxxxxx
W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 1.0KiB.  See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:993] Resource exhausted: OOM when allocating tensor with shape[256]

1 Answer:

Answer 0 (score: 0)

Computing the memory the model needs:

INPUT: [224x224x3]    memory:  224*224*3=150K   weights: 0
CONV1: [224x224x64]   memory:  224*224*64=3.2M  weights: (3*3*3)*64 = 1,728
CONV2: [224x224x64]   memory:  224*224*64=3.2M   weights: (3*3*64)*64 = 36,864
POOL2: [112x112x64]   memory:  112*112*64=800K   weights: 0

CONV3: [112x112x128]  memory:  112*112*128=1.6M   weights: (3*3*64)*128 = 73,728
CONV4: [112x112x128]  memory:  112*112*128=1.6M   weights: (3*3*128)*128 = 147,456
POOL4: [56x56x128]    memory:  56*56*128=400K     weights: 0

CONV5: [56x56x256]   memory:  56*56*256=800K   weights: (3*3*128)*256 = 294,912
CONV6: [56x56x256]   memory:  56*56*256=800K   weights: (3*3*256)*256 = 589,824
CONV7: [56x56x256]   memory:  56*56*256=800K   weights: (3*3*256)*256 = 589,824
POOL7: [28x28x256]   memory:  28*28*256=200K   weights: 0

FC1: [1x1x4096]     memory:  4096  weights: 28*28*256*4096 = 822,083,584
FC2: [1x1x2622]     memory:  2622  weights: 4096*2622 = 10,739,712

TOTAL memory: 14M * 4 bytes ~= 60MB / image (only forward)
TOTAL memory: 2 * 14M * 4 bytes ~= 120MB / image (forward + backward)

Batch_Size = 128
TOTAL memory: 128 * 14M * 4 bytes ~= 8GB (batch-forward)
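
That estimate counts only activations, though. The weights are dominated by FC1; a hedged back-of-the-envelope (assuming float32 everywhere) that lines up with the allocator dump in the question:

fc1_params = 28*28*256*4096   # 822,083,584 weights in W_fc
print(fc1_params * 4)         # 3288334336 bytes ~= 3.06 GiB

That is exactly the size of the two big in-use chunks above (2 x 3288334336 = 6.12 GiB), and RMSProp keeps per-variable slot state of the same shape, so the variable, its slots and its gradient can approach four ~3 GiB copies, consistent with InUse hitting the ~10.55 GiB limit.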

A p2.xlarge GPU has 12 GB, I believe, so the model itself should not be able to throw this error. Could you post the error here so we can look into the possible causes?

Also, what are the sizes of these files: data_small/training_images.npy and data_small/testing_images.npy?
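
If it helps, the .npy headers can be inspected without loading the arrays into memory (a sketch, assuming the paths from the question):

import numpy as np
for path in ('data_small/training_images.npy', 'data_small/test_images.npy'):
  arr = np.load(path, mmap_mode='r')  # memory-maps the file instead of reading it in
  print(path, arr.shape, arr.dtype, '%.2f GB' % (arr.nbytes / 1e9))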