I am trying to perform binary classification using the code from this tutorial: https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.py
print("Loading data...")
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")
print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable,
                                 maxlen=sentence_size,
                                 padding='post',
                                 value=0)
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                padding='post',
                                value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(x_train_variable))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()
def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()
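parser is also used but not defined in this excerpt; in the tutorial it simply packs the padded tokens and their true length into the features dict that the model_fn reads. A minimal sketch:

def parser(x, length, y):
    # Pack inputs into the features dict cnn_model_fn expects:
    # features['x'] holds the padded tokens, 'len' the true sequence length.
    features = {"x": x, "len": length}
    return features, y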
def cnn_model_fn(features, labels, mode, params):
    input_layer = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        initializer=params['embedding_initializer'])

    training = mode == tf.estimator.ModeKeys.TRAIN
    dropout_emb = tf.layers.dropout(inputs=input_layer,
                                    rate=0.2,
                                    training=training)
    conv = tf.layers.conv1d(
        inputs=dropout_emb,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)

    # Global Max Pooling
    pool = tf.reduce_max(input_tensor=conv, axis=1)

    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)
    dropout_hidden = tf.layers.dropout(inputs=hidden,
                                       rate=0.2,
                                       training=training)
    logits = tf.layers.dense(inputs=dropout_hidden, units=1)

    # This will be None when predicting
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        return optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())

    return head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        train_op_fn=_train_op_fn)
cnn_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                        model_dir=os.path.join(model_dir, 'cnn'),
                                        params=params)
train_and_evaluate(cnn_classifier)
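head and train_and_evaluate are likewise defined elsewhere in the tutorial, not in this excerpt. Roughly, head is a binary classification head from tf.contrib, and train_and_evaluate alternates training and evaluation. A simplified stand-in (the tutorial's actual helper also handles logging and multiple classifiers):

head = tf.contrib.estimator.binary_classification_head()

def train_and_evaluate(classifier):
    # Simplified stand-in for the tutorial's helper.
    classifier.train(input_fn=train_input_fn, steps=25000)
    classifier.evaluate(input_fn=eval_input_fn)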
The example here loads its data from the IMDB movie reviews dataset. I have my own dataset in text form, roughly 2 GB in size. In this example, the line

(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(num_words=vocab_size)

loads the entire dataset into memory. If I try to do the same, I run out of memory. How can I restructure this logic to read data from disk in batches?
Answer 0 (score: 1)
You want to change the line

dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))

There are many ways to create a dataset - from_tensor_slices is the easiest, but it won't work on its own if you can't load the entire dataset into memory.

The best approach depends on how your data is stored, or how you want to store/manipulate it. The simplest in my opinion, with minimal side effects (unless you're running on multiple GPUs), is to have the original dataset just provide indices into the data, and to write a plain numpy function that loads the i-th example.
dataset = tf.data.Dataset.from_tensor_slices(tf.range(epoch_size))

def tf_map_fn(i):
    def np_map_fn(i):
        return load_ith_example(i)

    inp1, inp2 = tf.py_func(np_map_fn, (i,), Tout=(tf.float32, tf.float32),
                            stateful=False)
    # other preprocessing/data augmentation goes here.

    # unbatched sizes
    inp1.set_shape(shape1)
    inp2.set_shape(shape2)
    return inp1, inp2

dataset = dataset.repeat().shuffle(epoch_size).map(tf_map_fn, num_parallel_calls=8)
dataset = dataset.batch(batch_size)
dataset = dataset.prefetch(1)  # start loading data as GPU trains on previous batch

inp1, inp2 = dataset.make_one_shot_iterator().get_next()
Here I assume your outputs are float32 tensors (that is what the Tout=... argument says). The set_shape calls aren't strictly necessary, but if you know the shapes, specifying them gives better error checking.

As long as your preprocessing doesn't take longer than a step of your network, this should run as fast as any other method on a single-GPU machine.
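load_ith_example is yours to write, since it depends on how your 2 GB of text is stored on disk. One hypothetical sketch, assuming each example was preprocessed into a pair of .npy files (the paths and format are made up - adapt them to your storage):

import numpy as np

def load_ith_example(i):
    # Hypothetical loader: reads the i-th preprocessed example from disk,
    # matching the (tf.float32, tf.float32) Tout declared above.
    x = np.load("data/features_%d.npy" % i).astype(np.float32)
    y = np.load("data/labels_%d.npy" % i).astype(np.float32)
    return x, y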
The other obvious option is to convert your data to tfrecords, but that takes up more disk space and is, if you ask me, harder to manage.
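For reference, the tfrecords route would look roughly like this (a sketch with made-up field and file names, using the same TF 1.x API as the rest of this answer):

# One-time conversion: serialize each example to a record file on disk.
writer = tf.python_io.TFRecordWriter("train.tfrecord")
for x, y in raw_examples:  # your own iterator over (tokens, label) pairs
    ex = tf.train.Example(features=tf.train.Features(feature={
        "x": tf.train.Feature(int64_list=tf.train.Int64List(value=x)),
        "y": tf.train.Feature(int64_list=tf.train.Int64List(value=[y])),
    }))
    writer.write(ex.SerializeToString())
writer.close()

# At training time, stream records from disk instead of holding them in memory.
def tfrecord_parser(record):
    parsed = tf.parse_single_example(record, {
        "x": tf.FixedLenFeature([sentence_size], tf.int64),
        "y": tf.FixedLenFeature([1], tf.int64),
    })
    return parsed["x"], parsed["y"]

dataset = tf.data.TFRecordDataset("train.tfrecord").map(tfrecord_parser)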