我必须使用tf.data.Dataset
为Tensorflow中的RNN模型创建输入管道。我正在提供一个基本代码,通过该代码我需要使用填充令牌批量填充数据,并将其用于进一步的操作。
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen,(tf.int32,tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
print(sess.run(iter2.get_next()))
我希望代码能自我解释并带有注释。但是我遇到以下错误,
InvalidArgumentError (see above for traceback): Cannot batch tensors with different shapes in component 0. First element had shape [11] and element 1 had shape [12].
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,?], [?]], output_types=[DT_INT32, DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Iterator)]]
答案 0 :(得分:1)
我相信,由于您的生成器产生两个输出,因此padded_shapes
和padded_values
元组的长度必须为两个。对我来说,这可行:
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111), tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
答案 1 :(得分:0)
最后得到了答案。问题是第二个填充形状而不是Tensorshape([None]),我们应该提供[],因为生成器返回的第二个项是标量。如果使用Tensorshape([None]),请确保我们返回的向量
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32), (tf.TensorShape([None]), []))
padded_shapes = (tf.TensorShape([None]), []) # sentence of unknown size
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=(-111, 0))
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
sess.run(iter2.get_next())