In NLP we usually pad a batch of sequences. Here is the padding function:
import tensorflow as tf

def pad_sequences(sequences, pad_tok=0):
    """
    Args:
        sequences: an iterable (possibly a generator) of lists or tuples
        pad_tok: the token to pad with
    Returns:
        a 2-D array in which every row has the same length
        the maximum sentence length in the batch
    """
    # materialize first, so a generator can be traversed more than once
    sequences = list(sequences)
    sequence_padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding='post', value=pad_tok)
    max_sen_len = max((len(seq) for seq in sequences), default=0)
    return sequence_padded, max_sen_len
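For example, on a toy batch (invented ids):

batch = [[12, 5, 301], [7, 42], [9, 9, 13, 2]]
padded, maxlen = pad_sequences(batch)
# padded -> [[ 12   5 301   0]
#            [  7  42   0   0]
#            [  9   9  13   2]]
# maxlen -> 4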
The input is a batch of variable-length sentences. Each sentence is a list of ids, and each id indexes a word in the vocabulary.
Here is the feed dict:
word_ids_left, maxlen_left = pad_sequences(word_ids_left)
pos1_ids_left, _ = pad_sequences(pos1_ids_left, pad_tok=499)
pos2_ids_left, _ = pad_sequences(pos2_ids_left, pad_tok=499)
word_ids_mid, maxlen_mid = pad_sequences(word_ids_mid)
pos1_ids_mid, _ = pad_sequences(pos1_ids_mid, pad_tok=499)
pos2_ids_mid, _ = pad_sequences(pos2_ids_mid, pad_tok=499)
word_ids_right, maxlen_right = pad_sequences(word_ids_right)
pos1_ids_right, _ = pad_sequences(pos1_ids_right, pad_tok=499)
pos2_ids_right, _ = pad_sequences(pos2_ids_right, pad_tok=499)

# build feed dictionary
feed = {
    self.word_ids_left: word_ids_left,
    self.pos1_ids_left: pos1_ids_left,
    self.pos2_ids_left: pos2_ids_left,
    self.word_ids_mid: word_ids_mid,
    self.pos1_ids_mid: pos1_ids_mid,
    self.pos2_ids_mid: pos2_ids_mid,
    self.word_ids_right: word_ids_right,
    self.pos1_ids_right: pos1_ids_right,
    self.pos2_ids_right: pos2_ids_right,
    self.maxlen_left: maxlen_left,
    self.maxlen_mid: maxlen_mid,
    self.maxlen_right: maxlen_right,
}
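Note that each maxlen_* fed here is a plain Python int (the length of the longest sentence in that batch), while the id matrices are 2-D arrays of shape (batch_size, maxlen_*).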
I prefer to build the embeddings with a dynamic maximum sentence length for each batch. Here is my embedding function:
def add_sentence_embeddings_op(self, word_ids, pos1_ids, pos2_ids, maxlen):
    """Defines sentence_embeddings.

    If self.config.embeddings is not None and is a np array initialized
    with pre-trained word vectors, the word embeddings are just a look-up
    (whether they are trained is controlled by
    self.config.train_word_embeddings). Otherwise, a random matrix with
    the correct shape is initialized.
    """
    with tf.variable_scope("words", reuse=tf.AUTO_REUSE):
        if self.config.embeddings is None:
            self.logger.info("WARNING: randomly initializing word vectors")
            _word_embeddings = tf.get_variable(
                name="_word_embeddings",
                dtype=tf.float32,
                shape=[self.config.nwords, self.config.dim_word])
        else:
            _word_embeddings = tf.Variable(
                self.config.embeddings,
                name="_word_embeddings",
                dtype=tf.float32,
                trainable=self.config.train_word_embeddings)
        word_embeddings = tf.nn.embedding_lookup(
            _word_embeddings, word_ids, name="word_embeddings")

    with tf.variable_scope("pos1", reuse=tf.AUTO_REUSE):
        self.logger.info("randomly initializing pos1 vectors")
        _pos1_embeddings = tf.get_variable(
            name="_pos1_embeddings",
            dtype=tf.float32,
            shape=[self.config.nposition, self.config.dim_pos])
        pos1_embeddings = tf.nn.embedding_lookup(
            _pos1_embeddings, pos1_ids, name="pos1_embeddings")

    with tf.variable_scope("pos2", reuse=tf.AUTO_REUSE):
        self.logger.info("randomly initializing pos2 vectors")
        _pos2_embeddings = tf.get_variable(
            name="_pos2_embeddings",
            dtype=tf.float32,
            shape=[self.config.nposition, self.config.dim_pos])
        pos2_embeddings = tf.nn.embedding_lookup(
            _pos2_embeddings, pos2_ids, name="pos2_embeddings")

    # static shape checks: the three lookups share (batch, time) and
    # have the expected embedding dimensions
    word_emb_shape = word_embeddings.get_shape().as_list()
    pos1_emb_shape = pos1_embeddings.get_shape().as_list()
    pos2_emb_shape = pos2_embeddings.get_shape().as_list()
    assert word_emb_shape[0] == pos1_emb_shape[0] == pos2_emb_shape[0]
    assert word_emb_shape[1] == pos1_emb_shape[1] == pos2_emb_shape[1]
    assert word_emb_shape[2] == self.config.dim_word
    assert pos1_emb_shape[2] == self.config.dim_pos
    assert pos2_emb_shape[2] == self.config.dim_pos

    # concatenate along the feature axis: dim = dim_word + 2 * dim_pos
    sentence_embeddings = tf.concat(
        [word_embeddings, pos1_embeddings, pos2_embeddings], 2)
    sen_emb_shape = sentence_embeddings.get_shape().as_list()
    assert sen_emb_shape[2] == self.config.dim

    # (batch_size, max length of sentences in batch, vector representation dimension, 1)
    sentence_embeddings = tf.reshape(
        sentence_embeddings, [-1, maxlen, self.config.dim, 1])
    return sentence_embeddings
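To make the intent concrete (hypothetical dimensions): with dim_word = 50 and dim_pos = 5, so that self.config.dim = 60, a batch of 32 sentences padded to length 17 should give sentence_embeddings the shape (32, 17, 60, 1).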
Unfortunately, the tf.reshape step does not work:
sentence_embeddings = tf.reshape(sentence_embeddings, [-1, maxlen, self.config.dim, 1])
This is the error raised when the graph is built:
Traceback (most recent call last):
File "train.py", line 26, in <module>
main()
File "train.py", line 12, in main
model.build()
File "/Users/randypen/Code/test_pcnn/model/pcnn_model.py", line 295, in build
self.add_concat_op()
File "/Users/randypen/Code/test_pcnn/model/pcnn_model.py", line 243, in add_concat_op
self.pos1_ids_left, self.pos2_ids_left, self.maxlen_left)
File "/Users/randypen/Code/test_pcnn/model/pcnn_model.py", line 202, in add_sentence_embeddings_op
sentence_embeddings = tf.reshape(sentence_embeddings, [-1, maxlen, self.config.dim, 1])
File "/Users/randypen/.virtualenvs/DataEnv/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 6113, in reshape
"Reshape", tensor=tensor, shape=shape, name=name)
File "/Users/randypen/.virtualenvs/DataEnv/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 528, in _apply_op_helper
(input_name, err))
ValueError: Tried to convert 'shape' to a tensor and failed. Error: Shapes must be equal rank, but are 1 and 0
From merging shape 1 with other shapes. for 'Reshape/packed' (op: 'Pack') with input shapes: [], [1], [], [].
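The "input shapes: [], [1], [], []" at the end suggests that the maxlen placeholder arrives as a rank-1 tensor while the other three shape entries are scalars. A minimal sketch that reproduces the same error under that assumption (the shape=[1] declaration is my guess, not the actual model code):

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, None, 60])
maxlen = tf.placeholder(tf.int32, shape=[1])  # rank 1, not a scalar
y = tf.reshape(x, [-1, maxlen, 60, 1])        # raises the same ValueError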
I have looked at several related repositories, and some of them define the maximum length as a constant. Is it possible in TensorFlow to reshape a batch of tensors with a dynamic maximum length?