In NLP we usually pad a batch of sequences. Here is the padding function:
import tensorflow as tf

def pad_sequences(sequences, pad_tok=0):
    """
    Args:
        sequences: an iterable (possibly a generator) of lists or tuples
        pad_tok: the token to pad with
    Returns:
        a 2-D array in which every row has the same length
        the maximum sentence length in the batch
    """
    # materialize first, so a generator can be traversed more than once
    sequences = list(sequences)
    sequence_padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding='post', value=pad_tok)
    max_sen_len = max((len(seq) for seq in sequences), default=0)
    return sequence_padded, max_sen_len
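For example, on a toy batch (invented ids):

batch = [[12, 5, 301], [7, 42], [9, 9, 13, 2]]
padded, maxlen = pad_sequences(batch)
# padded -> [[ 12   5 301   0]
#            [  7  42   0   0]
#            [  9   9  13   2]]
# maxlen -> 4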
The input is a batch of variable-length sentences. Each sentence is a list of ids, and each id indexes a word in the vocabulary.
Here is the feed dict:
word_ids_left, maxlen_left = pad_sequences(word_ids_left)
pos1_ids_left, _ = pad_sequences(pos1_ids_left, pad_tok=499)
pos2_ids_left, _ = pad_sequences(pos2_ids_left, pad_tok=499)
word_ids_mid, maxlen_mid = pad_sequences(word_ids_mid)
pos1_ids_mid, _ = pad_sequences(pos1_ids_mid, pad_tok=499)
pos2_ids_mid, _ = pad_sequences(pos2_ids_mid, pad_tok=499)
word_ids_right, maxlen_right = pad_sequences(word_ids_right)
pos1_ids_right, _ = pad_sequences(pos1_ids_right, pad_tok=499)
pos2_ids_right, _ = pad_sequences(pos2_ids_right, pad_tok=499)

# build feed dictionary
feed = {
    self.word_ids_left: word_ids_left,
    self.pos1_ids_left: pos1_ids_left,
    self.pos2_ids_left: pos2_ids_left,
    self.word_ids_mid: word_ids_mid,
    self.pos1_ids_mid: pos1_ids_mid,
    self.pos2_ids_mid: pos2_ids_mid,
    self.word_ids_right: word_ids_right,
    self.pos1_ids_right: pos1_ids_right,
    self.pos2_ids_right: pos2_ids_right,
    self.maxlen_left: maxlen_left,
    self.maxlen_mid: maxlen_mid,
    self.maxlen_right: maxlen_right,
}
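Note that each maxlen_* fed here is a plain Python int (the length of the longest sentence in that batch), while the id matrices are 2-D arrays of shape (batch_size, maxlen_*).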
I prefer to build the embeddings with a dynamic maximum sentence length for each batch. Here is my embedding function:
def add_sentence_embeddings_op(self, word_ids, pos1_ids, pos2_ids, maxlen):
    """Defines sentence_embeddings.

    If self.config.embeddings is not None and is a np array initialized
    with pre-trained word vectors, the word embeddings are just a look-up
    (whether they are trained is controlled by
    self.config.train_word_embeddings). Otherwise, a random matrix with
    the correct shape is initialized.
    """
    with tf.variable_scope("words", reuse=tf.AUTO_REUSE):
        if self.config.embeddings is None:
            self.logger.info("WARNING: randomly initializing word vectors")
            _word_embeddings = tf.get_variable(
                name="_word_embeddings",
                dtype=tf.float32,
                shape=[self.config.nwords, self.config.dim_word])
        else:
            _word_embeddings = tf.Variable(
                self.config.embeddings,
                name="_word_embeddings",
                dtype=tf.float32,
                trainable=self.config.train_word_embeddings)
        word_embeddings = tf.nn.embedding_lookup(
            _word_embeddings, word_ids, name="word_embeddings")

    with tf.variable_scope("pos1", reuse=tf.AUTO_REUSE):
        self.logger.info("randomly initializing pos1 vectors")
        _pos1_embeddings = tf.get_variable(
            name="_pos1_embeddings",
            dtype=tf.float32,
            shape=[self.config.nposition, self.config.dim_pos])
        pos1_embeddings = tf.nn.embedding_lookup(
            _pos1_embeddings, pos1_ids, name="pos1_embeddings")

    with tf.variable_scope("pos2", reuse=tf.AUTO_REUSE):
        self.logger.info("randomly initializing pos2 vectors")
        _pos2_embeddings = tf.get_variable(
            name="_pos2_embeddings",
            dtype=tf.float32,
            shape=[self.config.nposition, self.config.dim_pos])
        pos2_embeddings = tf.nn.embedding_lookup(
            _pos2_embeddings, pos2_ids, name="pos2_embeddings")

    # static shape checks: the three lookups share (batch, time) and
    # have the expected embedding dimensions
    word_emb_shape = word_embeddings.get_shape().as_list()
    pos1_emb_shape = pos1_embeddings.get_shape().as_list()
    pos2_emb_shape = pos2_embeddings.get_shape().as_list()
    assert word_emb_shape[0] == pos1_emb_shape[0] == pos2_emb_shape[0]
    assert word_emb_shape[1] == pos1_emb_shape[1] == pos2_emb_shape[1]
    assert word_emb_shape[2] == self.config.dim_word
    assert pos1_emb_shape[2] == self.config.dim_pos
    assert pos2_emb_shape[2] == self.config.dim_pos

    # concatenate along the feature axis: dim = dim_word + 2 * dim_pos
    sentence_embeddings = tf.concat(
        [word_embeddings, pos1_embeddings, pos2_embeddings], 2)
    sen_emb_shape = sentence_embeddings.get_shape().as_list()
    assert sen_emb_shape[2] == self.config.dim

    # (batch_size, max length of sentences in batch, vector representation dimension, 1)
    sentence_embeddings = tf.reshape(
        sentence_embeddings, [-1, maxlen, self.config.dim, 1])
    return sentence_embeddings
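To make the intent concrete (hypothetical dimensions): with dim_word = 50 and dim_pos = 5, so that self.config.dim = 60, a batch of 32 sentences padded to length 17 should give sentence_embeddings the shape (32, 17, 60, 1).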
Unfortunately, the tf.reshape step does not work:
sentence_embeddings = tf.reshape(sentence_embeddings, [-1, maxlen, self.config.dim, 1])
This is the error raised when the graph is built:
Traceback (most recent call last):
File "train.py", line 26, in <module>
main()
File "train.py", line 12, in main
model.build()
File "/Users/randypen/Code/test_pcnn/model/pcnn_model.py", line 295, in build
self.add_concat_op()
File "/Users/randypen/Code/test_pcnn/model/pcnn_model.py", line 243, in add_concat_op
self.pos1_ids_left, self.pos2_ids_left, self.maxlen_left)
File "/Users/randypen/Code/test_pcnn/model/pcnn_model.py", line 202, in add_sentence_embeddings_op
sentence_embeddings = tf.reshape(sentence_embeddings, [-1, maxlen, self.config.dim, 1])
File "/Users/randypen/.virtualenvs/DataEnv/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 6113, in reshape
"Reshape", tensor=tensor, shape=shape, name=name)
File "/Users/randypen/.virtualenvs/DataEnv/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 528, in _apply_op_helper
(input_name, err))
ValueError: Tried to convert 'shape' to a tensor and failed. Error: Shapes must be equal rank, but are 1 and 0
From merging shape 1 with other shapes. for 'Reshape/packed' (op: 'Pack') with input shapes: [], [1], [], [].
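The "input shapes: [], [1], [], []" at the end suggests that the maxlen placeholder arrives as a rank-1 tensor while the other three shape entries are scalars. A minimal sketch that reproduces the same error under that assumption (the shape=[1] declaration is my guess, not the actual model code):

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, None, 60])
maxlen = tf.placeholder(tf.int32, shape=[1])  # rank 1, not a scalar
y = tf.reshape(x, [-1, maxlen, 60, 1])        # raises the same ValueError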
I have looked at several related repositories, and some of them define the maximum length as a constant. Is it possible in TensorFlow to reshape a batch of tensors with a dynamic maximum length?