根据数据集元素和查找表为估算器构造SparseTensors

时间:2018-10-02 18:49:49

标签: python tensorflow tensorflow-datasets tensorflow-estimator

我正在尝试创建一个tf.SparseTensor以馈入模型。我想为每个输入构造一个稀疏张量,然后逐行连接它们,然后对其进行批处理。输入是字符串,可通过contrib.lookup模块的查找表将其转换为索引。通过估计器范式和数据集API构建模型和输入管道。我的示例原型在TFRecord文件中,如下所示:

{'user_id': ['user123'],
 'item_id':['item123'],
 'label': [.12]}

我的input_fn看起来像这样:

def dataset_input_fn(tfrecord_pattern, batch_size,
                     n_epochs, user_lookup, item_lookup):

    def _make_sparse_features(features):
        keys_to_features = {
            'user_id': tf.FixedLenFeature([1], tf.string),
            'item_id': tf.FixedLenFeature([1], tf.string),
            'label': tf.FixedLenFeature([1], tf.float32)
         }
        features = tf.parse_single_example(features, keys_to_features)
        user_tensor = sparse_tensor_from_lookup(user_lookup, features['user_id'])

        item_tensor = sparse_tensor_from_lookup(item_lookup, features['item_id'])

        sp_inputs = [user_tensor, item_tensor]
        sparse_features = tf.sparse_concat(axis=0, values=sp_inputs)

        return sparse_features, features['label']

    dataset = tf.data.Dataset.from_tensor_slices(glob.glob(tfrecord_pattern))
    dataset = dataset.shuffle(buffer_size=1028)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    dataset = dataset.map(_make_sparse_features)

    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)

    return dataset

其中sparse_tensor_from_lookup的定义如下(我尝试过tf.SparseTensortf.SparseTensorValue):

def sparse_tensor_from_lookup(lookup, key_tensor):
    indices = lookup.lookup(key_tensor)
    sparse_tensor = tf.SparseTensorValue(indices=indices,
                                         values=tf.ones_like(indices),
                                         dense_shape=lookup.size())
    return sparse_tensor

在我的main训练/评估函数中实例化了查询表:

def main_training_and_eval_loop(model_function,
                            training_epochs=10000, batch_size=256,
                            max_training_steps=8000000, learning_rate=1e-2, l2_weight=1e-3,
                            reader_n_threads=2, parser_n_threads=2,
                            save_checkpoint_every_n_steps=2000, factor_order=2,
                            max_steps_without_decrease=20000, shuffle_buffer_size=100000,
                            **context):

    model_directory = '/tmp/'

    users_id_file_path = os.path.join(model_directory, 'vocab/users.txt')
    item_id_file_path = os.path.join(model_directory, 'vocab/items.txt')

    tfrecord_dir = 'airflow_tmp/tfrecords_backfill_2018-09-29T00:43:32.624007/'

    item_lookup = index_table_from_file(item_id_file_path,
                                              num_oov_buckets=1)
    user_lookup = index_table_from_file(users_id_file_path,
                                     num_oov_buckets=1)

    training_input_fn = functools.partial(dataset_input_fn,
                                          os.path.join(tfrecord_dir, '*.tfrecord'), 
                                          batch_size=batch_size,
                                          n_epochs=training_epochs,
                                          user_lookup=user_lookup,
                                          item_lookup=item_lookup)


... etc

我收到的错误似乎与试图构造所述张量时传递给tf.SparseTensor的值有关:

<ipython-input-5-ec313e9d7732> in _make_sparse_features(features)
 17         }
 18         features = tf.parse_single_example(features, keys_to_features)
---> 19         course_tensor = sparse_tensor_from_lookup(course_lookup, features['target_course_id'])
     20 
     21         course_tags = tf.sparse_tensor_to_dense(features['course_tags'], default_value='')

~/Code/learning-recommendations/airflow/dags/utils/tensorflow/sparse.py in sparse_tensor_from_lookup(lookup, key_tensor)
      6     sparse_tensor = tf.SparseTensor(indices=indices,
      7                                     values=tf.ones_like(indices),
----> 8                                     dense_shape=lookup.size())
      9     return sparse_tensor

/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/sparse_tensor.py in __init__(self, indices, values, dense_shape)
    132     self._dense_shape = dense_shape
    133 
--> 134     indices_shape = indices.get_shape().with_rank(2)
    135     values_shape = values.get_shape().with_rank(1)
    136     dense_shape_shape = dense_shape.get_shape().with_rank(1)

/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py in with_rank(self, rank)
    745       return self.merge_with(unknown_shape(ndims=rank))
    746     except ValueError:
--> 747       raise ValueError("Shape %s must have rank %d" % (self, rank))
    748 
    749   def with_rank_at_least(self, rank):

ValueError: Shape (1,) must have rank 2

但这不同于我在tf.SparseTensorValue中遇到的错误:

<ipython-input-5-ec313e9d7732> in _make_sparse_features(features)
     30 
     31         sp_inputs = [course_tensor, course_tags_tensor, user_tensor, user_interests_tensor]
---> 32         sparse_features = tf.concat(axis=0, values=sp_inputs)
     33 
     34         return sparse_features, features['label']

/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py in concat(values, axis, name)
   1112               tensor_shape.scalar())
   1113       return identity(values[0], name=scope)
-> 1114   return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
   1115 
   1116 

/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py in concat_v2(values, axis, name)
   1028     _attr_N = len(values)
   1029     _, _, _op = _op_def_lib._apply_op_helper(
-> 1030         "ConcatV2", values=values, axis=axis, name=name)
   1031     _result = _op.outputs[:]
   1032     _inputs_flat = _op.inputs

/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py in _apply_op_helper(self, op_type_name, name, **keywords)
    481                                 (prefix, dtype.name))
    482               else:
--> 483                 raise TypeError("%s that don't all match." % prefix)
    484             else:
    485               raise TypeError("%s that are invalid." % prefix)

TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [<NOT CONVERTIBLE TO TENSOR>, <NOT CONVERTIBLE TO TENSOR>] that don't all match.

我在这里没有水库吗?我一直在努力寻找有关如何使用数据集api来工作于此类工作流程的资料,而更难于寻找关于tf.SparseTensortf.SparseTensorValue之间的差异的资料,甚至很难找到有关稀疏张量的所有信息。

0 个答案:

没有答案