我正在尝试创建一个tf.SparseTensor以馈入模型。我想为每个输入构造一个稀疏张量,然后逐行连接它们,然后对其进行批处理。输入是字符串,可通过contrib.lookup模块的查找表将其转换为索引。通过估计器范式和数据集API构建模型和输入管道。我的示例原型在TFRecord文件中,如下所示:
{'user_id': ['user123'],
'item_id':['item123'],
'label': [.12]}
我的input_fn
看起来像这样:
def dataset_input_fn(tfrecord_pattern, batch_size,
n_epochs, user_lookup, item_lookup):
def _make_sparse_features(features):
keys_to_features = {
'user_id': tf.FixedLenFeature([1], tf.string),
'item_id': tf.FixedLenFeature([1], tf.string),
'label': tf.FixedLenFeature([1], tf.float32)
}
features = tf.parse_single_example(features, keys_to_features)
user_tensor = sparse_tensor_from_lookup(user_lookup, features['user_id'])
item_tensor = sparse_tensor_from_lookup(item_lookup, features['item_id'])
sp_inputs = [user_tensor, item_tensor]
sparse_features = tf.sparse_concat(axis=0, values=sp_inputs)
return sparse_features, features['label']
dataset = tf.data.Dataset.from_tensor_slices(glob.glob(tfrecord_pattern))
dataset = dataset.shuffle(buffer_size=1028)
dataset = dataset.flat_map(tf.data.TFRecordDataset)
dataset = dataset.map(_make_sparse_features)
dataset = dataset.repeat(num_epochs)
dataset = dataset.batch(batch_size)
return dataset
其中sparse_tensor_from_lookup
的定义如下(我尝试过tf.SparseTensor
和tf.SparseTensorValue
):
def sparse_tensor_from_lookup(lookup, key_tensor):
indices = lookup.lookup(key_tensor)
sparse_tensor = tf.SparseTensorValue(indices=indices,
values=tf.ones_like(indices),
dense_shape=lookup.size())
return sparse_tensor
在我的main
训练/评估函数中实例化了查询表:
def main_training_and_eval_loop(model_function,
training_epochs=10000, batch_size=256,
max_training_steps=8000000, learning_rate=1e-2, l2_weight=1e-3,
reader_n_threads=2, parser_n_threads=2,
save_checkpoint_every_n_steps=2000, factor_order=2,
max_steps_without_decrease=20000, shuffle_buffer_size=100000,
**context):
model_directory = '/tmp/'
users_id_file_path = os.path.join(model_directory, 'vocab/users.txt')
item_id_file_path = os.path.join(model_directory, 'vocab/items.txt')
tfrecord_dir = 'airflow_tmp/tfrecords_backfill_2018-09-29T00:43:32.624007/'
item_lookup = index_table_from_file(item_id_file_path,
num_oov_buckets=1)
user_lookup = index_table_from_file(users_id_file_path,
num_oov_buckets=1)
training_input_fn = functools.partial(dataset_input_fn,
os.path.join(tfrecord_dir, '*.tfrecord'),
batch_size=batch_size,
n_epochs=training_epochs,
user_lookup=user_lookup,
item_lookup=item_lookup)
... etc
我收到的错误似乎与试图构造所述张量时传递给tf.SparseTensor
的值有关:
<ipython-input-5-ec313e9d7732> in _make_sparse_features(features)
17 }
18 features = tf.parse_single_example(features, keys_to_features)
---> 19 course_tensor = sparse_tensor_from_lookup(course_lookup, features['target_course_id'])
20
21 course_tags = tf.sparse_tensor_to_dense(features['course_tags'], default_value='')
~/Code/learning-recommendations/airflow/dags/utils/tensorflow/sparse.py in sparse_tensor_from_lookup(lookup, key_tensor)
6 sparse_tensor = tf.SparseTensor(indices=indices,
7 values=tf.ones_like(indices),
----> 8 dense_shape=lookup.size())
9 return sparse_tensor
/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/sparse_tensor.py in __init__(self, indices, values, dense_shape)
132 self._dense_shape = dense_shape
133
--> 134 indices_shape = indices.get_shape().with_rank(2)
135 values_shape = values.get_shape().with_rank(1)
136 dense_shape_shape = dense_shape.get_shape().with_rank(1)
/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py in with_rank(self, rank)
745 return self.merge_with(unknown_shape(ndims=rank))
746 except ValueError:
--> 747 raise ValueError("Shape %s must have rank %d" % (self, rank))
748
749 def with_rank_at_least(self, rank):
ValueError: Shape (1,) must have rank 2
但这不同于我在tf.SparseTensorValue
中遇到的错误:
<ipython-input-5-ec313e9d7732> in _make_sparse_features(features)
30
31 sp_inputs = [course_tensor, course_tags_tensor, user_tensor, user_interests_tensor]
---> 32 sparse_features = tf.concat(axis=0, values=sp_inputs)
33
34 return sparse_features, features['label']
/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py in concat(values, axis, name)
1112 tensor_shape.scalar())
1113 return identity(values[0], name=scope)
-> 1114 return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
1115
1116
/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py in concat_v2(values, axis, name)
1028 _attr_N = len(values)
1029 _, _, _op = _op_def_lib._apply_op_helper(
-> 1030 "ConcatV2", values=values, axis=axis, name=name)
1031 _result = _op.outputs[:]
1032 _inputs_flat = _op.inputs
/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py in _apply_op_helper(self, op_type_name, name, **keywords)
481 (prefix, dtype.name))
482 else:
--> 483 raise TypeError("%s that don't all match." % prefix)
484 else:
485 raise TypeError("%s that are invalid." % prefix)
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [<NOT CONVERTIBLE TO TENSOR>, <NOT CONVERTIBLE TO TENSOR>] that don't all match.
我在这里没有水库吗?我一直在努力寻找有关如何使用数据集api来工作于此类工作流程的资料,而更难于寻找关于tf.SparseTensor
和tf.SparseTensorValue
之间的差异的资料,甚至很难找到有关稀疏张量的所有信息。