的特征或{{1}的上下文而输入}。 (请参阅底部的问题。)
即它是一个长度固定的 2 通道序列(在此示例中长度为 2),其中的值只能是整数。
seq = [
# el1, el2
[ 0, 1 ], # channel 1
[ 0, 1 ] # channel 2
实际上cls_probs = [
#cls1, cls2, cls3
[0 , 0.9 , 0.1 ], # class probabilities element 1
[0 , 0.1 , 0.9 ] # class probabilities element 2
然后我有几种构建meta = {
'name': 'my_seq', # safer to keep this with the data rather than as file name
'meta_val_1': 100, # not used by network, but may be useful when evaluating network's predictions for this particular sequence
'meta_val_2': 10
其中example = tf.train.Example(
features = tf.train.Features(
feature = {
'channel_1': tf.train.Feature(int64_list=tf.train.Int64List(value=seq[:,0])),
'channel_2': tf.train.Feature(int64_list=tf.train.Int64List(value=seq[:,1])),
'class_1' : tf.train.Feature(float_list=tf.train.FloatList(value=cls_probs[:,0])),
'class_2' : tf.train.Feature(float_list=tf.train.FloatList(value=cls_probs[:,1])),
'class_3' : tf.train.Feature(float_list=tf.train.FloatList(value=cls_probs[:,2])),
'name' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[f'{meta["name"]}'.encode('utf-8')])),
# should these be FloatList even though it is just a single value?
# should these be included here if they are not used by the network?
'val_1' : tf.train.Feature(float_list=tf.train.FloatList(value=[f'{meta["meta_val_1"]}'])),
'val_2' : tf.train.Feature(float_list=tf.train.FloatList(value=[f'{meta["meta_val_2"]}'])),
(注意:f-string 仅适用于 Python 3.6+。)
TF记录也接受另一种形式:example = tf.train.Example(
features = tf.train.Features(
feature = {
'sequence' : tf.train.Feature(bytes_list=tf.train.BytesList(value=seq.tostring())),
'cls_probs': tf.train.Feature(bytes_list=tf.train.BytesList(value=cls_probs.tostring())),
# ... see encoding of meta values from above
。 tf.train.SequenceExample
因此,将上述结构重构为 channels 示例:
同样,我们可以创建as string 示例:
example = tf.train.SequenceExample(
context = tf.train.Features(
feature = {
'Name' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[f'{meta["name"]}'.encode('utf-8')])),
'Val_1': tf.train.Feature(float_list=tf.train.FloatList(value=[f'{meta["meta_val_1"]}'])),
'Val_2': tf.train.Feature(float_list=tf.train.FloatList(value=[f'{meta["meta_val_2"]}'])),
feature_lists = tf.train.FeatureLists(
feature_list = {
'sequence': tf.train.FeatureList(
feature = [
'class_probabilities': tf.train.FeatureList(
feature = [
我在这里给了M.W.E.如何构造一个既example = tf.train.SequenceExample(
context = tf.train.Features(
# see above
feature_lists = tf.train.FeatureLists(
feature_list = {
'sequence': tf.train.FeatureList(
feature = [
'class_probabilities': tf.train.FeatureList(
feature = [
最后,我想推荐这篇 Medium 文章(medium post),它对 TF 的文档做了很好的补充说明。
答案 0 :(得分:0)
在我查看 TensorFlow Records 的教程、帖子、视频等资料时,我遇到的大多数示例都专注于用具体数据构建 (Sequence)Example,却没有展示如何更动态地构建它们。因此,我在下面的示例中封装了上述四种方法,用于转换问题中所描述类型的数据。
此文件已整理成一个名为 Feature Input / Output (FIO) 的程序包。
'my-feature': {'length': 'fixed', 'dtype': tf.string, 'shape': []},
'seq': {
'length': 'fixed',
'dtype': tf.int64,
'shape': [4, 3],
'encode': 'channels',
'channel_names': ['A', 'B', 'C'],
'data_format': 'channels_last'
这样您只需定义数据 _一次_,而不是两次(一次用于编码为 Example,一次用于从记录中提取)。
import os, sys, json
sys.path.insert(0, '../')
import tensorflow as tf
import numpy as np
def list_like_q(value) -> bool:
    '''
    Returns True when <value> is a python list or a numpy.ndarray
    (including subclasses of either), False otherwise.

    TensorFlow tf.train.Feature requires a list of feature values.
    Many values used in practice are either python lists or numpy.ndarrays.
    We often have features which consist of a singular value.
    For brevity, we define some light helper functions to wrap a list as a
    tf.train.Feature. This lets us test if we need to wrap the value.
    '''
    # isinstance (rather than an exact type comparison) so that subclasses
    # such as numpy masked arrays are also recognised as list-like.
    return isinstance(value, (list, np.ndarray))
def take_all() -> slice:
    '''Returns the slice selecting every element along an axis (i.e. `:`).'''
    return slice(None)
def take_channel(sequence, channel:int, data_format:str='channels_last'):
    '''
    Returns the values of channel number <channel> from <sequence>.

    sequence:    an array supporting numpy-style indexing
    channel:     index of the channel to extract
    data_format: 'channels_last' (default) selects along the last axis;
                 any other value selects along the first axis.

    The axis convention matches number_of_channels, which reads the channel
    count from shape[-1] for 'channels_last' and from shape[0] otherwise.
    '''
    if data_format == 'channels_last':
        # channels live on the last axis -> keep every leading axis
        return sequence[..., channel]
    # channels live on the first axis -> keep every trailing axis
    return sequence[channel, ...]
def number_of_channels(sequence, data_format:str='channels_last') -> int:
    '''
    Returns how many channels <sequence> has: the size of the last axis when
    <data_format> is 'channels_last', the size of the first axis otherwise.
    '''
    channel_axis = -1 if data_format == 'channels_last' else 0
    return sequence.shape[channel_axis]
def feature_int64(value):
    '''Wraps <value> (a single value or a list-like) into tf.train.Feature(Int64List).'''
    # a lone value must be boxed in a list before it can be handed to Int64List
    values = value if list_like_q(value) else [value]
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)
def feature_float(value):
    '''Wraps <value> (a single value or a list-like) into tf.train.Feature(FloatList).'''
    # a lone value must be boxed in a list before it can be handed to FloatList
    values = value if list_like_q(value) else [value]
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)
def feature_bytes(value):
    '''
    Takes value and wraps it into tf.train.Feature(BytesList).

    value: a single item, or a list / tuple of items. Each item is converted
           to bytes as follows:
             - numpy.ndarray -> its raw buffer (ndarray.tobytes())
             - bytes         -> used unchanged
             - anything else -> str(item) encoded as UTF-8
    '''
    def _as_bytes(item):
        # tobytes() replaces the deprecated ndarray.tostring() alias
        if isinstance(item, np.ndarray): return item.tobytes()
        if isinstance(item, bytes): return item
        return str(item).encode('utf-8')

    # Convert element-wise: a list must not be stringified as a whole,
    # otherwise its repr would end up as the single stored byte string.
    items = value if isinstance(value, (list, tuple)) else [value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[_as_bytes(item) for item in items]))
def feature_function(dtype):
    '''
    Given <dtype> returns the function for wrapping a value into the
    corresponding tf.train.Feature.

    "int64" maps to feature_int64 and "float" to feature_float; every other
    value falls back to feature_bytes.
    '''
    if dtype == "int64": return feature_int64
    if dtype == "float": return feature_float
    return feature_bytes
def feature_list(iterable, dtype:str='float'):
    '''
    Given an iterable, returns the tf.train.FeatureList whose entries are the
    items of <iterable>, each wrapped as a tf.train.Feature of <dtype>.
    '''
    wrap = feature_function(dtype)
    # protobuf message constructors take keyword arguments only, so the
    # features have to be passed as `feature=`
    return tf.train.FeatureList(feature=[wrap(item) for item in iterable])
# the next three for completeness
def feature_list_int64(value):
    '''Returns a tf.train.FeatureList with one Int64List feature per item of <value>.'''
    # build the features directly; wrapping an existing FeatureList in
    # FeatureList(feature=...) is not a valid construction
    return tf.train.FeatureList(feature=[feature_int64(item) for item in value])
def feature_list_float(value):
    '''Returns a tf.train.FeatureList with one FloatList feature per item of <value>.'''
    # build the features directly; wrapping an existing FeatureList in
    # FeatureList(feature=...) is not a valid construction
    return tf.train.FeatureList(feature=[feature_float(item) for item in value])
def feature_list_bytes(value):
    '''Returns a tf.train.FeatureList with one BytesList feature per item of <value>.'''
    # build the features directly; wrapping an existing FeatureList in
    # FeatureList(feature=...) is not a valid construction
    return tf.train.FeatureList(feature=[feature_bytes(item) for item in value])
def dict_to_features(values:dict, types:dict) -> dict:
    '''
    Given <types>, maps over name:dtype pairs and wraps <values>[name] in the
    corresponding feature type.

    Only the names listed in <types> are converted; a name missing from
    <values> raises KeyError.
    '''
    return {
        name: feature_function(dtype)(values[name])
        for name, dtype in types.items()
    }
def features_from_dict(values:dict, types:dict):
    '''Converts <values> according to <types> and wraps the result in tf.train.Features.'''
    wrapped = dict_to_features(values, types)
    return tf.train.Features(feature=wrapped)
def default_channel_names(sequence, data_format:str='channels_last') -> list:
    '''Generates the fallback names 'Channel 0', 'Channel 1', ... (one per channel of <sequence>).'''
    count = number_of_channels(sequence, data_format)
    names = []
    for index in range(count):
        names.append(f'Channel {index}')
    return names
def channels_to_features(sequence, dtype:str='float', data_format:str='channels_last', channel_names:list=None) -> dict:
    '''
    Given a <sequence> of corresponding <dtype> and <data_format>, with optional <channel_names>
    returns the dictionary of each channel:tf.train.Feature pair.

    When <channel_names> is None the names from default_channel_names are
    used. The i-th name is paired with the i-th channel of <sequence>.
    '''
    if channel_names is None: channel_names = default_channel_names(sequence, data_format)
    wrap = feature_function(dtype)
    return {
        channel: wrap(take_channel(sequence, i, data_format))
        for i, channel in enumerate(channel_names)
    }
def channels_to_feature_list(sequence, dtype:str='float', data_format:str='channels_last'):
    '''
    Given a <sequence> of <dtype> and <data_format> returns the FeatureList
    where each element corresponds to a channel of <sequence>.
    '''
    # channel names are irrelevant here: only the per-channel features, in
    # channel order, are kept
    per_channel = channels_to_features(sequence, dtype, data_format)
    return tf.train.FeatureList(feature=list(per_channel.values()))
class SequenceRecord:
    '''
    SequenceRecord is a supporting class built on top of the functions found in
    /model/utils/features.py with the purpose of converting our data consisting
    of:
        - a sequence of length n,
        - n vectors of class probability vectors (referred to as pclasses), and
        - metadata (name of sequence, start site, stop site, etc)
    and converting it into a TensorFlow (Sequence)Example which can
    subsequently be written as a TensorFlow Record.

    For both Example and SequenceExample options, the channels / classes of the
    sequence / pclasses can be stored as numeric features (int64 / float) or as
    a byte string. For each of these options, the encoding can be done per
    channel / class, or the entire sequence / pclasses matrix.

    Overwrite the following class variables to suit your needs:

    _class_var           || description
    ---------------------------------------------------------------------------
    _metadata_types:dict || a dictionary of <feature-name>:<dtype> pairs which
                         || is referred to when the metadata is converted into
                         || tf.train.Feature (only 'int64', 'float', 'bytes' are
                         || supported for <dtype>)
    _sequence_data_format|| a string specifying where the channels are. By
                         || default, this is set to 'channels_last'
    _pclasses_data_format|| a string specifying where the channels are (by
                         || default, this is set to 'channels_last')
    _sequence_data_type  || a string specifying what dtype channels should be
                         || encoded as (by default 'int64')
    _pclasses_data_type  || a string specifying what dtype channels should be
                         || encoded as (by default 'float')
    _channel_names       || a list of strings specifying the name and order
                         || channels appear in <sequence> (by default set to
                         || None)
    _classes_names       || a list of strings specifying the name and order
                         || classes appear as channels in <pclasses> (by default
                         || set to None)
    '''
    _metadata_types = {}
    _sequence_data_format = 'channels_last'
    _pclasses_data_format = 'channels_last'
    _sequence_data_type = 'int64'
    _pclasses_data_type = 'float'
    _channel_names = None
    _classes_names = None

    def make_example(self, sequence, pclasses, metadata:dict=None, form:str='example', by:str='channels'):
        '''
        The core function of SequenceRecord. Given <sequence>, <pclasses> and <metadata>
        converts them to the corresponding <form> and <by> the specified encoding schema.

        metadata: dictionary of metadata values (defaults to an empty dict)
        form: either 'example' (default) or 'sequence' and yields either a
              a Example or SequenceExample.
        by:   either 'channels' (default) or 'bstrings' or 'bdstring' and
              encodes the sequence / pclasses by channel / class as a numeric,
              or a byte string (options 'channels' and 'bstrings'), or dumps the
              entire numpy.ndarray a byte string (option 'bdstring')
        '''
        # None sentinel instead of a shared mutable `{}` default
        if metadata is None: metadata = {}
        wrap = self.example if form == 'example' else self.sequence_example
        return wrap(sequence, pclasses, metadata, by)

    def example(self, sequence, pclasses, metadata, by='channels'):
        '''
        Builds a tf.train.Example using the encoding selected with <by>;
        any value other than 'channels' / 'bdstring' selects 'bstrings'.
        '''
        wrap = self.example_as_channels if by == 'channels' else \
               self.example_as_bdstring if by == 'bdstring' else \
               self.example_as_bstrings
        return wrap(sequence, pclasses, metadata)

    def sequence_example(self, sequence, pclasses, metadata, by='channels'):
        '''
        Builds a tf.train.SequenceExample using the encoding selected with <by>;
        any value other than 'channels' / 'bdstring' selects 'bstrings'.
        '''
        wrap = self.sequence_example_as_channels if by == 'channels' else \
               self.sequence_example_as_bdstring if by == 'bdstring' else \
               self.sequence_example_as_bstrings
        return wrap(sequence, pclasses, metadata)

    def example_as_channels(self, sequence, pclasses, metadata):
        '''
        Encoded each channel (or class) as its own feature with specified dtype
        (e.g. _sequence_data_type) and wraps in tf.train.Example
        '''
        features = {
            **dict_to_features(metadata, self._metadata_types),
            **channels_to_features(sequence, self._sequence_data_type, self._sequence_data_format, self._channel_names),
            **channels_to_features(pclasses, self._pclasses_data_type, self._pclasses_data_format, self._classes_names),
        }
        return tf.train.Example(features=tf.train.Features(feature=features))

    def example_as_bstrings(self, sequence, pclasses, metadata):
        '''
        Encoded each channel (or class) as its own feature but dumps ndarrays
        as byte strings (<np.ndarray.tostring()>) and wraps in tf.train.Example.
        '''
        features = {
            **dict_to_features(metadata, self._metadata_types),
            **channels_to_features(sequence, 'bytes', self._sequence_data_format, self._channel_names),
            **channels_to_features(pclasses, 'bytes', self._pclasses_data_format, self._classes_names),
        }
        return tf.train.Example(features=tf.train.Features(feature=features))

    def example_as_bdstring(self, sequence, pclasses, metadata):
        '''
        Encodes sequence and probability classes as a byte 'dump' string
        i.e. dump the sequence to a string and encode to bytes
        ( equivalent to np.ndarray.tostring() )
        '''
        features = {
            **dict_to_features(metadata, self._metadata_types),
            'sequence': feature_bytes(sequence),
            'pclasses': feature_bytes(pclasses)
        }
        return tf.train.Example(features=tf.train.Features(feature=features))

    def sequence_example_as_channels(self, sequence, pclasses, metadata):
        '''
        Encoded each channel (or class) as its own feature with specified dtype
        (e.g. _sequence_data_type) and wraps in tf.train.SequenceExample
        '''
        context = features_from_dict(metadata, self._metadata_types)
        feat_list = tf.train.FeatureLists(feature_list={
            'sequence': channels_to_feature_list(sequence, self._sequence_data_type, self._sequence_data_format),
            'pclasses': channels_to_feature_list(pclasses, self._pclasses_data_type, self._pclasses_data_format)
        })
        return tf.train.SequenceExample(context=context, feature_lists=feat_list)

    def sequence_example_as_bstrings(self, sequence, pclasses, metadata):
        '''
        Encoded each channel (or class) as its own feature but dumps ndarrays
        as byte strings (<np.ndarray.tostring()>) and wraps in
        tf.train.SequenceExample
        '''
        context = features_from_dict(metadata, self._metadata_types)
        feat_list = tf.train.FeatureLists(feature_list={
            'sequence': channels_to_feature_list(sequence, 'bytes', self._sequence_data_format),
            'pclasses': channels_to_feature_list(pclasses, 'bytes', self._pclasses_data_format)
        })
        return tf.train.SequenceExample(context=context, feature_lists=feat_list)

    def sequence_example_as_bdstring(self, sequence, pclasses, metadata):
        '''
        Encodes sequence and probability classes as a byte 'dump' string
        i.e. dump the sequence to a string and encode to bytes
        ( equivalent to np.ndarray.tostring() )
        '''
        context = features_from_dict(metadata, self._metadata_types)
        feat_list = tf.train.FeatureLists(feature_list={
            'sequence': tf.train.FeatureList(feature=[feature_bytes(sequence)]),
            'pclasses': tf.train.FeatureList(feature=[feature_bytes(pclasses)])
        })
        return tf.train.SequenceExample(context=context, feature_lists=feat_list)

    def write(self, example, to:str):
        '''
        After calling corresponding method to construct (Sequence)Example,
        writes the passed (Sequence)Example to specified location (full path name).
        '''
        # NOTE(review): tf.python_io is the TensorFlow 1.x location of the
        # writer; on TensorFlow 2.x it is exposed as tf.io.TFRecordWriter.
        with tf.python_io.TFRecordWriter(to) as writer:
            # records are stored as the serialized protobuf message
            writer.write(example.SerializeToString())
# Example data: two sequences, each stored as a 3 x 3 matrix.
sequences = np.array([
    # sequence 1
    [
        # el1, el2, el3
        [   1,   1,   1], # channel 1
        [   2,   2,   2], # channel 2
        [   3,   3,   3], # channel 3
    ],
    # sequence 2
    [
        # el1, el2, el3
        [  10,  10,  10], # channel 1
        [  20,  20,  20], # channel 2
        [  30,  30,  30], # channel 3
    ]
])

# Class probabilities: one 3 x 3 matrix (elements x classes) per sequence.
pclasses = np.array([
    # sequence 1
    [
        # cls1, cls2, cls3
        [    0,  0.9,  0.1], # class probabilities element 1
        [    0,  0.1,  0.9], # class probabilities element 2
        [  0.8,  0.1,  0.1]  # class probabilities element 3
    ],
    # sequence 2
    [
        # cls1, cls2, cls3
        [  0.8,  0.1,  0.1], # class probabilities element 1
        [    0,  0.1,  0.9], # class probabilities element 2
        [    0,  0.9,  0.1]  # class probabilities element 3
    ]
])

# One metadata dictionary per sequence.
metadata = [
    {'Name': 'sequence 1', 'Val_1': 100, 'Val_2': 10},
    {'Name': 'sequence 2', 'Val_1': 10, 'Val_2': 100}
]

# dtype used to encode each metadata entry
metatypes = {'Name': 'bytes', 'Val_1': 'float', 'Val_2': 'float'}

# Configure the record class once; every instance picks these up.
SequenceRecord._channel_names = ['Channel 1', 'Channel 2', 'Channel 3']
SequenceRecord._classes_names = ['Class A', 'Class B', 'Class C']
SequenceRecord._metadata_types = metatypes

SR = SequenceRecord()

# All six combinations of <form> and <by> for the first sequence.
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='channels')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='bstrings')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='bdstring')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='channels')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='bstrings')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='bdstring')
答案 1 :(得分:0)