I am trying to read data from multiple CSV files with TensorFlow's Dataset API.
The following version of the code works fine:
import tensorflow as tf  # TF 1.x, graph-mode code

record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]

def decode_csv(line):
    col1, col2, col3, col4, col5, col6, col7 = tf.decode_csv(line, record_defaults)
    features = tf.stack([col2, col3, col4, col5, col6])
    labels = tf.stack([col7])
    return features, labels
filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()
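(For reference, here is a minimal sketch of how an initializable iterator like this is typically driven in a TF 1.x session; the file names below are placeholders, not files from the original setup.)

training_files = ["part1.csv", "part2.csv"]  # placeholder file names
next_batch = iterator5.get_next()            # (features, labels) batch tensors

with tf.Session() as sess:
    # Bind the concrete file list to the filenames placeholder and start the pipeline.
    sess.run(iterator5.initializer, feed_dict={filenames: training_files})
    while True:
        try:
            batch_features, batch_labels = sess.run(next_batch)
            # ... use batch_features / batch_labels here ...
        except tf.errors.OutOfRangeError:
            break  # all listed files have been consumed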
However, I want to make this more dynamic, because the number of columns (i.e. the number of features) can change from project to project. When I change the code to the following, it no longer works, and spending a lot of time on it has not helped:
record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.]]
def decode_csv(line):
    csv_columns = tf.decode_csv(line, record_defaults)
    labels = csv_columns[-1]   # the last column is the label
    del csv_columns[-1]        # drop the label column
    del csv_columns[0]         # drop the first column, which is not a feature
    features = csv_columns     # note: still a plain Python list of tensors
    return features, labels
filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()
When I run this second version, I get the error below. Perhaps someone more experienced can spot the problem right away?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-21-92ea8cc44da0> in <module>()
18 filenames = tf.placeholder(tf.string, shape=[None])
19 dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
---> 20 dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
21 dataset5 = dataset5.shuffle(buffer_size=1000)
22 dataset5 = dataset5.batch(7)
~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in flat_map(self, map_func)
799 Dataset: A `Dataset`.
800 """
--> 801 return FlatMapDataset(self, map_func)
802
803 def interleave(self, map_func, cycle_length, block_length=1):
~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in __init__(self, input_dataset, map_func)
1676
1677 self._map_func = tf_map_func
-> 1678 self._map_func.add_to_graph(ops.get_default_graph())
1679
1680 def _as_variant_tensor(self):
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in add_to_graph(self, g)
484 def add_to_graph(self, g):
485 """Adds this function into the graph g."""
--> 486 self._create_definition_if_needed()
487
488 # Adds this function into 'g'.
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed(self)
319 """Creates the function definition if it's not created yet."""
320 with context.graph_mode():
--> 321 self._create_definition_if_needed_impl()
322
323 def _create_definition_if_needed_impl(self):
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed_impl(self)
336 # Call func and gather the output tensors.
337 with vs.variable_scope("", custom_getter=temp_graph.getvar):
--> 338 outputs = self._func(*inputs)
339
340 # There is no way of distinguishing between a function not returning
~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in tf_map_func(*args)
1664 dataset = map_func(*nested_args)
1665 else:
-> 1666 dataset = map_func(nested_args)
1667
1668 if not isinstance(dataset, Dataset):
<ipython-input-21-92ea8cc44da0> in <lambda>(filename)
18 filenames = tf.placeholder(tf.string, shape=[None])
19 dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
---> 20 dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
21 dataset5 = dataset5.shuffle(buffer_size=1000)
22 dataset5 = dataset5.batch(7)
~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in map(self, map_func, num_parallel_calls)
784 """
785 if num_parallel_calls is None:
--> 786 return MapDataset(self, map_func)
787 else:
788 return ParallelMapDataset(self, map_func, num_parallel_calls)
~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in __init__(self, input_dataset, map_func)
1587
1588 self._map_func = tf_map_func
-> 1589 self._map_func.add_to_graph(ops.get_default_graph())
1590
1591 def _as_variant_tensor(self):
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in add_to_graph(self, g)
484 def add_to_graph(self, g):
485 """Adds this function into the graph g."""
--> 486 self._create_definition_if_needed()
487
488 # Adds this function into 'g'.
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed(self)
319 """Creates the function definition if it's not created yet."""
320 with context.graph_mode():
--> 321 self._create_definition_if_needed_impl()
322
323 def _create_definition_if_needed_impl(self):
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed_impl(self)
336 # Call func and gather the output tensors.
337 with vs.variable_scope("", custom_getter=temp_graph.getvar):
--> 338 outputs = self._func(*inputs)
339
340 # There is no way of distinguishing between a function not returning
~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in tf_map_func(*args)
1575 self._output_classes = sparse.get_classes(ret)
1576 self._output_shapes = nest.pack_sequence_as(
-> 1577 ret, [t.get_shape() for t in nest.flatten(ret)])
1578 self._output_types = nest.pack_sequence_as(
1579 ret, [t.dtype for t in nest.flatten(ret)])
~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in <listcomp>(.0)
1575 self._output_classes = sparse.get_classes(ret)
1576 self._output_shapes = nest.pack_sequence_as(
-> 1577 ret, [t.get_shape() for t in nest.flatten(ret)])
1578 self._output_types = nest.pack_sequence_as(
1579 ret, [t.dtype for t in nest.flatten(ret)])
AttributeError: 'list' object has no attribute 'get_shape'
Appendix:
The following also works:
feature_names = ['f0','f1','f2','f3','f4','f5']
record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.]]
def decode_csv(line):
    parsed_line = tf.decode_csv(line, record_defaults)  # => list of tensors
    label = parsed_line[-1]
    del parsed_line[-1]
    features = parsed_line
    d = dict(zip(feature_names, features)), label
    return d
filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()
But now the decode_csv function returns a dictionary of (feature_name, feature_value) pairs. Why would anyone prefer to return a dictionary from this function? Doesn't that make it harder to vectorize computations such as forward propagation?
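(As far as I can tell, the usual reason for the dict form is that a (dict-of-named-features, label) pair is what tf.estimator-style input functions and feature columns expect. A rough sketch of consuming it that way, not part of the original code, using only the numeric columns:)

# Hypothetical: names for the numeric columns only, since 'f0' above maps to
# the string ID column and cannot be fed to a numeric feature column.
numeric_feature_names = ['f1', 'f2', 'f3', 'f4', 'f5']
feature_columns = [tf.feature_column.numeric_column(name)
                   for name in numeric_feature_names]

features_dict, label = iterator5.get_next()
# input_layer looks each feature up by name and concatenates them into a single
# [batch, num_features] tensor, so forward propagation is still vectorized.
dense_features = tf.feature_column.input_layer(features_dict, feature_columns)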
Answer 0 (score: 1)
Solved. Below is the working version (I have not copied everything, to save some space). In the source file, the first column is not a feature but just a training-example ID, and the last column is the label. Stacking the features with tf.stack(...) solved the problem.
feature_names = ['f1','f2','f3','f4','f5']
record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]
def decode_csv(line):
    parsed_line = tf.decode_csv(line, record_defaults)
    label = parsed_line[-1]
    del parsed_line[-1]
    del parsed_line[0]
    features = tf.stack(parsed_line)  # ADDED LINE: stack the list of feature tensors into one tensor
    d = features, label
    return d
filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()
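Finally, to keep the original goal of a dynamic number of columns, record_defaults (and the feature names) can be generated from a column count instead of being hard-coded. A small sketch, where n_features is assumed to come from the project's configuration:

n_features = 5  # assumption: number of numeric feature columns in this project
feature_names = ['f{}'.format(i + 1) for i in range(n_features)]
# one string ID column + n_features numeric feature columns + one numeric label column
record_defaults = [[""]] + [[0.0]] * (n_features + 1)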