I am building my tf dataset, which has multiple inputs (images plus numerical/categorical data). The problem I have is that multiple images correspond to the same row in the pd.DataFrame I have. I am doing regression.
So how do I make sure (even when shuffling all of the inputs) that each image stays mapped to its correct row?
To restate: I have 10 rows and 100 images, where 10 of the images correspond to a particular row. Now we shuffle the dataset, and I want to be sure that after shuffling every image still corresponds to its respective row.
I am using tf.data.Dataset to do this. I also have a directory structure such that the folder names correspond to elements of the DataFrame, which is what I was thinking of using if I knew how to do the mapping; i.e. folder1 would sit in the df whose cols are dir_name, feature1, feature2, .... Naturally, the dir_names should not be passed as data into the model being fitted.
# images
path_ds = tf.data.Dataset.from_tensor_slices(paths)
image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
# numerical & categorical features. First remove the dirs
x_train_input = X_train[X_train.columns.difference(['dir_name'])]
x_train_input = np.expand_dims(x_train_input, axis=1)
text_ds = tf.data.Dataset.from_tensor_slices(x_train_input)
# labels, y_train's cols are: 'label' and 'dir_name'
label_ds = tf.data.Dataset.from_tensor_slices(
    tf.cast(y_train['label'], tf.float32))
# test creation of dataset without prior shuffling.
xtrain_ = tf.data.Dataset.zip((image_ds, text_ds))
model_ds = tf.data.Dataset.zip((xtrain_, label_ds))
# Shuffling
BATCH_SIZE = 64
# Setting a shuffle buffer size as large as the dataset ensures that
# data is completely shuffled
ds = model_ds.shuffle(buffer_size=len(paths))
ds = ds.repeat()
ds = ds.batch(BATCH_SIZE)
# prefetch lets the dataset fetch batches in the background while the
# model is training
# ds = ds.prefetch(buffer_size=AUTOTUNE)
ds = ds.prefetch(buffer_size=BATCH_SIZE)
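For reference, a minimal sketch (assuming a hypothetical base_dir variable and that X_train['dir_name'] holds the folder names) of how paths and per-image feature/label rows could be built from the directory structure, so that the zip above keeps every (image, features, label) triple aligned even after shuffling:
import os
import numpy as np
# Sketch: repeat each DataFrame row once per image found in its folder, so
# `paths`, the feature rows and the labels all have one entry per image.
paths, feature_rows, label_rows = [], [], []
for _, row in X_train.iterrows():
    folder = os.path.join(base_dir, row['dir_name'])   # base_dir is an assumption
    for fname in sorted(os.listdir(folder)):
        paths.append(os.path.join(folder, fname))
        feature_rows.append(row.drop('dir_name').values.astype(np.float32))
        label_rows.append(y_train.loc[row.name, 'label'])
x_train_expanded = np.stack(feature_rows)               # (num_images, num_features)
y_train_expanded = np.asarray(label_rows, dtype=np.float32)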
Answer 0 (score: 1)
My solution is to make use of TFRecords to store the data and keep its integrity intact. This will also open the door to other efficiencies.
What the code below does: it creates some dummy data, writes it all into a TFRecords file, reads it back with a parse function, and uses the feature_dimensions object to remember how to put the data back into its original shape. You should be able to just run this sample code without any problems, and then simply make the changes needed to fit your own problem.
import tensorflow as tf
import pandas as pd
import numpy as np
from functools import partial
# Create dummy data, TODO replace with your own logic
# 10 images per row in DF
images_per_example = 10
examples = 200
# Save name for TFRecords, you can create multiple and pass a list of the names as well
save_name = "my_tfrecords.tfrecords"
# DF, dataframe with random categorical data
x_data = pd.DataFrame(data=(np.random.normal(size=(examples, 50)) > 0).astype(np.float32))
y_data = np.random.uniform(0, 1, size=(examples, )).reshape(-1, 1).astype(np.float32)
def load_and_preprocess_image(file):
    # For dummy purposes generating instead of loading
    img = np.random.uniform(high=255, low=0, size=(15, 15))
    return (img / 255.).astype(np.float32)
# I would preprocess your images prior to creating the tfrecords file
img_data = np.array([[load_and_preprocess_image("add_logic") for j in range(images_per_example)]
                     for k in range(examples)])
# Prepare for tfrecords
data_dict = dict()
data_dict["images"] = img_data # Already an array
data_dict["x_data"] = x_data.values # Ensure it's an array
data_dict["y_data"] = y_data # Already an array
# Remember the dimensions for later restoration, replacing number of examples with -1
feature_dimensions = {k: v.shape for k, v in data_dict.items()}
feature_dimensions = {k: tuple([-1] + list(v[1:])) for k, v in feature_dimensions.items()}
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
writer = tf.python_io.TFRecordWriter(save_name)
# Create TFRecords file
for i in range(examples):
    example_dict = dict()  # New dictionary for each single example
    for name, data in data_dict.items():
        example_dict[name] = data[i]
    # Define the features of your tfrecord
    feature = {k: _bytes_feature(tf.compat.as_bytes(v.tostring())) for k, v in example_dict.items()}
    # Serialize to string and write to file
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()
# Declare functions for creating dataset
def _parse_function(proto, feature_dimensions_: dict):
    # Define your tfrecord again. Remember that you saved your image as a string.
    keys_to_features = {k: tf.FixedLenFeature([], tf.string) for k in feature_dimensions_.keys()}
    # Load one example
    parsed_features = tf.parse_single_example(proto, keys_to_features)
    # Decode the raw bytes of each feature back into float32 values
    for k, v in parsed_features.items():
        parsed_features[k] = tf.decode_raw(v, tf.float32)
    return parsed_features
def create_tf_dataset(file_paths: str, feature_dimensions_: dict, batch_size=64):
    # This works with arrays as well
    dataset = tf.data.TFRecordDataset(file_paths)
    # Maps the parser on every filepath in the array. You can set the number of parallel loaders here
    parse_function = partial(_parse_function, feature_dimensions_=feature_dimensions_)
    dataset = dataset.map(parse_function, num_parallel_calls=1)
    # This dataset will go on forever
    dataset = dataset.repeat()
    # Set the number of datapoints you want to load and shuffle
    dataset = dataset.shuffle(batch_size)  # Put whatever you want here
    # Set the batchsize
    dataset = dataset.batch(batch_size)
    # Set up a pipeline
    dataset = dataset.prefetch(batch_size)  # Put whatever you want here
    # Create an iterator
    iterator = dataset.make_one_shot_iterator()
    # Create your tf representation of the iterator
    parsed_features = iterator.get_next()
    # Reshape arrays and cast to float
    for k, v in parsed_features.items():
        parsed_features[k] = tf.reshape(v, feature_dimensions_[k])
    for k, v in parsed_features.items():
        parsed_features[k] = tf.cast(v, tf.float32)
    return parsed_features
# Create dataset
ds = create_tf_dataset(save_name, feature_dimensions, batch_size=64)
# The final result is a dictionary with the names used above
sample = tf.Session().run(ds)
print("Sample Length:", len(sample))
print("Sample Keys:", sample.keys())
print("images shape:", sample["images"].shape)
print("x_data shape:", sample["x_data"].shape)
print("y_data shape:", sample["y_data"].shape)
Printed result:
Sample Length: 3
Sample Keys: dict_keys(['images', 'x_data', 'y_data'])
images shape: (64, 10, 15, 15)
x_data shape: (64, 50)
y_data shape: (64, 1)
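As a rough follow-up sketch, the parsed tensors could then be wired into a two-input Keras regression model in this TF 1.x graph-mode style, using Input(tensor=...) and target_tensors; the layer sizes and model structure below are placeholders, not a prescribed architecture:
# Hypothetical model: structure and sizes are illustrative only.
img_in = tf.keras.layers.Input(tensor=ds["images"])    # (batch, 10, 15, 15)
x_in = tf.keras.layers.Input(tensor=ds["x_data"])      # (batch, 50)
h = tf.keras.layers.Flatten()(img_in)
h = tf.keras.layers.concatenate([h, x_in])
h = tf.keras.layers.Dense(64, activation="relu")(h)
out = tf.keras.layers.Dense(1)(h)                      # single regression output
model = tf.keras.Model(inputs=[img_in, x_in], outputs=out)
# In TF 1.x Keras, targets that are already tensors are passed via target_tensors,
# and fit() only needs steps_per_epoch since the data comes from the graph itself.
model.compile(optimizer="adam", loss="mse", target_tensors=[ds["y_data"]])
model.fit(steps_per_epoch=examples // 64, epochs=1)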