I am building my tf dataset, which has multiple inputs (images plus numerical/categorical data). The problem I have is that multiple images correspond to the same row in the pd.DataFrame I have. I am doing regression.
So how do I make sure (even when shuffling all of the inputs) that each image stays mapped to its correct row?
To restate: I have 10 rows and 100 images, where 10 of the images correspond to a particular row. Now we shuffle the dataset, and I want to be sure that after shuffling every image still corresponds to its respective row.
I am using tf.data.Dataset to do this. I also have a directory structure such that the folder names correspond to elements of the DataFrame, which is what I was thinking of using if I knew how to do the mapping; i.e. folder1 would sit in the df whose cols are dir_name, feature1, feature2, .... Naturally, the dir_names should not be passed as data into the model being fitted.
# images
path_ds = tf.data.Dataset.from_tensor_slices(paths)
image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
# numerical & categorical features. First remove the dirs
x_train_input = X_train[X_train.columns.difference(['dir_name'])]
x_train_input = np.expand_dims(x_train_input, axis=1)
text_ds = tf.data.Dataset.from_tensor_slices(x_train_input)
# labels, y_train's cols are: 'label' and 'dir_name'
label_ds = tf.data.Dataset.from_tensor_slices(
    tf.cast(y_train['label'], tf.float32))
# test creation of dataset without prior shuffling.
xtrain_ = tf.data.Dataset.zip((image_ds, text_ds))
model_ds = tf.data.Dataset.zip((xtrain_, label_ds))
# Shuffling
BATCH_SIZE = 64
# Setting a shuffle buffer size as large as the dataset ensures that
# data is completely shuffled
ds = model_ds.shuffle(buffer_size=len(paths))
ds = ds.repeat()
ds = ds.batch(BATCH_SIZE)
# prefetch lets the dataset fetch batches in the background while the
# model is training
# ds = ds.prefetch(buffer_size=AUTOTUNE)
ds = ds.prefetch(buffer_size=BATCH_SIZE)
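For reference, a minimal sketch (assuming a hypothetical base_dir variable and that X_train['dir_name'] holds the folder names) of how paths and per-image feature/label rows could be built from the directory structure, so that the zip above keeps every (image, features, label) triple aligned even after shuffling:
import os
import numpy as np
# Sketch: repeat each DataFrame row once per image found in its folder, so
# `paths`, the feature rows and the labels all have one entry per image.
paths, feature_rows, label_rows = [], [], []
for _, row in X_train.iterrows():
    folder = os.path.join(base_dir, row['dir_name'])   # base_dir is an assumption
    for fname in sorted(os.listdir(folder)):
        paths.append(os.path.join(folder, fname))
        feature_rows.append(row.drop('dir_name').values.astype(np.float32))
        label_rows.append(y_train.loc[row.name, 'label'])
x_train_expanded = np.stack(feature_rows)               # (num_images, num_features)
y_train_expanded = np.asarray(label_rows, dtype=np.float32)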
Answer 0 (score: 1)
My solution is to make use of TFRecords to store the data and keep its integrity intact. This will also open the door to other efficiencies.
What the code below does: it creates some dummy data, writes it all into a TFRecords file, reads it back with a parse function, and uses the feature_dimensions object to remember how to put the data back into its original shape. You should be able to just run this sample code without any problems, and then simply make the changes needed to fit your own problem.
import tensorflow as tf
import pandas as pd
import numpy as np
from functools import partial
# Create dummy data, TODO replace with your own logic
# 10 images per row in DF
images_per_example = 10
examples = 200
# Save name for TFRecords, you can create multiple and pass a list of the names as well
save_name = "my_tfrecords.tfrecords"
# DF, dataframe with random categorical data
x_data = pd.DataFrame(data=(np.random.normal(size=(examples, 50)) > 0).astype(np.float32))
y_data = np.random.uniform(0, 1, size=(examples, )).reshape(-1, 1).astype(np.float32)
def load_and_preprocess_image(file):
    # For dummy purposes generating instead of loading
    img = np.random.uniform(high=255, low=0, size=(15, 15))
    return (img / 255.).astype(np.float32)
# I would preprocess your images prior to creating the tfrecords file
img_data = np.array([[load_and_preprocess_image("add_logic") for j in range(images_per_example)]
                     for k in range(examples)])
# Prepare for tfrecords
data_dict = dict()
data_dict["images"] = img_data # Already an array
data_dict["x_data"] = x_data.values # Ensure it's an array
data_dict["y_data"] = y_data # Already an array
# Remember the dimensions for later restoration, replacing number of examples with -1
feature_dimensions = {k: v.shape for k, v in data_dict.items()}
feature_dimensions = {k: tuple([-1] + list(v[1:])) for k, v in feature_dimensions.items()}
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
writer = tf.python_io.TFRecordWriter(save_name)
# Create TFRecords file
for i in range(examples):
    example_dict = dict()  # New dictionary for each single example
    for name, data in data_dict.items():
        example_dict[name] = data[i]
    # Define the features of your tfrecord
    feature = {k: _bytes_feature(tf.compat.as_bytes(v.tostring())) for k, v in example_dict.items()}
    # Serialize to string and write to file
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()
# Declare functions for creating dataset
def _parse_function(proto, feature_dimensions_: dict):
    # Define your tfrecord again. Remember that you saved your image as a string.
    keys_to_features = {k: tf.FixedLenFeature([], tf.string) for k in feature_dimensions_.keys()}
    # Load one example
    parsed_features = tf.parse_single_example(proto, keys_to_features)
    # Decode the raw bytes of each feature back into float32 values
    for k, v in parsed_features.items():
        parsed_features[k] = tf.decode_raw(v, tf.float32)
    return parsed_features
def create_tf_dataset(file_paths: str, feature_dimensions_: dict, batch_size=64):
    # This works with arrays as well
    dataset = tf.data.TFRecordDataset(file_paths)
    # Maps the parser on every filepath in the array. You can set the number of parallel loaders here
    parse_function = partial(_parse_function, feature_dimensions_=feature_dimensions_)
    dataset = dataset.map(parse_function, num_parallel_calls=1)
    # This dataset will go on forever
    dataset = dataset.repeat()
    # Set the number of datapoints you want to load and shuffle
    dataset = dataset.shuffle(batch_size)  # Put whatever you want here
    # Set the batchsize
    dataset = dataset.batch(batch_size)
    # Set up a pipeline
    dataset = dataset.prefetch(batch_size)  # Put whatever you want here
    # Create an iterator
    iterator = dataset.make_one_shot_iterator()
    # Create your tf representation of the iterator
    parsed_features = iterator.get_next()
    # Reshape arrays and cast to float
    for k, v in parsed_features.items():
        parsed_features[k] = tf.reshape(v, feature_dimensions_[k])
    for k, v in parsed_features.items():
        parsed_features[k] = tf.cast(v, tf.float32)
    return parsed_features
# Create dataset
ds = create_tf_dataset(save_name, feature_dimensions, batch_size=64)
# The final result is a dictionary with the names used above
sample = tf.Session().run(ds)
print("Sample Length:", len(sample))
print("Sample Keys:", sample.keys())
print("images shape:", sample["images"].shape)
print("x_data shape:", sample["x_data"].shape)
print("y_data shape:", sample["y_data"].shape)
Printed result:
Sample Length: 3
Sample Keys: dict_keys(['images', 'x_data', 'y_data'])
images shape: (64, 10, 15, 15)
x_data shape: (64, 50)
y_data shape: (64, 1)
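As a rough follow-up sketch, the parsed tensors could then be wired into a two-input Keras regression model in this TF 1.x graph-mode style, using Input(tensor=...) and target_tensors; the layer sizes and model structure below are placeholders, not a prescribed architecture:
# Hypothetical model: structure and sizes are illustrative only.
img_in = tf.keras.layers.Input(tensor=ds["images"])    # (batch, 10, 15, 15)
x_in = tf.keras.layers.Input(tensor=ds["x_data"])      # (batch, 50)
h = tf.keras.layers.Flatten()(img_in)
h = tf.keras.layers.concatenate([h, x_in])
h = tf.keras.layers.Dense(64, activation="relu")(h)
out = tf.keras.layers.Dense(1)(h)                      # single regression output
model = tf.keras.Model(inputs=[img_in, x_in], outputs=out)
# In TF 1.x Keras, targets that are already tensors are passed via target_tensors,
# and fit() only needs steps_per_epoch since the data comes from the graph itself.
model.compile(optimizer="adam", loss="mse", target_tensors=[ds["y_data"]])
model.fit(steps_per_epoch=examples // 64, epochs=1)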