Using tf.data

Asked: 2018-10-13 12:38:23

Tags: python tensorflow mnist

I am trying to use tf.data to split the MNIST training set of 60000 images into 55000 training images and 5000 validation images.

When I run:

session_config = tf.ConfigProto(log_device_placement=False)
config = tf.estimator.RunConfig(tf_random_seed=230,
                                model_dir=chpt_dir_path,
                                save_checkpoints_steps=params["save_checkpoints_steps"],
                                keep_checkpoint_max=params["keep_checkpoint_max"],
                                session_config=session_config)
estimator = tf.estimator.Estimator(model_fn=model_fn, params=params, config=config)
train_dataset, valid_dataset = train_input_fn(args.DATA_DIR_PATH, params)
estimator.train(lambda: train_dataset)

the error is:

Tensor("Iterator:0", shape=(), dtype=resource) must be from the same graph as Tensor("PrefetchDataset:0", shape=(), dtype=variant).

The problem comes from this function:

def train_input_fn(data_dir_path, params):
    """Train input function for the MNIST dataset.

    Args:
        data_dir_path: (string) path to the data directory
        params: (Params) contains hyperparameters of the model (ex: `params.num_epochs`)
    """
    dataset = train(data_dir_path)
    dataset = dataset.shuffle(params["train_size"] + params["valid_size"], seed=416)  # whole dataset into the buffer

    train_dataset = dataset.take(params["train_size"])
    valid_dataset = dataset.skip(params["train_size"])

    train_dataset = train_dataset.batch(params["batch_size"])
    train_dataset = train_dataset.shuffle(params["train_size"])
    train_dataset = train_dataset.prefetch(1)  # make sure you always have one batch ready to serve

    valid_dataset = valid_dataset.batch(params["batch_size"])
    valid_dataset = valid_dataset.shuffle(params["valid_size"])
    valid_dataset = valid_dataset.prefetch(1)  # make sure you always have one batch ready to serve
    return train_dataset, valid_dataset
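
A side note on the split itself, separate from the graph error: shuffle reshuffles on every pass by default, so I suspect take/skip over it may not carve out the same 55000/5000 examples on every epoch. My (unverified) understanding is that pinning the seeded order avoids that, for example:

# Possible tweak (unverified): keep the shuffle order fixed so that
# take()/skip() always select the same training/validation examples.
dataset = dataset.shuffle(params["train_size"] + params["valid_size"],
                          seed=416,
                          reshuffle_each_iteration=False)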

I do not know how to fix this. Does anyone know, or is there a better way to split the data with tf.data?
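
My guess is that the error comes from building the datasets before calling estimator.train, so the iterator the Estimator creates lives in a different graph than the prefetched dataset. One workaround I am considering (a sketch only, names like train_input_fn_v2 are placeholders and I have not verified it) is to build the pipeline inside the input function and pass that function to the Estimator, so everything is constructed in the Estimator's own graph:

def train_input_fn_v2(data_dir_path, params):
    # Build the whole pipeline here so it is created inside the graph the
    # Estimator sets up when it calls this function.
    dataset = train(data_dir_path)
    dataset = dataset.shuffle(params["train_size"] + params["valid_size"],
                              seed=416,
                              reshuffle_each_iteration=False)
    train_dataset = dataset.take(params["train_size"])
    train_dataset = train_dataset.batch(params["batch_size"])
    train_dataset = train_dataset.prefetch(1)
    return train_dataset


def valid_input_fn_v2(data_dir_path, params):
    # Same idea for the validation split, to be used with estimator.evaluate.
    dataset = train(data_dir_path)
    dataset = dataset.shuffle(params["train_size"] + params["valid_size"],
                              seed=416,
                              reshuffle_each_iteration=False)
    valid_dataset = dataset.skip(params["train_size"])
    valid_dataset = valid_dataset.batch(params["batch_size"])
    valid_dataset = valid_dataset.prefetch(1)
    return valid_dataset


estimator.train(lambda: train_input_fn_v2(args.DATA_DIR_PATH, params))
estimator.evaluate(lambda: valid_input_fn_v2(args.DATA_DIR_PATH, params))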

The following code loads the MNIST dataset and builds the data pipeline.

import gzip
import os
import shutil
import urllib.request

import tensorflow as tf


def download(data_dir_path, filename):
    """Download (and unzip) a file from the MNIST dataset if not already done."""
    filepath = os.path.join(data_dir_path, filename)
    if tf.gfile.Exists(filepath):
        return filepath
    if not tf.gfile.Exists(data_dir_path):
        tf.gfile.MakeDirs(data_dir_path)

    # CVDF mirror of http://yann.lecun.com/exdb/mnist/
    url = "https://storage.googleapis.com/cvdf-datasets/mnist/" + filename + ".gz"
    zipped_filepath = filepath + ".gz"
    print("Downloading %s to %s" % (url, zipped_filepath))
    urllib.request.urlretrieve(url, zipped_filepath)
    with gzip.open(zipped_filepath, "rb") as f_in, open(filepath, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(zipped_filepath)
    return filepath


def dataset(data_dir_path, images_file, labels_file):
    images_file_path = download(data_dir_path, images_file)
    labels_file_path = download(data_dir_path, labels_file)

    def decode_image(image):
        # Normalize from [0, 255] to [0.0, 1.0]
        image = tf.decode_raw(image, tf.uint8)
        image = tf.cast(image, tf.float32)
        image = tf.reshape(image, [784])
        return image / 255.0

    def decode_label(label):
        label = tf.decode_raw(label, tf.uint8)  # tf.string -> [tf.uint8]
        label = tf.reshape(label, [])  # label is a scalar
        return tf.to_int32(label)

    images = tf.data.FixedLengthRecordDataset(images_file_path, 28 * 28, header_bytes=16)
    images = images.map(decode_image)
    labels = tf.data.FixedLengthRecordDataset(labels_file_path, 1, header_bytes=8)
    labels = labels.map(decode_label)
    return tf.data.Dataset.zip((images, labels))

def train(data_dir_path):
    """tf.data.Dataset object for MNIST training data."""
    return dataset(data_dir_path, "train-images-idx3-ubyte", "train-labels-idx1-ubyte")
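
For reference, this is roughly how I sanity-check the pipeline on its own (the path "data/mnist" below is just a placeholder for my local data directory):

# Pull two decoded examples to confirm the images come out as length-784
# float vectors in [0, 1] and the labels as int32 scalars.
ds = train("data/mnist").batch(2)
iterator = ds.make_one_shot_iterator()
images, labels = iterator.get_next()
with tf.Session() as sess:
    imgs, lbls = sess.run([images, labels])
    print(imgs.shape, imgs.min(), imgs.max(), lbls)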

0 Answers:

No answers