如何从R keras中的predict_generator()输出中检查相应的文件

时间:2018-01-26 16:06:07

标签: r deep-learning keras

我有以下代码。数据集可以在这里(here)下载。该数据集包含被分类为 cat 和 dog 的图像。

这段代码的任务是用猫和狗的图像数据训练模型,使其在给定一张图片时能够判断它是猫还是狗。这段代码参考了这个页面(page)。以下是可以成功运行的代码:

library(keras)
library(tidyverse)


# Organize dataset --------------------------------------------------------

# The input: directory holding the original images, named cat.<N>.jpg and
# dog.<N>.jpg.
original_dataset_dir <- "data/kaggle_cats_dogs/original/"


# Create new organized dataset directory ----------------------------------

base_dir <- "data/kaggle_cats_dogs_small/"
model_dir <- paste0(base_dir, "model/")

train_dir <- file.path(base_dir, "train")
validation_dir <- file.path(base_dir, "validation")
test_dir <- file.path(base_dir, "test")

train_cats_dir <- file.path(train_dir, "cats")
train_dogs_dir <- file.path(train_dir, "dogs")
validation_cats_dir <- file.path(validation_dir, "cats")
validation_dogs_dir <- file.path(validation_dir, "dogs")
test_cats_dir <- file.path(test_dir, "cats")
test_dogs_dir <- file.path(test_dir, "dogs")

# Parents are listed before their children because dir.create() is called
# non-recursively. showWarnings = FALSE silences only dir.create()'s own
# failure warnings (e.g. the directory already exists on a re-run) instead of
# switching off every warning globally via options(warn = -1).
dirs_to_create <- c(
  base_dir,
  model_dir,
  train_dir, validation_dir, test_dir,
  train_cats_dir, train_dogs_dir,
  validation_cats_dir, validation_dogs_dir,
  test_cats_dir, test_dogs_dir
)
for (dir_path in dirs_to_create) {
  dir.create(dir_path, showWarnings = FALSE)
}

# Copying files from original dataset to newly created directory

# Copy "<prefix>.<id>.jpg" for every id in `ids` from original_dataset_dir
# into `dest_dir`. Returns file.copy()'s logical success vector invisibly.
copy_images <- function(prefix, ids, dest_dir) {
  fnames <- paste0(prefix, ".", ids, ".jpg")
  invisible(file.copy(file.path(original_dataset_dir, fnames), dest_dir))
}

# Per class: ids 1-1000 for training, 1001-1500 for validation,
# 1501-2000 for testing.
copy_images("cat", 1:1000, train_cats_dir)
copy_images("cat", 1001:1500, validation_cats_dir)
copy_images("cat", 1501:2000, test_cats_dir)

copy_images("dog", 1:1000, train_dogs_dir)
copy_images("dog", 1001:1500, validation_dogs_dir)
copy_images("dog", 1501:2000, test_dogs_dir)

# Making model ------------------------------------------------------------


# VGG16 with ImageNet weights and without its top (classifier) layers,
# configured for 150 x 150 inputs with 3 channels.
conv_base <- application_vgg16(
  weights = "imagenet",
  include_top = FALSE,
  input_shape = c(150, 150, 3)
)


# Stack a small classifier on top of conv_base: flatten, one 256-unit ReLU
# layer, and a single sigmoid unit as the output.
model <- keras_model_sequential() %>%
  conv_base() %>%
  layer_flatten() %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

summary(model)

# Print the number of trainable weight tensors before and after freezing
# conv_base, to confirm that the freeze took effect.
length(model$trainable_weights)
freeze_weights(conv_base)
length(model$trainable_weights)



# Train model -------------------------------------------------------------

# Training images are rescaled to [0, 1] and randomly augmented.
train_datagen <- image_data_generator(
  rescale = 1 / 255,
  rotation_range = 40,
  width_shift_range = 0.2,
  height_shift_range = 0.2,
  shear_range = 0.2,
  zoom_range = 0.2,
  horizontal_flip = TRUE,
  fill_mode = "nearest"
)

# Note that the validation and test data shouldn't be augmented!
test_datagen <- image_data_generator(rescale = 1 / 255)

train_generator <- flow_images_from_directory(
  train_dir, # Target directory
  train_datagen, # Data generator
  target_size = c(150, 150), # Resizes all images to 150 x 150
  batch_size = 20,
  class_mode = "binary" # binary_crossentropy loss for binary labels
)

# The test generator only rescales (test_datagen, no augmentation) and reads
# the files in a fixed order (shuffle = FALSE), so that row i of the
# predictions corresponds to test_generator$filenames[i].
test_generator <- flow_images_from_directory(
  test_dir, # Target directory
  test_datagen, # Data generator without augmentation
  target_size = c(150, 150), # Resizes all images to 150 x 150
  batch_size = 20,
  class_mode = "binary", # binary_crossentropy loss for binary labels
  shuffle = FALSE # Keep file order stable for prediction
)

validation_generator <- flow_images_from_directory(
  validation_dir,
  test_datagen,
  target_size = c(150, 150),
  batch_size = 20,
  class_mode = "binary"
)


# Fine tuning -------------------------------------------------------------

# Unfreeze conv_base starting at the layer named "block3_conv1".
unfreeze_weights(conv_base, from = "block3_conv1")

# Compile model -----------------------------------------------------------

compile(
  model,
  loss = "binary_crossentropy",
  optimizer = optimizer_rmsprop(lr = 2e-5),
  metrics = "accuracy"
)

# Evaluate  by epochs  ---------------------------------------------------------------

# Train on the augmented training generator and validate after every epoch.
# The returned history can be plotted to compare accuracy across epochs (slow).
history <- fit_generator(
  model,
  train_generator,
  steps_per_epoch = 100,
  epochs = 50,
  validation_data = validation_generator,
  validation_steps = 50
)

# Plot --------------------------------------------------------------------
# plot(history)

用于评估模型和预测概率的代码如下:

# Check classes of data  --------------------------------------------------

# Show which integer label the generator assigned to each class directory.
train_generator$class_indices


# Evaluate ----------------------------------------------------------------
model %>% evaluate_generator(test_generator, steps = 50)
#$loss
#[1] 0.3161949
#$acc
#[1] 0.932

# Rewind the generator before predicting so that prediction starts from the
# first batch again after the evaluation pass above.
# NOTE(review): relies on the Python iterator's reset() method being exposed
# on the R object - confirm with the installed keras version.
invisible(test_generator$reset())

# `steps` is spelled out in full (the original `step` only worked through
# partial argument matching). The result keeps the name `predict` because the
# later snippets refer to it.
predict <- model %>%
  predict_generator(test_generator, steps = 50, verbose = 1)

predict的输出是:

# Preview the first predictions with a hard label per probability:
# > 0.5 becomes label 1 ("dog"), anything else label 0 ("cat").
# Building the tibble directly avoids the deprecated as.tibble() and the
# reliance on the auto-generated column name V1.
# NOTE(review): assumes `predict` holds one probability per image (a
# one-column matrix) - confirm against the model output.
tibble(predict_proba = as.numeric(predict)) %>%
  mutate(
    label = as.integer(predict_proba > 0.5),
    label_name = ifelse(label == 0L, "cat", "dog")
  ) %>%
  head(n = 5)

# A tibble: 5 x 3
#  predict_proba label label_name
#          <dbl> <int>      <chr>
#1  1.000000e+00     1        dog
#2  4.278725e-02     0        cat
#3  4.198529e-15     0        cat
#4  8.683033e-06     0        cat
#5  1.000000e+00     1        dog

我的问题是:对于 predict 中的每一个概率,我想知道它对应于 test_dir 中存储的哪个文件。

基本上我想通过眼睛检查概率分配是否正确和合理。

我的base_dir目录的结构如下:

.
|-- model
|-- test
|   |-- cats
|   `-- dogs
|-- train
|   |-- cats
|   `-- dogs
`-- validation
    |-- cats
    `-- dogs

enter image description here

更新

我尝试了 [sladomic] 的建议,并编写了以下代码:

# Pair every predicted probability with its file name, then derive the
# predicted and the true label for each test image.
# NOTE(review): cbind() of the numeric predictions with the character file
# names produces a character matrix, so predict_proba ends up as <chr> (see
# the printed tibble below) and `predict_proba > 0.5` compares strings
# rather than numbers.
stat_df <- as.tibble(cbind(predict, test_generator$filenames)) %>%
  # name the two columns: the probability and the file name
  rename(
    predict_proba = V1,
    filename = V2
  ) %>%
  mutate(predicted_label = ifelse(predict_proba > 0.5, 1, 0)) %>%
  mutate(predicted_label = as.integer(predicted_label)) %>%
  mutate(predicted_label_name = ifelse(predicted_label == 0, "cats", "dogs")) %>%
  # the part of the file name before "/" is the true label name
  separate(filename, into=c("true_label","fname"), sep = "[//]" )

stat_df

我得到以下数据框:

> stat_df
# A tibble: 1,000 x 5
          predict_proba true_label        fname predicted_label predicted_label_name
 *                <chr>      <chr>        <chr>           <int>                <chr>
 1 2.45413622756985e-09       cats cat.1501.jpg               1                 dogs
 2 4.18112916275648e-20       cats cat.1502.jpg               1                 dogs
 3 1.25922511529097e-07       cats cat.1503.jpg               1                 dogs
 4 3.76460201987477e-14       cats cat.1504.jpg               1                 dogs
 5 6.77461059694906e-07       cats cat.1505.jpg               1                 dogs
 6 0.000256105791777372       cats cat.1506.jpg               0                 cats
 7    0.959224164485931       cats cat.1507.jpg               1                 dogs
 8 0.000318235805025324       cats cat.1508.jpg               0                 cats
 9 9.03555774129927e-05       cats cat.1509.jpg               1                 dogs
10 2.40483113884693e-05       cats cat.1510.jpg               1                 dogs

我检查了1000张测试图像,这些是预测标签的比例:

> stat_df %>% group_by(predicted_label_name) %>% summarise(n=n())
# A tibble: 2 x 2
  predicted_label_name     n
                 <chr> <int>
1                 cats   191
2                 dogs   809

我检查正确预测为狗或猫的预测数量

 > stat_df %>% filter(true_label == predicted_label_name & true_label == "dogs")  %>% dim()
[1] 439   5
> stat_df %>% filter(true_label == predicted_label_name & true_label == "cats")  %>% dim()
[1] 130   5

也就是说,在 809 个被预测为狗的样本中,只有 439 个是正确的(准确率约为 54%)。这是为什么?我遗漏了什么?

请注意,evaluate_generator() 给出的准确率约为 93%。正确的解释是什么?

2 个答案:

答案 0 :(得分:2)

test_generator$filenames为您提供了文件名列表

答案 1 :(得分:1)

OP解决方案。

通过添加 mutate(predict_proba = as.double(predict_proba)) 解决了问题,如下所示:

# Pair every predicted probability with its file name, then derive the
# predicted and the true label for each test image.
stat_df <- as_tibble(cbind(predict, test_generator$filenames)) %>%
  # name the two columns: the probability and the file name
  rename(
    predict_proba = V1,
    filename = V2
  ) %>%
  # cbind() turned the probabilities into strings; convert them back to
  # numbers so that the 0.5 threshold below is a numeric comparison
  mutate(predict_proba = as.double(predict_proba)) %>%
  mutate(
    predicted_label = as.integer(predict_proba > 0.5),
    predicted_label_name = ifelse(predicted_label == 0L, "cats", "dogs")
  ) %>%
  # the part of the file name before "/" is the true label name
  separate(filename, into = c("true_label", "fname"), sep = "[//]")