我有以下代码。数据集可以下载here或here。该数据集包含分类为cat
或dog
的图像。
此代码的任务是用于训练猫狗图像数据。 因此,给定一张图片,它可以判断它是猫还是狗。 这是page的动机。以下是成功运行的代码:
library(keras)
library(tidyverse)
# Organize dataset --------------------------------------------------------
options(warn = -1)
# Ths input
original_dataset_dir <- "data/kaggle_cats_dogs/original/"
# Create new organized dataset directory ----------------------------------
base_dir <- "data/kaggle_cats_dogs_small/"
dir.create(base_dir)
model_dir <- paste0(base_dir, "model/")
dir.create(model_dir)
train_dir <- file.path(base_dir, "train")
dir.create(train_dir)
validation_dir <- file.path(base_dir, "validation")
dir.create(validation_dir)
test_dir <- file.path(base_dir, "test")
dir.create(test_dir)
train_cats_dir <- file.path(train_dir, "cats")
dir.create(train_cats_dir)
train_dogs_dir <- file.path(train_dir, "dogs")
dir.create(train_dogs_dir)
validation_cats_dir <- file.path(validation_dir, "cats")
dir.create(validation_cats_dir)
validation_dogs_dir <- file.path(validation_dir, "dogs")
dir.create(validation_dogs_dir)
test_cats_dir <- file.path(test_dir, "cats")
dir.create(test_cats_dir)
test_dogs_dir <- file.path(test_dir, "dogs")
dir.create(test_dogs_dir)
# Copying files from original dataset to newly created directory
fnames <- paste0("cat.", 1:1000, ".jpg")
dum <- file.copy(
file.path(original_dataset_dir, fnames),
file.path(train_cats_dir)
)
fnames <- paste0("cat.", 1001:1500, ".jpg")
dum <- file.copy(
file.path(original_dataset_dir, fnames),
file.path(validation_cats_dir)
)
fnames <- paste0("cat.", 1501:2000, ".jpg")
dum <- file.copy(
file.path(original_dataset_dir, fnames),
file.path(test_cats_dir)
)
fnames <- paste0("dog.", 1:1000, ".jpg")
dum <- file.copy(
file.path(original_dataset_dir, fnames),
file.path(train_dogs_dir)
)
fnames <- paste0("dog.", 1001:1500, ".jpg")
dum <- file.copy(
file.path(original_dataset_dir, fnames),
file.path(validation_dogs_dir)
)
fnames <- paste0("dog.", 1501:2000, ".jpg")
dum <- file.copy(
file.path(original_dataset_dir, fnames),
file.path(test_dogs_dir)
)
options(warn = 0)
# Making model ------------------------------------------------------------
conv_base <- application_vgg16(
weights = "imagenet",
include_top = FALSE,
input_shape = c(150, 150, 3)
)
model <- keras_model_sequential() %>%
conv_base() %>%
layer_flatten() %>%
layer_dense(units = 256, activation = "relu") %>%
layer_dense(units = 1, activation = "sigmoid")
summary(model)
length(model$trainable_weights)
freeze_weights(conv_base)
length(model$trainable_weights)
# Train model -------------------------------------------------------------
train_datagen <- image_data_generator(
rescale = 1 / 255,
rotation_range = 40,
width_shift_range = 0.2,
height_shift_range = 0.2,
shear_range = 0.2,
zoom_range = 0.2,
horizontal_flip = TRUE,
fill_mode = "nearest"
)
# Note that the validation data shouldn't be augmented!
test_datagen <- image_data_generator(rescale = 1 / 255)
train_generator <- flow_images_from_directory(
train_dir, # Target directory
train_datagen, # Data generator
target_size = c(150, 150), # Resizes all images to 150 × 150
batch_size = 20,
class_mode = "binary" # binary_crossentropy loss for binary labels
)
test_generator <- flow_images_from_directory(
test_dir, # Target directory
train_datagen, # Data generator
target_size = c(150, 150), # Resizes all images to 150 × 150
batch_size = 20,
class_mode = "binary" # binary_crossentropy loss for binary labels
)
validation_generator <- flow_images_from_directory(
validation_dir,
test_datagen,
target_size = c(150, 150),
batch_size = 20,
class_mode = "binary"
)
# Fine tuning -------------------------------------------------------------
unfreeze_weights(conv_base, from = "block3_conv1")
# Compile model -----------------------------------------------------------
model %>% compile(
loss = "binary_crossentropy",
optimizer = optimizer_rmsprop(lr = 2e-5),
metrics = c("accuracy")
)
# Evaluate by epochs ---------------------------------------------------------------
# # This create plots accuracy of various epochs (slow)
history <- model %>% fit_generator(
train_generator,
steps_per_epoch = 100,
epochs = 50, # was 50
validation_data = validation_generator,
validation_steps = 50
)
# Plot --------------------------------------------------------------------
# plot(history)
评估,预测概率代码是这个
# Check classes of data --------------------------------------------------
train_generator$class_indices
# Evaluate ----------------------------------------------------------------
model %>% evaluate_generator(test_generator, steps = 50)
#$loss
#[1] 0.3161949
#$acc
#[1] 0.932
predict <- model %>%
predict_generator(test_generator, step = 50, verbose = 1)
predict
的输出是:
as.tibble(predict) %>%
rename(predict_proba = V1) %>%
mutate(label = ifelse(predict_proba > 0.5, 1, 0)) %>%
mutate(label = as.integer(label)) %>%
mutate(label_name = ifelse(label == 0, "cat", "dog")) %>%
head(n=5)
# A tibble: 5 x 3
# predict_proba label label_name
# <dbl> <int> <chr>
#1 1.000000e+00 1 dog
#2 4.278725e-02 0 cat
#3 4.198529e-15 0 cat
#4 8.683033e-06 0 cat
#5 1.000000e+00 1 dog
我的问题是基于predict
我想知道的概率
存储在test_dir
?
基本上我想通过眼睛检查概率分配是否正确和合理。
我的base_dir
目录的结构如下:
.
|-- model
|-- test
| |-- cats
| `-- dogs
|-- train
| |-- cats
| `-- dogs
`-- validation
|-- cats
`-- dogs
更新
我试过[sladomic]的建议。并制作以下代码:
stat_df <- as.tibble(cbind(predict, test_generator$filenames)) %>%
# assign prediction probability for filenames
rename(
predict_proba = V1,
filename = V2
) %>%
mutate(predicted_label = ifelse(predict_proba > 0.5, 1, 0)) %>%
mutate(predicted_label = as.integer(predicted_label)) %>%
mutate(predicted_label_name = ifelse(predicted_label == 0, "cats", "dogs")) %>%
# image name is the true label name
separate(filename, into=c("true_label","fname"), sep = "[//]" )
stat_df
我得到以下数据框:
> stat_df
# A tibble: 1,000 x 5
predict_proba true_label fname predicted_label predicted_label_name
* <chr> <chr> <chr> <int> <chr>
1 2.45413622756985e-09 cats cat.1501.jpg 1 dogs
2 4.18112916275648e-20 cats cat.1502.jpg 1 dogs
3 1.25922511529097e-07 cats cat.1503.jpg 1 dogs
4 3.76460201987477e-14 cats cat.1504.jpg 1 dogs
5 6.77461059694906e-07 cats cat.1505.jpg 1 dogs
6 0.000256105791777372 cats cat.1506.jpg 0 cats
7 0.959224164485931 cats cat.1507.jpg 1 dogs
8 0.000318235805025324 cats cat.1508.jpg 0 cats
9 9.03555774129927e-05 cats cat.1509.jpg 1 dogs
10 2.40483113884693e-05 cats cat.1510.jpg 1 dogs
我检查了1000张测试图像,这些是预测标签的比例:
> stat_df %>% group_by(predicted_label_name) %>% summarise(n=n())
# A tibble: 2 x 2
predicted_label_name n
<chr> <int>
1 cats 191
2 dogs 809
我检查正确预测为狗或猫的预测数量
> stat_df %>% filter(true_label == predicted_label_name & true_label == "dogs") %>% dim()
[1] 439 5
> stat_df %>% filter(true_label == predicted_label_name & true_label == "cats") %>% dim()
[1] 130 5
其中说809预测只有439被正确预测为狗(准确率约为54%)。这是为什么?我错过了什么?
请注意,evaluate_generator()
的准确率约为93%。
什么是正确的解释?
答案 0 :(得分:2)
test_generator$filenames
为您提供了文件名列表
答案 1 :(得分:1)
OP解决方案。
通过添加mutate(predict_proba = as.double(predict_proba)
解决问题,此处:
stat_df <- as.tibble(cbind(predict, test_generator$filenames)) %>%
# assign prediction probability for filenames
rename(
predict_proba = V1,
filename = V2
) %>%
mutate(predict_proba = as.double(predict_proba) %>%
mutate(predicted_label = ifelse(predict_proba > 0.5, 1, 0)) %>%
mutate(predicted_label = as.integer(predicted_label)) %>%
mutate(predicted_label_name = ifelse(predicted_label == 0, "cats", "dogs")) %>%
# image name is the true label name
separate(filename, into=c("true_label","fname"), sep = "[//]" )