Question: I'm doing multi-class image classification with TensorFlow 2.5 on Google Colab. I'm getting three different values for classification accuracy and I don't know which one to trust, or why they differ from one another.
Demonstration:
When I evaluate on the test set, I get accuracy_1:
29/29 [==============================] - 5s 147ms/step - loss: 1.1036 - accuracy: 0.3186
When I predict on the test set, I get accuracy_2, i.e. 0.22:
              precision    recall  f1-score   support

           0       0.69      0.12      0.21      1305
           1       0.15      0.78      0.26       272
           2       0.14      0.13      0.13       231

    accuracy                           0.22      1808
   macro avg       0.33      0.34      0.20      1808
weighted avg       0.54      0.22      0.20      1808
Here is how I get accuracy_3, whose value is 0.2129424778761062:
import numpy as np
from sklearn.metrics import accuracy_score

prediction = np.argmax(detector.predict(test_dataset), axis=1)
accuracy_3 = accuracy_score(
    np.concatenate([label.numpy() for image, label in test_dataset.take(-1)]),
    prediction
)
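A quick way to check whether test_dataset yields its labels in the same order on every pass (a sketch; the variable names are just illustrative):

labels_pass_1 = np.concatenate([label.numpy() for image, label in test_dataset])
labels_pass_2 = np.concatenate([label.numpy() for image, label in test_dataset])
# if the input pipeline is nondeterministic, I would expect this to print False
print(np.array_equal(labels_pass_1, labels_pass_2))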
I found that if I run the block of code that calculates accuracy_3 multiple times, I get a different result every time, though never far from accuracy_2, i.e. 0.22. Below is the code that calculates accuracy_1 and accuracy_2:
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import Callback

class Peek(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        current_decayed_lr = self.model.optimizer._decayed_lr(tf.float32).numpy()
        print(f"Current learning rate: {current_decayed_lr}")

    def on_epoch_end(self, epoch, logs=None):
        print("Evaluating...")
        self.model.evaluate(test_dataset, verbose=1)  # calculates accuracy_1
        print("Predicting...")
        predictions = np.argmax(self.model.predict(test_dataset), axis=1)
        true_categories = np.array([label.numpy() for image, label in test_dataset.unbatch()])
        print(classification_report(true_categories, predictions))  # calculates accuracy_2
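For reference, an order-safe way to compute the same accuracy would be to pair each batch's predictions with that same batch's labels in a single pass (a sketch, not something I currently run):

# compute accuracy in one pass so predictions and labels are guaranteed
# to come from the same batches, in the same order
correct, total = 0, 0
for image_batch, label_batch in test_dataset:
    batch_preds = np.argmax(detector.predict_on_batch(image_batch), axis=1)
    correct += int(np.sum(batch_preds == label_batch.numpy()))
    total += int(label_batch.shape[0])
print(f"single-pass accuracy: {correct / total}")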
The difference between accuracy_2 and accuracy_3 is plausibly due to random chance, but accuracy_1 is much larger than both. I searched Stack Overflow, and some posts say the discrepancy can be caused by shuffle=True in ImageDataGenerator when creating the test set. My case is different, though, since I'm not using ImageDataGenerator; I'm loading the data from TFRecords (for performance). Here is the full code:
import os
import math
import numpy as np
import tensorflow as tf
from glob import glob
from progressbar import progressbar
from os.path import basename, exists
from tensorflow.sparse import to_dense
from tensorflow.data import Dataset, Options, TFRecordDataset
from tensorflow.image import decode_jpeg, encode_jpeg, resize
from tensorflow.train import Feature, Features, BytesList, Int64List, FloatList, Example
from tensorflow.io import read_file, TFRecordWriter, FixedLenFeature, VarLenFeature, parse_single_example
from tensorflow.data.experimental import AUTOTUNE
class DataLoader:
    def __init__(self, subset_name):
        self.subset_name = subset_name
        self.file_pattern = glob(
            f"./dataset/{self.subset_name}/**/*.jpg",
            recursive=True
        )
        self.target_size = (224, 224)
        self.classes = [b"Negative", b"Positive", b"Unreadable"]
        self.n_images = len(self.file_pattern)
        self.n_shards = 32
        self.write_shard_size = math.ceil(1.0 * self.n_images / self.n_shards)
        self.read_shard_size = 64
        self.output_dir = f"tfrecords-jpeg-{subset_name}-{'x'.join(map(lambda x: str(x), self.target_size))}"

    def fetch_image_and_label(self, filename):
        bits = read_file(filename)
        image = decode_jpeg(bits)
        image = resize(image, self.target_size)
        height = tf.shape(image)[0]
        width = tf.shape(image)[1]
        image = tf.cast(image, tf.uint8)
        image = encode_jpeg(image, optimize_size=True, chroma_downsampling=False)
        label = tf.expand_dims(filename, axis=-1)
        label = tf.strings.split(label, sep="/")
        label = label.values[-2]
        return image, label, height, width

    @staticmethod
    def _bytestring_feature(list_of_bytestrings):
        return Feature(bytes_list=BytesList(value=list_of_bytestrings))

    @staticmethod
    def _int_feature(list_of_ints):
        return Feature(int64_list=Int64List(value=list_of_ints))

    @staticmethod
    def _float_feature(list_of_floats):
        return Feature(float_list=FloatList(value=list_of_floats))

    def to_tfrecord(self, tfrec_filewriter, img_bytes, label, height, width):
        class_num = np.argmax(np.array(self.classes) == label)
        one_hot_class = np.eye(len(self.classes))[class_num]
        feature = {
            "image": self._bytestring_feature([img_bytes]),
            "class": self._int_feature([class_num]),
            "label": self._bytestring_feature([label]),
            "size": self._int_feature([height, width]),
            "one_hot_class": self._float_feature(one_hot_class.tolist())
        }
        return Example(features=Features(feature=feature))

    def write_records(self):
        print(f"{self.n_images} images, {self.n_shards} shards with {self.write_shard_size} images each.")
        filenames = Dataset.list_files(self.file_pattern, seed=35155)
        dataset = filenames.map(self.fetch_image_and_label, num_parallel_calls=AUTOTUNE).batch(self.write_shard_size)
        if not exists(self.output_dir):
            os.mkdir(self.output_dir)
        print("Writing TFRecords...")
        for shard, (image, label, height, width) in enumerate(dataset):
            shard_size = image.numpy().shape[0]
            filename = f"{self.output_dir}/{str(shard).zfill(2)}-{shard_size}.tfrec"
            with TFRecordWriter(filename) as out_file:
                for i in progressbar(range(shard_size)):
                    example = self.to_tfrecord(
                        out_file,
                        image.numpy()[i],
                        label.numpy()[i],
                        height.numpy()[i],
                        width.numpy()[i]
                    )
                    out_file.write(example.SerializeToString())
            print(f"Wrote file {filename} containing {shard_size} records")

    def _read_tfrecord(self, example):
        features = {
            "image": FixedLenFeature([], tf.string),
            "class": FixedLenFeature([], tf.int64),
            "label": FixedLenFeature([], tf.string),
            "size": FixedLenFeature([2], tf.int64),
            "one_hot_class": VarLenFeature(tf.float32)
        }
        example = parse_single_example(example, features)
        image = decode_jpeg(example["image"], channels=3)
        image = tf.reshape(image, [*self.target_size, 3])
        class_num = example["class"]
        label = example["label"]
        height = example["size"][0]
        width = example["size"][1]
        one_hot_class = to_dense(example["one_hot_class"])
        # return image, class_num, label, height, width, one_hot_class
        # return only image and class_num because we're classifying images
        return image, class_num

    def read_records(self):
        from tensorflow.io.gfile import glob
        option_no_order = Options()
        option_no_order.experimental_deterministic = False
        filenames = glob(f"{self.output_dir}/*.tfrec")
        dataset = TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
        dataset = dataset.with_options(option_no_order)
        dataset = dataset.map(self._read_tfrecord, num_parallel_calls=AUTOTUNE)
        dataset = dataset.shuffle(10000)
        dataset = dataset.prefetch(buffer_size=AUTOTUNE)
        dataset = dataset.batch(self.read_shard_size)
        return dataset
train_loader = DataLoader("train")
validation_loader = DataLoader("validation")
test_loader = DataLoader("test")
train_dataset = train_loader.read_records()
validation_dataset = validation_loader.read_records()
test_dataset = test_loader.read_records()
train_dataset = train_dataset.concatenate(validation_dataset)
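Note that read_records also sets option_no_order.experimental_deterministic = False, which lets the shard files interleave in a nondeterministic order even without the shuffle call. For comparison, a fully deterministic read would look something like this (a sketch; read_records_deterministic is a hypothetical helper, not part of my actual code):

def read_records_deterministic(loader):
    # same shards as loader.read_records, but read sequentially with
    # deterministic order and no shuffling, so every pass over the
    # dataset yields the examples in the same order
    from tensorflow.io.gfile import glob
    filenames = sorted(glob(f"{loader.output_dir}/*.tfrec"))
    dataset = TFRecordDataset(filenames)
    dataset = dataset.map(loader._read_tfrecord, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(loader.read_shard_size)
    return dataset.prefetch(buffer_size=AUTOTUNE)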
The difference between accuracy_2 and accuracy_3 persists, and accuracy_3 still changes every time I run the block of code that computes it, even after removing dataset = dataset.shuffle(10000) from def read_records(self) in the DataLoader class.
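For context, this is the reshuffling behavior that deleting that line was meant to rule out: Dataset.shuffle reshuffles on every iteration by default (reshuffle_each_iteration defaults to True). A toy sketch, unrelated to my actual data:

# toy sketch: shuffle() reshuffles each time the dataset is iterated
ds = tf.data.Dataset.range(10).shuffle(buffer_size=10)
print(list(ds.as_numpy_iterator()))  # one order
print(list(ds.as_numpy_iterator()))  # typically a different order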
I'll also paste the code showing how the model is instantiated and compiled, for more context.
from tensorflow.keras import Input, Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.applications.densenet import DenseNet201
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.applications.densenet import preprocess_input

def create_model():
    feature_extractor = DenseNet201(
        weights="imagenet",
        input_shape=(224, 224, 3),
        include_top=False
    )
    feature_extractor.trainable = True
    inputs = Input([224, 224, 3])
    x = preprocess_input(inputs)
    x = feature_extractor(x)
    x = GlobalAveragePooling2D()(x)
    x = Dense(32, activation="elu")(x)
    x = Dropout(0.8)(x)
    outputs = Dense(3, activation="softmax")(x)
    detector = Model(inputs, outputs)
    detector.compile(
        optimizer=SGD(learning_rate=0.001, momentum=0.9),
        loss=["sparse_categorical_crossentropy"],
        metrics=["sparse_categorical_accuracy"]
    )
    return detector

detector = create_model()
peek = Peek()
detector.fit(
    train_dataset,
    epochs=1,
    validation_data=test_dataset,
    class_weight=class_weight,  # class_weight is defined elsewhere (not shown)
    callbacks=[peek],
)
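class_weight is defined earlier in the notebook (not shown here). For context, a typical way to derive it from per-class example counts looks something like this (purely illustrative; the counts are placeholders, not my real data):

import numpy as np

# placeholder counts, not the real dataset statistics
counts = np.array([1000, 300, 200])
weights = counts.sum() / (len(counts) * counts)  # inverse-frequency weighting
class_weight = dict(enumerate(weights))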