I am trying to use the ENet deep learning model to segment road and non-road regions. I am using this GitHub repo: https://github.com/kwotsin/TensorFlow-ENet. Its original image size is 340x480, and it works fine for images of 340x480 or larger, but as soon as I resize the images to something smaller it stops working: the output shows artifacts of random black and white pixels. It fails even when I shrink the images while keeping the same aspect ratio.
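For illustration, the kind of resize I am doing looks like this (a minimal sketch; the cv2 calls and file names are placeholders for my actual pipeline, the point is only that the target size is smaller than 340x480):

import cv2

img = cv2.imread('./dataset/test/sample.png')           # a 340x480 image segments fine
small = cv2.resize(img, (240, 170))                     # half of 340x480, same aspect ratio
cv2.imwrite('./dataset/test/sample_small.png', small)   # predictions on this come out garbled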
Here is my ENet model code:
#Now actually start building the network
#(From the repo's enet.py: tf, slim, initial_block and bottleneck are defined earlier in the file.)
def ENet(inputs,
         num_classes,
         batch_size,
         num_initial_blocks=1,
         stage_two_repeat=2,
         skip_connections=True,
         reuse=None,
         is_training=True,
         scope='ENet'):
    '''
    The ENet model for real-time semantic segmentation!

    INPUTS:
    - inputs(Tensor): a 4D Tensor of shape [batch_size, image_height, image_width, num_channels] that represents one batch of preprocessed images.
    - num_classes(int): an integer for the number of classes to predict. This will determine the final output channels as the answer.
    - batch_size(int): the batch size to explicitly set the shape of the inputs in order for operations to work properly.
    - num_initial_blocks(int): the number of times to repeat the initial block.
    - stage_two_repeat(int): the number of times to repeat stage two in order to make the network deeper.
    - skip_connections(bool): if True, add the corresponding encoder feature maps to the decoder. They are of exactly the same shape.
    - reuse(bool): Whether or not to reuse the variables for evaluation.
    - is_training(bool): if True, switch on batch_norm and prelu only during training, otherwise they are turned off.
    - scope(str): a string that represents the scope name for the variables.

    OUTPUTS:
    - net(Tensor): a 4D Tensor output of shape [batch_size, image_height, image_width, num_classes], where each pixel has a one-hot encoded vector
      determining the label of the pixel.
    '''
    #Set the shape of the inputs first to get the batch_size information
    inputs_shape = inputs.get_shape().as_list()
    inputs.set_shape(shape=(batch_size, inputs_shape[1], inputs_shape[2], inputs_shape[3]))

    with tf.variable_scope(scope, reuse=reuse):
        #Set the primary arg scopes. Fused batch_norm is faster than normal batch norm.
        with slim.arg_scope([initial_block, bottleneck], is_training=is_training),\
             slim.arg_scope([slim.batch_norm], fused=True), \
             slim.arg_scope([slim.conv2d, slim.conv2d_transpose], activation_fn=None):
            #=================INITIAL BLOCK=================
            net = initial_block(inputs, scope='initial_block_1')
            for i in xrange(2, max(num_initial_blocks, 1) + 1):
                net = initial_block(net, scope='initial_block_' + str(i))

            #Save for skip connection later
            if skip_connections:
                net_one = net

            #===================STAGE ONE=======================
            net, pooling_indices_1, inputs_shape_1 = bottleneck(net, output_depth=64, filter_size=3, regularizer_prob=0.01, downsampling=True, scope='bottleneck1_0')
            net = bottleneck(net, output_depth=64, filter_size=3, regularizer_prob=0.01, scope='bottleneck1_1')
            net = bottleneck(net, output_depth=64, filter_size=3, regularizer_prob=0.01, scope='bottleneck1_2')
            net = bottleneck(net, output_depth=64, filter_size=3, regularizer_prob=0.01, scope='bottleneck1_3')
            net = bottleneck(net, output_depth=64, filter_size=3, regularizer_prob=0.01, scope='bottleneck1_4')

            #Save for skip connection later
            if skip_connections:
                net_two = net

            #regularization prob is 0.1 from bottleneck 2.0 onwards
            with slim.arg_scope([bottleneck], regularizer_prob=0.1):
                net, pooling_indices_2, inputs_shape_2 = bottleneck(net, output_depth=128, filter_size=3, downsampling=True, scope='bottleneck2_0')

                #Repeat stage two at least twice to get stages 2 and 3:
                for i in xrange(2, max(stage_two_repeat, 2) + 2):
                    net = bottleneck(net, output_depth=128, filter_size=3, scope='bottleneck'+str(i)+'_1')
                    net = bottleneck(net, output_depth=128, filter_size=3, dilated=True, dilation_rate=2, scope='bottleneck'+str(i)+'_2')
                    net = bottleneck(net, output_depth=128, filter_size=5, asymmetric=True, scope='bottleneck'+str(i)+'_3')
                    net = bottleneck(net, output_depth=128, filter_size=3, dilated=True, dilation_rate=4, scope='bottleneck'+str(i)+'_4')
                    net = bottleneck(net, output_depth=128, filter_size=3, scope='bottleneck'+str(i)+'_5')
                    net = bottleneck(net, output_depth=128, filter_size=3, dilated=True, dilation_rate=8, scope='bottleneck'+str(i)+'_6')
                    net = bottleneck(net, output_depth=128, filter_size=5, asymmetric=True, scope='bottleneck'+str(i)+'_7')
                    net = bottleneck(net, output_depth=128, filter_size=3, dilated=True, dilation_rate=16, scope='bottleneck'+str(i)+'_8')

            with slim.arg_scope([bottleneck], regularizer_prob=0.1, decoder=True):
                #===================STAGE FOUR========================
                bottleneck_scope_name = "bottleneck" + str(i + 1)

                #The decoder section, so start to upsample.
                net = bottleneck(net, output_depth=64, filter_size=3, upsampling=True,
                                 pooling_indices=pooling_indices_2, output_shape=inputs_shape_2, scope=bottleneck_scope_name+'_0')

                #Perform skip connections here
                if skip_connections:
                    net = tf.add(net, net_two, name=bottleneck_scope_name+'_skip_connection')

                net = bottleneck(net, output_depth=64, filter_size=3, scope=bottleneck_scope_name+'_1')
                net = bottleneck(net, output_depth=64, filter_size=3, scope=bottleneck_scope_name+'_2')

                #===================STAGE FIVE========================
                bottleneck_scope_name = "bottleneck" + str(i + 2)

                net = bottleneck(net, output_depth=16, filter_size=3, upsampling=True,
                                 pooling_indices=pooling_indices_1, output_shape=inputs_shape_1, scope=bottleneck_scope_name+'_0')

                #Perform skip connections here
                if skip_connections:
                    net = tf.add(net, net_one, name=bottleneck_scope_name+'_skip_connection')

                net = bottleneck(net, output_depth=16, filter_size=3, scope=bottleneck_scope_name+'_1')

            #=============FINAL CONVOLUTION=============
            logits = slim.conv2d_transpose(net, num_classes, [2,2], stride=2, scope='fullconv')
            probabilities = tf.nn.softmax(logits, name='logits_to_softmax')

    return logits, probabilities
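For reference, a minimal sketch of how this function is wired up (the placeholder shape is an illustrative assumption; ENet_arg_scope comes from the same enet.py):

# Hypothetical usage sketch: one batch of 360x480 RGB images.
# The encoder halves the resolution three times (initial block plus two
# downsampling bottlenecks) and the decoder restores it, so height and
# width should be divisible by 8 for the unpooling shapes to line up.
images_ph = tf.placeholder(tf.float32, shape=[10, 360, 480, 3])
with slim.arg_scope(ENet_arg_scope()):
    logits, probabilities = ENet(images_ph, num_classes=2, batch_size=10)
print(logits.get_shape())   # (10, 360, 480, 2)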
Here is the link to the full code: https://github.com/kwotsin/TensorFlow-ENet/blob/master/enet.py
The segmentation prediction code:
# Imports needed by this snippet (matching the repo's predict_segmentation.py):
import tensorflow as tf
import os
import numpy as np
from enet import ENet, ENet_arg_scope
from preprocessing import preprocess
from scipy.misc import imsave
slim = tf.contrib.slim

image_dir = './dataset/test/'
images_list = sorted([os.path.join(image_dir, file) for file in os.listdir(image_dir) if file.endswith('.png')])

checkpoint_dir = "log/original/"
listi = os.listdir(checkpoint_dir)
print(images_list)
checkpoint = tf.train.latest_checkpoint("/home/nikhil_m/TensorFlow-ENet/log/original")
print(checkpoint)

num_initial_blocks = 1
skip_connections = False
stage_two_repeat = 2

'''
#Labels to colours are obtained from here:
https://github.com/alexgkendall/SegNet-Tutorial/blob/c922cc4a4fcc7ce279dd998fb2d4a8703f34ebd7/Scripts/test_segmentation_camvid.py

However, the road_marking class is collapsed into the road class in the dataset provided.

Classes:
------------
Sky = [128,128,128]
Building = [128,0,0]
Pole = [192,192,128]
Road_marking = [255,69,0]
Road = [128,64,128]
Pavement = [60,40,222]
Tree = [128,128,0]
SignSymbol = [192,128,128]
Fence = [64,64,128]
Car = [64,0,128]
Pedestrian = [64,64,0]
Bicyclist = [0,128,192]
Unlabelled = [0,0,0]
'''
label_to_colours = {0: [128,128,128],
                    1: [0, 0, 0]}

#Create the photo directory
photo_dir = checkpoint_dir + "/test_images"
if not os.path.exists(photo_dir):
    os.mkdir(photo_dir)

#Create a function to convert each pixel label to colour.
def grayscale_to_colour(image):
    print 'Converting image...'
    image = image.reshape((256, 256, 1))
    image = np.repeat(image, 3, axis=-1)
    for i in xrange(image.shape[0]):
        for j in xrange(image.shape[1]):
            label = int(image[i][j][0])
            image[i][j] = np.array(label_to_colours[label])
    return image

with tf.Graph().as_default() as graph:
    images_tensor = tf.train.string_input_producer(images_list, shuffle=False)
    reader = tf.WholeFileReader()
    key, image_tensor = reader.read(images_tensor)
    image = tf.image.decode_png(image_tensor, channels=3)
    print(image.shape)
    # image = tf.image.resize_image_with_crop_or_pad(image, 360, 480)
    # image = tf.cast(image, tf.float32)
    image = preprocess(image)
    images = tf.train.batch([image], batch_size=10, allow_smaller_final_batch=True)

    #Create the model inference
    with slim.arg_scope(ENet_arg_scope()):
        logits, probabilities = ENet(images,
                                     num_classes=2,
                                     batch_size=10,
                                     is_training=True,
                                     reuse=None,
                                     num_initial_blocks=num_initial_blocks,
                                     stage_two_repeat=stage_two_repeat,
                                     skip_connections=skip_connections)

    variables_to_restore = slim.get_variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    def restore_fn(sess):
        return saver.restore(sess, checkpoint)

    predictions = tf.argmax(probabilities, -1)
    predictions = tf.cast(predictions, tf.float32)
    print 'HERE', predictions.get_shape()

    sv = tf.train.Supervisor(logdir=None, init_fn=restore_fn)

    with sv.managed_session() as sess:
        for i in xrange(len(images_list) / 10):
            segmentations = sess.run(predictions)
            # print segmentations.shape
            print(segmentations.shape, 'shape')

            for j in xrange(segmentations.shape[0]):
                converted_image = grayscale_to_colour(segmentations[j])
                print 'Saving image %s/%s' %(i*10 + j, len(images_list))
                #plt.axis('off')
                #plt.imshow(converted_image)
                imsave(photo_dir + "/image_%s.png" %(i*10 + j), converted_image)
                # plt.show()
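Note that grayscale_to_colour above hard-codes reshape((256, 256, 1)), so it can only handle predictions that are exactly 256x256; any other resolution either raises a reshape error or scrambles the pixels. A shape-agnostic sketch (a hypothetical variant, not from the repo):

# Hypothetical variant: derive the spatial size from the prediction itself
# instead of hard-coding 256x256, so resized inputs aren't garbled here.
def grayscale_to_colour_any(image):
    h, w = image.shape[:2]
    image = image.reshape((h, w, 1)).astype(np.uint8)
    image = np.repeat(image, 3, axis=-1)
    for i in xrange(h):
        for j in xrange(w):
            label = int(image[i][j][0])
            image[i][j] = np.array(label_to_colours[label])
    return image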
Here is the link to the full code: https://github.com/kwotsin/TensorFlow-ENet/blob/master/predict_segmentation.py
Answer 0 (score: 0)
You can try this model. It is written in tf.keras:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

print('Tensorflow', tf.__version__)

def initial_block(inp):
    inp1 = inp
    conv = Conv2D(filters=13, kernel_size=3, strides=2, padding='same', kernel_initializer='he_normal')(inp)
    pool = MaxPool2D(2)(inp1)
    concat = concatenate([conv, pool])
    return concat

def encoder_bottleneck(inp, filters, name, dilation_rate=2, downsample=False, dilated=False, asymmetric=False, drop_rate=0.1):
    reduce = filters // 4
    down = inp
    kernel_stride = 1

    #Downsample
    if downsample:
        kernel_stride = 2
        pad_activations = filters - inp.shape.as_list()[-1]
        down = MaxPool2D(2)(down)
        down = Permute(dims=(1, 3, 2))(down)
        down = ZeroPadding2D(padding=((0, 0), (0, pad_activations)))(down)
        down = Permute(dims=(1, 3, 2))(down)

    #1*1 Reduce
    x = Conv2D(filters=reduce, kernel_size=kernel_stride, strides=kernel_stride, padding='same', use_bias=False, kernel_initializer='he_normal', name=f'{name}_reduce')(inp)
    x = BatchNormalization(momentum=0.1)(x)
    x = PReLU(shared_axes=[1, 2])(x)

    #Conv
    if not dilated and not asymmetric:
        x = Conv2D(filters=reduce, kernel_size=3, padding='same', kernel_initializer='he_normal', name=f'{name}_conv_reg')(x)
    elif dilated:
        x = Conv2D(filters=reduce, kernel_size=3, padding='same', dilation_rate=dilation_rate, kernel_initializer='he_normal', name=f'{name}_reduce_dilated')(x)
    elif asymmetric:
        x = Conv2D(filters=reduce, kernel_size=(1, 5), padding='same', use_bias=False, kernel_initializer='he_normal', name=f'{name}_asymmetric')(x)
        x = Conv2D(filters=reduce, kernel_size=(5, 1), padding='same', kernel_initializer='he_normal', name=name)(x)

    x = BatchNormalization(momentum=0.1)(x)
    x = PReLU(shared_axes=[1, 2])(x)

    #1*1 Expand
    x = Conv2D(filters=filters, kernel_size=1, padding='same', use_bias=False, kernel_initializer='he_normal', name=f'{name}_expand')(x)
    x = BatchNormalization(momentum=0.1)(x)
    x = SpatialDropout2D(rate=drop_rate)(x)

    concat = Add()([x, down])
    concat = PReLU(shared_axes=[1, 2])(concat)
    return concat

def decoder_bottleneck(inp, filters, name, upsample=False):
    reduce = filters // 4
    up = inp

    #Upsample
    if upsample:
        up = Conv2D(filters=filters, kernel_size=1, strides=1, padding='same', use_bias=False, kernel_initializer='he_normal', name=f'{name}_upsample')(up)
        up = UpSampling2D(size=2)(up)

    #1*1 Reduce
    x = Conv2D(filters=reduce, kernel_size=1, strides=1, padding='same', use_bias=False, kernel_initializer='he_normal', name=f'{name}_reduce')(inp)
    x = BatchNormalization(momentum=0.1)(x)
    x = PReLU(shared_axes=[1, 2])(x)

    #Conv
    if not upsample:
        x = Conv2D(filters=reduce, kernel_size=3, strides=1, padding='same', kernel_initializer='he_normal', name=f'{name}_conv_reg')(x)
    else:
        x = Conv2DTranspose(filters=reduce, kernel_size=3, strides=2, padding='same', kernel_initializer='he_normal', name=f'{name}_transpose')(x)

    x = BatchNormalization(momentum=0.1)(x)
    x = PReLU(shared_axes=[1, 2])(x)

    #1*1 Expand
    x = Conv2D(filters=filters, kernel_size=1, strides=1, padding='same', use_bias=False, kernel_initializer='he_normal', name=f'{name}_expand')(x)
    x = BatchNormalization(momentum=0.1)(x)

    concat = Add()([x, up])
    concat = ReLU()(concat)
    return concat

def ENet(H, W, nclasses):
    '''
    Args:
        H: height of the input image
        W: width of the input image
        nclasses: total number of classes
    Returns:
        model: a Keras Model instance (also saved to enet_{nclasses}.h5)
    '''
    inp = Input(shape=(H, W, 3))
    enc = initial_block(inp)

    #Bottleneck 1.0
    enc = encoder_bottleneck(enc, 64, name='enc1', downsample=True, drop_rate=0.001)
    enc = encoder_bottleneck(enc, 64, name='enc1.1', drop_rate=0.001)
    enc = encoder_bottleneck(enc, 64, name='enc1.2', drop_rate=0.001)
    enc = encoder_bottleneck(enc, 64, name='enc1.3', drop_rate=0.001)
    enc = encoder_bottleneck(enc, 64, name='enc1.4', drop_rate=0.001)

    #Bottleneck 2.0
    enc = encoder_bottleneck(enc, 128, name='enc2.0', downsample=True)
    enc = encoder_bottleneck(enc, 128, name='enc2.1')
    enc = encoder_bottleneck(enc, 128, name='enc2.2', dilation_rate=2, dilated=True)
    enc = encoder_bottleneck(enc, 128, name='enc2.3', asymmetric=True)
    enc = encoder_bottleneck(enc, 128, name='enc2.4', dilation_rate=4, dilated=True)
    enc = encoder_bottleneck(enc, 128, name='enc2.5')
    enc = encoder_bottleneck(enc, 128, name='enc2.6', dilation_rate=8, dilated=True)
    enc = encoder_bottleneck(enc, 128, name='enc2.7', asymmetric=True)
    enc = encoder_bottleneck(enc, 128, name='enc2.8', dilation_rate=16, dilated=True)

    #Bottleneck 3.0
    enc = encoder_bottleneck(enc, 128, name='enc3.0')
    enc = encoder_bottleneck(enc, 128, name='enc3.1', dilation_rate=2, dilated=True)
    enc = encoder_bottleneck(enc, 128, name='enc3.2', asymmetric=True)
    enc = encoder_bottleneck(enc, 128, name='enc3.3', dilation_rate=4, dilated=True)
    enc = encoder_bottleneck(enc, 128, name='enc3.4')
    enc = encoder_bottleneck(enc, 128, name='enc3.5', dilation_rate=8, dilated=True)
    enc = encoder_bottleneck(enc, 128, name='enc3.6', asymmetric=True)
    enc = encoder_bottleneck(enc, 128, name='enc3.7', dilation_rate=16, dilated=True)

    #Bottleneck 4.0
    dec = decoder_bottleneck(enc, 64, name='dec4.0', upsample=True)
    dec = decoder_bottleneck(dec, 64, name='dec4.1')
    dec = decoder_bottleneck(dec, 64, name='dec4.2')

    #Bottleneck 5.0
    dec = decoder_bottleneck(dec, 16, name='dec5.0', upsample=True)
    dec = decoder_bottleneck(dec, 16, name='dec5.1')

    dec = Conv2DTranspose(filters=nclasses, kernel_size=2, strides=2, padding='same', kernel_initializer='he_normal', name='fullconv')(dec)
    dec = Activation('softmax')(dec)

    model = Model(inputs=inp, outputs=dec, name='Enet')
    model.save(f'enet_{nclasses}.h5')
    return model
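A quick usage sketch (the 512x512 size is an arbitrary assumption; as with the slim model, pick a height and width divisible by 8 so the three 2x downsampling stages mirror the decoder cleanly):

import numpy as np

# Build a 2-class model at an arbitrary, divisible-by-8 resolution.
model = ENet(H=512, W=512, nclasses=2)   # also writes enet_2.h5
model.summary()

# Dummy forward pass: per-pixel softmax comes back at full input resolution.
dummy = np.random.rand(1, 512, 512, 3).astype('float32')
pred = model.predict(dummy)              # shape (1, 512, 512, 2)
labels = np.argmax(pred, axis=-1)        # (1, 512, 512) class map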