Question

我正在用 PyTorch 研究图像分类模型。设置如下：我的训练实例是图像包（即一个训练实例 = 一个包），每个包包含数量不等的图像。每个包都有一个与之关联的标签（0 或 1），表示包中是否至少有一张图像具有某种属性（例如，在我的情况下是肿瘤）。目标是学习一个分类器，尽可能准确地对新的包进行分类。然而，当我尝试训练模型时出现了以下问题：当我把包逐个输入 CNN 架构时，预测值（包的标签为 1 的概率）立即变为 0 或 1。

现在，在深入代码本身之前（代码很长），先说明一点：我还有一个类似的设置，其中使用数字图像代替组织图像。每个包都包含许多类似 MNIST 的图像（即带有数字的图片），如果包中含有数字 9，则该包获得正标签（即 1）。奇怪的是，这个数字任务效果非常好（可以看到学习是有效的，最终获得了良好的分类性能），尽管它与组织图像任务的设置几乎相同。

下面我贴出了这两个任务之间不同的代码部分，其差异本质上如下：包的输入形状不同——数字图像小得多（28x28）且只有一个通道，而组织图像是 224x224 且有三个通道。因此，卷积层的参数也略有不同。如果有人能帮忙指出问题可能出在哪里，我将不胜感激！

第一段代码用于组织图像，问题在于该模型无法学习：

class Attention(nn.Module):
    """Attention-pooled bag classifier for 3-channel 224x224 images.

    A bag of N images is embedded image by image, the N embeddings are
    combined into a single bag embedding using softmax attention weights,
    and a sigmoid classifier maps that bag embedding to one probability.

    Attributes:
        L: Size of each per-image embedding.
        D: Hidden width of the attention scoring network.
        K: Number of attention weight vectors (rows of ``A``).
    """

    def __init__(self):
        super(Attention, self).__init__()
        self.L = 500
        self.D = 128
        self.K = 1

        # Spatial size for a 224x224 input:
        # 224 -> conv(k=4) 221 -> pool 110 -> conv(k=3) 108 -> pool 54.
        self.feature_extractor_part1 = nn.Sequential(
            nn.Conv2d(3, 4, kernel_size=4),  # 3 input channels (colour), 4 kernels of 3x4x4
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(4, 8, kernel_size=3),  # each output map combines all 4 input maps
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2)
        )

        self.feature_extractor_part2 = nn.Sequential(
            nn.Linear(8 * 54 * 54, self.L),  # 8 maps of 54x54 -> L-dim embedding
            nn.ReLU(),
            # TODO: add dropout
        )

        # Scores every instance embedding; softmax is applied in forward().
        self.attention = nn.Sequential(
            nn.Linear(self.L, self.D),
            nn.Tanh(),
            nn.Linear(self.D, self.K)
        )

        self.classifier = nn.Sequential(
            nn.Linear(self.L * self.K, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Classify one bag.

        Args:
            x: One bag of images. A leading dimension of size 1 is removed,
                leaving N x 3 x 224 x 224.

        Returns:
            A tuple ``(Y_prob, Y_hat, A)``:
            ``Y_prob`` is the K x 1 sigmoid output, ``Y_hat`` is ``Y_prob``
            thresholded at 0.5 (as float), and ``A`` is the K x N matrix of
            attention weights, each row summing to 1.

        Raises:
            RuntimeError: If the images do not produce 8 * 54 * 54 features
                each (i.e. the spatial size is not the expected one).
        """
        x = x.squeeze(0)  # drop the leading bag dimension

        # Feature extraction.
        H = self.feature_extractor_part1(x)
        # Flatten per instance. Letting view(-1, ...) infer the row count
        # could silently regroup features across instances when the image
        # size is wrong; this way the Linear layer reports the mismatch.
        H = torch.flatten(H, 1)
        H = self.feature_extractor_part2(H)  # NxL

        # Attention-based aggregation.
        A = self.attention(H)  # NxK
        A = torch.transpose(A, 1, 0)  # KxN
        A = F.softmax(A, dim=1)  # normalise over the N instances

        # Attention-weighted sum of the instance embeddings.
        M = torch.mm(A, H)  # KxL bag embedding

        # Final classification.
        Y_prob = self.classifier(M)
        Y_hat = torch.ge(Y_prob, 0.5).float()

        return Y_prob, Y_hat, A

第二段代码用于数字图像，效果很好：


class Attention(nn.Module):
    """Attention-pooled bag classifier for 1-channel 28x28 images.

    Every image in a bag is embedded into an L-dimensional vector, the
    embeddings are averaged with softmax attention weights into one bag
    embedding, and a sigmoid classifier turns that into one probability.

    Attributes:
        L: Size of each per-image embedding.
        D: Hidden width of the attention scoring network.
        K: Number of attention weight vectors (rows of ``A``).
    """

    def __init__(self):
        super(Attention, self).__init__()
        self.L = 500
        self.D = 128
        self.K = 1

        # Spatial size for a 28x28 input:
        # 28 -> conv(k=5) 24 -> pool 12 -> conv(k=5) 8 -> pool 4.
        conv_layers = [
            nn.Conv2d(1, 10, kernel_size=5),   # 1 input channel -> 10 feature maps
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(10, 20, kernel_size=5),  # 10 -> 20 feature maps
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
        ]
        self.feature_extractor_part1 = nn.Sequential(*conv_layers)

        # 20 feature maps of 4x4 are projected to the L-dim embedding.
        embedding_layers = [nn.Linear(20 * 4 * 4, self.L), nn.ReLU()]
        self.feature_extractor_part2 = nn.Sequential(*embedding_layers)

        # Produces K raw scores per instance; softmax happens in forward().
        scoring_layers = [
            nn.Linear(self.L, self.D),
            nn.Tanh(),
            nn.Linear(self.D, self.K),
        ]
        self.attention = nn.Sequential(*scoring_layers)

        head_layers = [nn.Linear(self.L * self.K, 1), nn.Sigmoid()]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Classify one bag.

        Args:
            x: One bag of images. A leading dimension of size 1 is removed,
                leaving N x 1 x 28 x 28.

        Returns:
            A tuple ``(Y_prob, Y_hat, A)``:
            ``Y_prob`` is the K x 1 sigmoid output, ``Y_hat`` is ``Y_prob``
            thresholded at 0.5 (as float), and ``A`` is the K x N matrix of
            attention weights, each row summing to 1.
        """
        instances = x.squeeze(0)  # drop the leading bag dimension

        # Per-instance embeddings (NxL).
        feature_maps = self.feature_extractor_part1(instances)
        embeddings = self.feature_extractor_part2(
            feature_maps.view(-1, 20 * 4 * 4)
        )

        # Attention weights: NxK scores -> KxN, normalised over the N instances.
        scores = self.attention(embeddings)
        A = F.softmax(scores.transpose(1, 0), dim=1)

        # Attention-weighted sum of the embeddings gives the KxL bag embedding.
        bag_embedding = torch.mm(A, embeddings)

        # Bag-level probability and hard label.
        Y_prob = self.classifier(bag_embedding)
        Y_hat = (Y_prob >= 0.5).float()

        return Y_prob, Y_hat, A

Pytorch 中的多实例图像分类：模型不会学习

0 个答案: