如何解决“ RuntimeError:函数CudnnConvolutionBackward返回索引1处的无效渐变”

时间:2019-07-28 16:29:15

标签: computer-vision pytorch pruning

尽管我已经将模型和数据都移到了cuda,但是当我尝试删除一些过滤器然后重新训练网络时,我遇到了这个问题。当我调用loss.backward()时,该错误似乎是在向后;这是我的代码:

criterion = nn.NLLLoss()

optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9, 
                          weight_decay=0.0005)

def train(model, train_loader, criterion, optimizer, device, epoch):
    model.train()
    for idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output,target)
        loss.backward() #The problem is here
        optimizer.step()
        if idx % 5 == 0:
            print("Epoch {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}"
                  .format(epoch,idx*len(data),len(train_loader.dataset),
                   100.0*idx/len(train_loader),loss.item()))

def test(model,test_loader,criterion,device):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data,target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output,target)
            test_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(test_loader.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{}
              ({:.0f}%)\n'
              .format(test_loss, correct, len(test_loader.dataset),100. 
              * correct / len(test_loader.dataset)))

#This function returns the index of the filter to be pruned
def index_pruned(model, layer):
    l = []
    for i in range(model.features[layer].weight.size(0)):
        l.append(abs(model.features[layer].weight[i,:,:,:]).sum())
    return l.index(min(l))

def prune_filter(model, layer, next_layer, indexes): 
    #Making a new layer, with one less filter    
    new_conv = 
    torch.nn.Conv2d(in_channels=model.features[layer].in_channels, 

                    out_channels=model.features[layer].out_channels-1,

                    kernel_size=model.features[layer].kernel_size,
                    stride=model.features[layer].stride,
                    padding=model.features[layer].padding)

    #Pass the old weights to the new layer except in the pruned filter
    new_conv.weight[0:indexes,:,:,:] = 
           model.features[layer].weight[0:indexes,:,:,:]
    new_conv.weight[indexes:,:,:,:] = 
           model.features[layer].weight[indexes+1,:,:,:]
    new_conv.bias[0:indexes] = model.features[layer].bias[0:indexes]
    new_conv.bias[indexes:] = model.features[layer].bias[indexes+1:]
    #replace the layer
    model.features[layer] = new_conv

    #When not pruning the last layer
    if layer != 10:
        #Reduce the 'thickness' of the consecutive layer
        #as the previous one is pruned
        next_new_conv = torch.nn.Conv2d(
                 in_channels=model.features[next_layer].in_channels-1, 

                 out_channels=model.features[next_layer].out_channels,

                 kernel_size=model.features[next_layer].kernel_size,

                 stride=model.features[next_layer].stride,

                 padding=model.features[next_layer].padding)
                 next_new_conv.weight[:,0:indexes,:,:] = 
                 model.features[next_layer].weight[:,0:indexes,:,:]
        next_new_conv.weight[:,indexes:,:,:] = 
            model.features[next_layer].weight[:,indexes+1:,:,:]
        model.features[next_layer] = next_new_conv

    #Pruning the last layer affects the linear layer
    elif layer == 10:
        params = int(model.classifier[0].in_features / 
                        (model.features[10].out_channels+1))
        new_fc1 = torch.nn.Linear(
             in_features=int(model.classifier[0].in_features-params),

             out_features=int(model.classifier[0].out_features))
        new_fc1.weight[:,0:indexes*params] = 
                 model.classifier[0].weight[:,0:indexes*params]
        new_fc1.weight[:,:params*indexes] = 
                 model.classifier[0].weight[:,:(indexes+1)*params]
        new_fc1.bias = model.classifier[0].bias
        model.classifier[0]=new_fc1
    return model

def main(model, train_loader,test_loader,criterion,optimizer,
         pretrained=False,prune=False,save=False,pruneFilter=False):
    device = 'cuda'
    if pretrained == True:
        model.load_state_dict(torch.load('AlexNet_pruned.pt'))
        for params in model.parameters():
            params.requires_grad = True
    model.to(device)
    if pruneFilter == True:
        #conv0:
        for num_filters_pruned in range(16):
            model=prune_filter(model=model, layer=0, next_layer=3, indexes=index_pruned(model,0))
            if num_filters_pruned %4 == 0:
                model=model.cuda()
                train(model, train_loader, criterion, optimizer, device, 1)
                test(model,test_loader,criterion,device)

        torch.save(model.state_dict(),'AlexNet_filers_pruned.pt')

    if save == True:
        torch.save(model.state_dict(),'AlexNet.pt')
    if prune == True:
        torch.save(model.state_dict(),'AlexNet_pruned.pt')

错误在loss.backward()处。我试图检查每个参数是否都在cuda中

conv0:

for num_filters_pruned in range(16):
    model=prune_filter(model=model, layer=0, next_layer=3, indexes=index_pruned(model,0))
    if num_filters_pruned %4 == 0:
        model=model.cuda()
        for name, param in model.named_parameters():
            print(name,':',param.device)
        train(model, train_loader, criterion, optimizer, device, 1)
        test(model,test_loader,criterion,device)

And this is the result

features.0.weight : cuda:0 
features.0.bias : cuda:0 
features.3.weight : cuda:0 
features.3.bias : cuda:0 
features.6.weight : cuda:0 
features.6.bias : cuda:0 
features.8.weight : cuda:0 
features.8.bias : cuda:0 
features.10.weight : cuda:0 
features.10.bias : cuda:0 
classifier.fc1.weight : cuda:0 
classifier.fc1.bias : cuda:0 
classifier.fc2.weight : cuda:0 
classifier.fc2.bias : cuda:0
--------------------------------------------------------------------------- RuntimeError                              Traceback (most recent call last) <ipython-input-12-46a65917686e> in <module>
----> 1 main(model,train_loader,test_loader,criterion,optimizer,pretrained=True,prune=False,save=False,pruneFilter=True)

<ipython-input-11-8c11e06a650b> in main(model, train_loader, test_loader, criterion, optimizer, pretrained, prune, save, pruneFilter)
     33                     print(name,':',param.device)
     34 
---> 35                 train(model, train_loader, criterion, optimizer, device, 1)
     36                 test(model,test_loader,criterion,device)
     37         #conv2:

<ipython-input-8-f4fd4c83eff2> in train(model, train_loader, criterion, optimizer, device, epoch)
      9         loss = criterion(output,target)
     10         loss = loss.cuda()
---> 11         loss.backward()
     12         optimizer.step()
     13         if idx % 5 == 0:

/usr/local/lib/python3.5/dist-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    100                 products. Defaults to ``False``.
    101         """
--> 102         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    103 
    104     def register_hook(self, hook):

/usr/local/lib/python3.5/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     88     Variable._execution_engine.run_backward(
     89         tensors, grad_tensors, retain_graph, create_graph,
---> 90         allow_unreachable=True)  # allow_unreachable flag
     91 
     92 

RuntimeError: Function CudnnConvolutionBackward returned an invalid gradient at index 1 - expected type torch.FloatTensor but got torch.cuda.FloatTensor

0 个答案:

没有答案