I am trying to train a ResNet architecture on the CIFAR10 dataset using a GPU. Here is my ResNet code:
import torch
import torch.nn as nn
import torch.nn.functional as F
class ResNetBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(ResNetBlock, self).__init__()
        self.stride = stride
        self.in_planes = in_planes
        self.planes = planes
        if stride != 1:
            self.fx = nn.Sequential(nn.Conv2d(in_planes, planes, 3, stride=2, padding=1),
                                    nn.ReLU(),
                                    nn.Conv2d(planes, planes, 3, padding=1))
        else:
            self.fx = nn.Sequential(nn.Conv2d(planes, planes, 3, padding=1),
                                    nn.ReLU(),
                                    nn.Conv2d(planes, planes, 3, padding=1))

    def forward(self, x):
        if self.stride == 1:
            fx = self.fx(x)
            id = nn.Sequential()
            out = fx + id(x)
            relu = nn.ReLU()
            return relu(out)
        else:
            fx = self.fx(x)
            id = nn.Conv2d(self.in_planes, self.planes, 2, stride=2)
            out = fx + id(x)
            relu = nn.ReLU()
            return relu(out)
class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10, num_filters=16, input_dim=3):
        super(ResNet, self).__init__()
        self.in_planes = num_filters
        self.conv1 = nn.Conv2d(input_dim, num_filters, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_filters)
        layers = []
        plane = num_filters
        for nb in num_blocks:
            layer = self._make_layer(block, plane, nb, 2)
            layers.append(layer)
            plane *= 2
        self.layers = nn.Sequential(*layers)
        self.linear = nn.Linear(2304, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        layers = []
        block1 = ResNetBlock(planes, 2*planes, stride=2)
        planes *= 2
        layers.append(block1)
        for i in range(1, num_blocks):
            block = ResNetBlock(planes, planes, stride=1)
            layers.append(block)
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out
# (1 + 2*(1 + 1) + 2*(1 + 1) + 2*(1 + 1) + 2*(1 + 1)) + 1 = 18
def ResNet18():
    return ResNet(ResNetBlock, [2,2,2,2])
Then I train the network on the GPU:
net = ResNet18()
net = net.to('cuda')
train2(net, torch.optim.Adam(net.parameters(), lr=0.001), trainloader, criterion, n_ep=3)
And I get the error:
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
This is frustrating, because my weights should also be on CUDA thanks to resnet.cuda().
The train function works fine with another network, so the error must come from the classes above.
Moreover, next(resnet.parameters()).is_cuda returns True.
UPDATE: here is my training function.
def train(net, optimizer, trainload, criterion, n_ep=10, cuda=True):
    if cuda:
        net = net.to('cuda')
    for epoch in range(n_ep):
        for data in trainload:
            inputs, labels = data
            if cuda:
                inputs = inputs.type(torch.cuda.FloatTensor)
                labels = labels.type(torch.cuda.LongTensor)
            optimizer.zero_grad()
            print(next(net.parameters()).is_cuda)
            ## this actually prints "True" !
            outputs = net.forward(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    return net
The thing is, this training function works fine with another type of network. For example, it was used with this one (AlexNet):
class AlexNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(nn.Conv2d(3, 64, 11), nn.ReLU(), nn.MaxPool2d(2, stride=2), nn.Conv2d(64, 192, 5),
                                      nn.ReLU(), nn.MaxPool2d(2, stride=2), nn.Conv2d(192, 384, 3),
                                      nn.ReLU(), nn.Conv2d(384, 256, 3), nn.ReLU(), nn.Conv2d(256, 256, 3), nn.ReLU())
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),)

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        x = self.classifier(x)
        return x
And with that one, GPU training works fine.
There is something else I don't understand. I tried training the network that I had moved to the GPU (with .cuda()) on training data that was deliberately NOT moved to the GPU. This time the error I get says that the weight type is torch.cuda while the data type is not.
EDIT: I thought this was related to using nn.ModuleList instead of a regular Python list, but I tried it and it did not solve the problem.
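(The reason I suspected this: layers kept in a plain Python list are not registered as submodules, so .to('cuda') would skip their weights, whereas nn.ModuleList registers them. A minimal illustration of that difference, not my actual code:)

import torch.nn as nn

class ListNet(nn.Module):
    def __init__(self):
        super(ListNet, self).__init__()
        # Plain Python list: these layers are NOT registered as submodules,
        # so .to('cuda') will not move their weights.
        self.plain = [nn.Linear(4, 4), nn.Linear(4, 4)]
        # nn.ModuleList registers each layer, so .to('cuda') does move them.
        self.registered = nn.ModuleList([nn.Linear(4, 4), nn.Linear(4, 4)])

print(len(list(ListNet().parameters())))  # 4 -> only the ModuleList layers are seen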
Answer 0 (score: 1)
We would need a snippet of your training loop to better pin down the error.
I assume that somewhere in that loop you have a few lines of code that do the following:
for data, label in CifarDataLoader:
    data, label = data.to('cuda'), label.to('cuda')
My first guess would be to add a line before the for loop ->
resnet = resnet.to('cuda')
Let me know if this works; otherwise I will need more of your code to find the error.
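In the meantime, a quick sanity check along these lines (a minimal sketch, assuming your model variable is called resnet) would show whether every registered parameter actually ended up on the GPU:

# List every registered parameter and its device; anything still on the CPU
# points at a submodule that .to('cuda') did not reach.
for name, p in resnet.named_parameters():
    print(name, p.device)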
Answer 1 (score: 0)
OK, I finally figured it out.
I was defining nn.Module objects inside the forward function of my ResNetBlock class. I guess those cannot be moved to the GPU, because PyTorch only picks up such objects when they are defined in the __init__ function. I changed the implementation a bit so that the modules are defined in __init__, and it works.
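Roughly, the fixed block now looks like this (a simplified sketch of the idea, not necessarily my exact code):

class ResNetBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(ResNetBlock, self).__init__()
        # Everything, including the shortcut and the ReLU, is created in
        # __init__ so that .to('cuda') can find it and move its weights.
        self.fx = nn.Sequential(nn.Conv2d(in_planes, planes, 3, stride=stride, padding=1),
                                nn.ReLU(),
                                nn.Conv2d(planes, planes, 3, padding=1))
        if stride != 1 or in_planes != planes:
            # Downsampling shortcut with learnable weights.
            self.shortcut = nn.Conv2d(in_planes, planes, 1, stride=stride)
        else:
            # Identity shortcut when the shapes already match.
            self.shortcut = nn.Sequential()
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.fx(x) + self.shortcut(x))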
Thanks for your help :)