Week 4 Assignment: Convolutional Neural Networks (Part 2)
I. AlexNet
AlexNet can be understood as a deeper and larger LeNet. Its main improvements are dropout, ReLU, and max pooling.
1. AlexNet architecture



2. Code implementation:
(1) Network construction:
import torch
from torch import nn
from d2l import torch as d2l

net = nn.Sequential(
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    nn.Linear(6400, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))
(2) Print the output shape after each layer:
X = torch.randn(1, 1, 224, 224)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'Output shape:\t', X.shape)

(3) Load the dataset and train
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
lr, num_epochs = 0.01, 10
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())

II. VGGNet
1. VGG block:
A VGG block stacks a configurable number of 3×3 convolutional layers (padding=1), followed by a 2×2 max-pooling layer (stride=2).

2. Code implementation:
(1) Build the VGG block
def vgg_block(num_convs, in_channels, out_channels):
    layers = []
    for _ in range(num_convs):
        layers.append(nn.Conv2d(in_channels, out_channels,
                                kernel_size=3, padding=1))
        layers.append(nn.ReLU())
        in_channels = out_channels
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)
(2) Define the network
# Architecture with five VGG blocks: (num_convs, out_channels) for each block
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))

# Build the VGG network
def vgg(conv_arch):
    conv_blks = []
    in_channels = 1
    for (num_convs, out_channels) in conv_arch:
        conv_blks.append(vgg_block(num_convs, in_channels, out_channels))
        in_channels = out_channels
    return nn.Sequential(
        *conv_blks, nn.Flatten(),
        nn.Linear(out_channels * 7 * 7, 4096), nn.ReLU(), nn.Dropout(0.5),
        nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(0.5),
        nn.Linear(4096, 10))

net = vgg(conv_arch)
(3) Print output shapes
X = torch.randn(size=(1, 1, 224, 224))
for blk in net:
    X = blk(X)
    print(blk.__class__.__name__, 'output shape:\t', X.shape)

(4) Training and results:
# Use a quarter of the channel widths to reduce computation; this is sufficient for Fashion-MNIST
ratio = 4
small_conv_arch = [(pair[0], pair[1] // ratio) for pair in conv_arch]
net = vgg(small_conv_arch)
lr, num_epochs, batch_size = 0.05, 10, 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())

III. NiN (Network in Network)
1. Basic idea: do not use fully connected layers
2. Basic structure:



NiN block: a convolutional layer followed by two 1×1 convolutional layers (stride 1, no padding), which act as per-pixel fully connected layers
NiN architecture: alternates NiN blocks with max-pooling layers of stride 2, and uses a global average pooling layer at the end to produce the output
3. Code implementation
(1) NiN block structure
def nin_block(in_channels, out_channels, kernel_size, strides, padding):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, strides, padding),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU())
(2) Network structure:
net = nn.Sequential(
    nin_block(1, 96, kernel_size=11, strides=4, padding=0),
    nn.MaxPool2d(3, stride=2),
    nin_block(96, 256, kernel_size=5, strides=1, padding=2),
    nn.MaxPool2d(3, stride=2),
    nin_block(256, 384, kernel_size=3, strides=1, padding=1),
    nn.MaxPool2d(3, stride=2),
    nn.Dropout(0.5),
    # There are 10 label classes
    nin_block(384, 10, kernel_size=3, strides=1, padding=1),
    nn.AdaptiveAvgPool2d((1, 1)),
    # Convert the 4D output into a 2D output with shape (batch size, 10)
    nn.Flatten())
(3) Check the output shapes:
X = torch.rand(size=(1, 1, 224, 224))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)

(4) Training and results:
lr, num_epochs, batch_size = 0.1, 10, 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())

IV. GoogLeNet
1. Inception block

Compared with single 3×3 or 5×5 convolutional layers, the Inception block has fewer parameters and lower computational complexity.

2. GoogLeNet
It has 5 stages (each stage halves the height and width) and 9 Inception blocks.





(Architecture diagrams: stages 1-2, stage 3, stages 4-5)

3. Later variants:
Inception-V2: uses batch normalization
Inception-V3: replaces 5×5 convolutions with multiple 3×3 convolutions, replaces 5×5 convolutions with 1×7 and 7×1 convolutions, replaces 3×3 convolutions with 1×3 and 3×1 convolutions, and is deeper
Inception-V4: uses residual connections
4. Code implementation:
(1) Inception block
from torch.nn import functional as F

class Inception(nn.Module):
    # c1--c4 are the numbers of output channels of the four paths
    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super(Inception, self).__init__(**kwargs)
        # Path 1: a single 1 x 1 convolutional layer
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # Path 2: 1 x 1 convolutional layer followed by a 3 x 3 convolutional layer
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # Path 3: 1 x 1 convolutional layer followed by a 5 x 5 convolutional layer
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # Path 4: 3 x 3 max-pooling layer followed by a 1 x 1 convolutional layer
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        # Concatenate the four paths along the channel dimension
        return torch.cat((p1, p2, p3, p4), dim=1)
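As a quick sanity check of the parameter-count claim above, the sketch below compares the Inception block against single 3×3 and 5×5 convolutions with the same 256 output channels, using the channel configuration of the first stage-3 Inception block from the model below (192 input channels, splits 64, (96, 128), (16, 32), 32).
def num_params(m):
    return sum(p.numel() for p in m.parameters())

incep = Inception(192, 64, (96, 128), (16, 32), 32)     # 64+128+32+32 = 256 output channels
conv3 = nn.Conv2d(192, 256, kernel_size=3, padding=1)   # single 3 x 3 convolution
conv5 = nn.Conv2d(192, 256, kernel_size=5, padding=2)   # single 5 x 5 convolution
print(num_params(incep))  # about 0.16M parameters
print(num_params(conv3))  # about 0.44M parameters
print(num_params(conv5))  # about 1.23M parameters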
(2) The five stages of GoogLeNet:
# Stage 1: 7 x 7 convolution
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# Stage 2: 1 x 1 convolution followed by a 3 x 3 convolution
b2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=1), nn.ReLU(),
                   nn.Conv2d(64, 192, kernel_size=3, padding=1), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# Stage 3: two Inception blocks
b3 = nn.Sequential(Inception(192, 64, (96, 128), (16, 32), 32),
                   Inception(256, 128, (128, 192), (32, 96), 64),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# Stage 4: five Inception blocks
b4 = nn.Sequential(Inception(480, 192, (96, 208), (16, 48), 64),
                   Inception(512, 160, (112, 224), (24, 64), 64),
                   Inception(512, 128, (128, 256), (24, 64), 64),
                   Inception(512, 112, (144, 288), (32, 64), 64),
                   Inception(528, 256, (160, 320), (32, 128), 128),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# Stage 5: two Inception blocks followed by global average pooling
b5 = nn.Sequential(Inception(832, 256, (160, 320), (32, 128), 128),
                   Inception(832, 384, (192, 384), (48, 128), 128),
                   nn.AdaptiveAvgPool2d((1, 1)),
                   nn.Flatten())

net = nn.Sequential(b1, b2, b3, b4, b5, nn.Linear(1024, 10))
(3) Output shapes:
X = torch.rand(size=(1, 1, 96, 96))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)
(4) Model training:
lr, num_epochs, batch_size = 0.1, 10, 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())

V. Batch Normalization
1. Core idea: normalize each mini-batch to a fixed mean and variance, then apply an additional learnable adjustment (scale and shift parameters); see the formula sketch below.
2. It can act on the output of fully connected and convolutional layers, before the activation function, or on the input of fully connected and convolutional layers.
For fully connected layers it acts on the feature dimension; for convolutional layers it acts on the channel dimension.
3. There is no need to combine it with dropout.
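In formula form (a brief sketch of what the code below implements), for a mini-batch $B$:
$$\hat{\mu}_B = \frac{1}{|B|}\sum_{x \in B} x, \qquad \hat{\sigma}_B^2 = \frac{1}{|B|}\sum_{x \in B} (x - \hat{\mu}_B)^2, \qquad \mathrm{BN}(x) = \gamma \odot \frac{x - \hat{\mu}_B}{\sqrt{\hat{\sigma}_B^2 + \epsilon}} + \beta$$
where $\gamma$ (scale) and $\beta$ (shift) are the learnable parameters and $\epsilon$ avoids division by zero.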
4. Code implementation:
(1) Batch normalization layer:
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    if not torch.is_grad_enabled():
        # In prediction mode, directly use the running mean and variance
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected layer: mean and variance over the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2D convolutional layer: mean and variance per channel
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, standardize with the current batch statistics
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the running mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean.data, moving_var.data

class BatchNorm(nn.Module):
    # num_features: number of outputs of a fully connected layer or
    #               number of output channels of a convolutional layer
    # num_dims: 2 for a fully connected layer, 4 for a convolutional layer
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Scale and shift parameters involved in gradient computation,
        # initialized to 1 and 0 respectively
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated moving_mean and moving_var
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_var,
            eps=1e-5, momentum=0.9)
        return Y
(2) LeNet with batch normalization
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5), BatchNorm(6, num_dims=4), nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5), BatchNorm(16, num_dims=4), nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 4 * 4, 120), BatchNorm(120, num_dims=2), nn.Sigmoid(),
    nn.Linear(120, 84), BatchNorm(84, num_dims=2), nn.Sigmoid(),
    nn.Linear(84, 10))
lr, num_epochs, batch_size = 1.0, 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
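After training, the learned scale (gamma) and shift (beta) of, for example, the first batch normalization layer can be inspected; a quick check using the layer indexing of the Sequential model above:
print(net[1].gamma.reshape((-1,)))
print(net[1].beta.reshape((-1,)))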

VI. Residual Networks (ResNet)
1. Residual block

The residual block computes f(x) = x + g(x), where g(x) is the residual mapping learned by the block's layers.
2. Code implementation:
(1) Residual block:
class Residual(nn.Module):
    def __init__(self, input_channels, num_channels,
                 use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, num_channels,
                               kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels,
                               kernel_size=3, padding=1)
        if use_1x1conv:
            # 1 x 1 convolution so the shortcut matches the output shape
            self.conv3 = nn.Conv2d(input_channels, num_channels,
                                   kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        Y += X  # residual (shortcut) connection
        return F.relu(Y)
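To see the effect of use_1x1conv and strides, a quick shape check (usage follows the d2l convention for this block):
blk = Residual(3, 3)
X = torch.rand(4, 3, 6, 6)
print(blk(X).shape)  # torch.Size([4, 3, 6, 6]): shape preserved

blk = Residual(3, 6, use_1x1conv=True, strides=2)
print(blk(X).shape)  # torch.Size([4, 6, 3, 3]): channels increased, height/width halved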
(2) ResNet model:
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            # First residual block of a stage: halve height/width, change channels
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True, strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk

b2 = nn.Sequential(*resnet_block(64, 64, 2, first_block=True))
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))

# Assemble the full network: global average pooling, flatten, and a linear
# output layer (the standard ResNet-18 assembly used in d2l)
net = nn.Sequential(b1, b2, b3, b4, b5,
                    nn.AdaptiveAvgPool2d((1, 1)),
                    nn.Flatten(), nn.Linear(512, 10))
(3) Training results:
lr, num_epochs, batch_size = 0.05, 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())

VII. This week's Cats vs. Dogs challenge:
This Cats vs. Dogs task uses transfer learning with a pretrained ResNet-50.
(1) Define the network:
from torchvision import models

resnet50 = models.resnet50(pretrained=True)
print(resnet50)

# Freeze all pretrained parameters; only the new classifier head will be trained
for param in resnet50.parameters():
    param.requires_grad = False

# Replace the final fully connected layer with a 2-class classifier head
fc_inputs = 2048
resnet50.fc = nn.Sequential(
    nn.Linear(fc_inputs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, 2),
    nn.LogSoftmax(dim=1))

resnet50 = resnet50.to("cuda")
print(resnet50.fc)

(2) Training and testing:
criterion = nn.NLLLoss()
lr = 0.001
optimizer_resnet50 = torch.optim.Adam(resnet50.parameters())

def train_model(model, dataloader, size, epochs=1, optimizer=None):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        running_corrects = 0
        count = 0
        for inputs, classes in dataloader:
            inputs = inputs.to("cuda")
            classes = classes.to("cuda")
            outputs = model(inputs)
            loss = criterion(outputs, classes)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, preds = torch.max(outputs.data, 1)
            # statistics
            running_loss += loss.data.item()
            running_corrects += torch.sum(preds == classes.data)
            count += len(inputs)
            print('Training: No. ', count, ' process ... total: ', size)
        epoch_loss = running_loss / size
        epoch_acc = running_corrects.data.item() / size
        print('Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))

# Train the model
train_model(resnet50, train_iter, size=len(train_datasets), epochs=1,
            optimizer=optimizer_resnet50)
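The notes only show the training loop; below is a minimal evaluation sketch in the same style (it assumes a test dataloader built the same way as train_iter; the names here are illustrative):
def eval_model(model, dataloader, size):
    model.eval()
    corrects = 0
    with torch.no_grad():
        for inputs, classes in dataloader:
            inputs = inputs.to("cuda")
            classes = classes.to("cuda")
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            corrects += torch.sum(preds == classes).item()
    print('Test Acc: {:.4f}'.format(corrects / size))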

(3) Submission results:

