Batch Normalization Layer (BatchNorm)
I. A handwritten BatchNorm (to understand how it works)
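Batch normalization normalizes each feature with the statistics of the current mini-batch and then rescales it with two learnable parameters γ and β; this is the standard transform that the `y = γ·x̂ + β` line in the code below implements:

$$\hat{x} = \frac{x - \mu_{\mathcal{B}}}{\sqrt{\sigma_{\mathcal{B}}^{2} + \epsilon}}, \qquad y = \gamma \hat{x} + \beta$$

At inference time, the running (moving) mean and variance collected during training are used in place of the batch statistics.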
import torch
from d2l import torch as d2l
from torch import nn

def batch_norm(X, gamma, beta, moving_mean, moving_val, eps, momentum):
    if not torch.is_grad_enabled():  # inference mode: use the running (global) mean and variance
        X_hat = (X - moving_mean) / torch.sqrt(moving_val + eps)
    else:  # training mode
        # The input can only be 2D (from a fully connected layer) or 4D (from a convolutional layer)
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:  # fully connected layer
            mean = X.mean(dim=0)
            val = ((X - mean) ** 2).mean(dim=0)
        else:  # convolutional layer
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            val = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, normalize with the current batch statistics
        X_hat = (X - mean) / torch.sqrt(val + eps)
        # and update the running mean and variance
        moving_mean = momentum * moving_mean + (1 - momentum) * mean
        moving_val = momentum * moving_val + (1 - momentum) * val
    Y = gamma * X_hat + beta  # y = γ·x̂ + β
    return Y, moving_mean.data, moving_val.data

class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:  # 2D input: fully connected layer
            shape = (1, num_features)
        elif num_dims == 4:  # 4D input: convolutional layer
            shape = (1, num_features, 1, 1)
        # Initialize γ, β, moving_mean, moving_val
        self.gamma = nn.Parameter(torch.ones(shape))  # learned by gradient descent
        self.beta = nn.Parameter(torch.zeros(shape))  # learned by gradient descent
        self.moving_mean = torch.zeros(shape)  # running statistics, updated inside batch_norm
        self.moving_val = torch.ones(shape)

    def forward(self, X):
        if self.moving_mean.device != X.device:  # move the running statistics to the GPU if needed
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_val = self.moving_val.to(X.device)
        Y, self.moving_mean.data, self.moving_val.data = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_val, 1e-5, 0.9)
        return Y

net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),
    # 6 = number of output channels of the previous layer; num_dims=4 because it follows a conv layer
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10))

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
d2l.train_ch6(net, train_iter, test_iter, 10, 1.0, d2l.try_gpu())  # 10 epochs, learning rate 1.0
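Before training, the handwritten layer can be sanity-checked on a random tensor. The snippet below is a small illustrative check added here (the batch shape and variable names are made up for the example, and it is not part of the original script): in training mode the per-channel mean should be close to 0 and the standard deviation close to 1, since γ is initialized to 1 and β to 0.

X = torch.randn(8, 6, 24, 24)      # a fake batch: 8 images, 6 channels, 24x24
bn = BatchNorm(6, num_dims=4)

Y = bn(X)                          # gradients enabled -> training branch: batch statistics
print(Y.shape)                     # torch.Size([8, 6, 24, 24]); the shape is unchanged
print(Y.mean(dim=(0, 2, 3)))       # per-channel mean, close to 0
print(Y.std(dim=(0, 2, 3)))        # per-channel std, close to 1

with torch.no_grad():              # gradients disabled -> inference branch: moving_mean / moving_val
    Y_eval = bn(X)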
II. Using PyTorch's built-in BatchNorm (the common approach)
For example, adding BatchNorm to LeNet:
Full code:
import torch
from torch import nn
from d2l import torch as d2l
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10))

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
d2l.train_ch6(net, train_iter, test_iter, 10, 1.0, d2l.try_gpu())
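Unlike the handwritten version, nn.BatchNorm2d stores γ and β as its .weight and .bias parameters and tracks the running statistics itself in .running_mean and .running_var. A quick way to inspect what the first batch-norm layer has learned after training (a small added sketch, not part of the original post):

first_bn = net[1]                         # the nn.BatchNorm2d(6) right after the first conv layer
print(first_bn.weight.reshape((-1,)))     # gamma, one value per channel
print(first_bn.bias.reshape((-1,)))       # beta, one value per channel
print(first_bn.running_mean)              # running mean used at inference time
print(first_bn.running_var)               # running variance used at inference time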
Code analysis:
Batch normalization layers are added on top of the plain LeNet. The point is to allow a larger learning rate and speed up training; the final accuracy stays roughly the same.
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10))
Where the batch normalization layers go:
1. After a convolutional layer: use nn.BatchNorm2d(number of output channels of the previous layer), placed before the activation function.
2. After a fully connected (nn.Linear) layer: use nn.BatchNorm1d(number of output features of the previous layer), again before the activation function (see the short shape sketch below).
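A minimal sketch that makes the two cases concrete (the shapes are made-up examples, reusing the torch / nn imports from above): the argument is always the channel or feature count produced by the previous layer, and the number of input dimensions decides which variant to use.

conv_out = torch.randn(32, 6, 28, 28)        # (N, C, H, W), e.g. the output of nn.Conv2d(1, 6, ...)
print(nn.BatchNorm2d(6)(conv_out).shape)     # torch.Size([32, 6, 28, 28]); normalized per channel

fc_out = torch.randn(32, 120)                # (N, C), e.g. the output of nn.Linear(16 * 5 * 5, 120)
print(nn.BatchNorm1d(120)(fc_out).shape)     # torch.Size([32, 120]); normalized per feature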
