Batch Normalization Layer

The batch normalization layer (BatchNorm)

I. Hand-written BatchNorm (for understanding)
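For reference (a short summary added here, not part of the original code): batch normalization standardizes each feature (fully connected case) or each channel (convolutional case) with mini-batch statistics, then applies a learnable scale and shift,

x̂ = (x − μ) / √(σ² + ε),    y = γ·x̂ + β

where μ and σ² are the batch mean and (biased) variance during training, or the moving averages during inference, and γ, β are learnable. The code below implements exactly this.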

import torch
from torch import nn
from d2l import torch as d2l

def batch_norm(X, gamma, beta, moving_mean, moving_val, eps, momentum):
    if not torch.is_grad_enabled():
        # Inference mode: normalize with the moving (global) mean and variance
        X_hat = (X - moving_mean) / torch.sqrt(moving_val + eps)
    else:
        # Training mode
        # The input comes either from a fully connected layer (2-D) or a conv layer (4-D)
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:  # fully connected layer: per-feature statistics
            mean = X.mean(dim=0)
            val = ((X - mean) ** 2).mean(dim=0)
        else:  # convolutional layer: per-channel statistics over (batch, height, width)
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            val = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, normalize with the current batch statistics...
        X_hat = (X - mean) / torch.sqrt(val + eps)
        # ...and update the moving averages
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_val = momentum * moving_val + (1.0 - momentum) * val
    Y = gamma * X_hat + beta  # scale and shift: y = gamma * x_hat + beta
    return Y, moving_mean.data, moving_val.data

class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:    # after a fully connected layer
            shape = (1, num_features)
        else:                # num_dims == 4, after a convolutional layer
            shape = (1, num_features, 1, 1)
        # Initialize gamma, beta and the moving statistics
        self.gamma = nn.Parameter(torch.ones(shape))   # learnable scale
        self.beta = nn.Parameter(torch.zeros(shape))   # learnable shift
        self.moving_mean = torch.zeros(shape)          # running mean (not trained)
        self.moving_val = torch.ones(shape)            # running variance (not trained)

    def forward(self, X):
        # Move the moving statistics to the same device as X (e.g. the GPU)
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_val = self.moving_val.to(X.device)
        Y, self.moving_mean, self.moving_val = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_val,
            eps=1e-5, momentum=0.9)
        return Y

net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),
    BatchNorm(6, num_dims=4),    # 6 = output channels of the previous conv layer; num_dims=4 since it follows a convolution
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120),
    BatchNorm(120, num_dims=2),  # num_dims=2 since it follows a fully connected layer
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10))

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
d2l.train_ch6(net, train_iter, test_iter, 10, 1.0, d2l.try_gpu())
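A quick sanity check (my own addition, continuing from the code above, not part of the original post): in training mode the handwritten layer and PyTorch's nn.BatchNorm2d both normalize with the biased batch variance and the same default eps=1e-5, and both start with gamma/weight = 1 and beta/bias = 0, so their outputs should agree on a random input.

X = torch.randn(8, 6, 10, 10)      # a random "feature map": batch=8, channels=6
my_bn = BatchNorm(6, num_dims=4)   # handwritten layer defined above
torch_bn = nn.BatchNorm2d(6)       # built-in layer
print(torch.allclose(my_bn(X), torch_bn(X), atol=1e-5))  # expected: True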

II. BatchNorm via the built-in API (the usual approach)

For example, adding BatchNorm to LeNet:

Complete code:
import torch
from torch import nn
from d2l import torch as d2l

net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10))
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
d2l.train_ch6(net, train_iter, test_iter, 10, 1.0, d2l.try_gpu())
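One behavioral note (added remark, not from the original post): the built-in BatchNorm layers switch between batch statistics and the stored running statistics based on net.train() / net.eval(), whereas the handwritten version above keys off torch.is_grad_enabled(). A minimal sketch of running inference after training, assuming the network lives on d2l.try_gpu():

net.eval()                                                # BatchNorm now uses the running mean/variance
with torch.no_grad():
    X = torch.randn(1, 1, 28, 28, device=d2l.try_gpu())  # one Fashion-MNIST-sized random image
    print(net(X).shape)                                   # expected: torch.Size([1, 10])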

Analysis of the code:

Batch normalization layers are added on top of LeNet; the point is that they allow a larger learning rate and therefore faster training, while the final accuracy stays roughly the same.

net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10))

Where the batch normalization layers are placed:

1. After a convolutional layer, use BatchNorm2d(number of output channels of the previous layer), placed before the activation function.

2. After a fully connected layer (Linear), use BatchNorm1d(number of output features of the previous layer), again before the activation function. A small check of the corresponding parameter shapes follows below.
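To illustrate the two points above (a small standalone check added here; the sizes 6 and 120 are taken from the network in this post): the argument of BatchNorm2d / BatchNorm1d must equal the previous layer's number of output channels or features, and each layer keeps one learnable gamma (weight) and beta (bias), plus one running mean/variance entry, per channel.

from torch import nn

bn2d = nn.BatchNorm2d(6)      # follows nn.Conv2d(1, 6, ...): 6 output channels
bn1d = nn.BatchNorm1d(120)    # follows nn.Linear(16 * 5 * 5, 120): 120 output features
print(bn2d.weight.shape, bn2d.bias.shape)                # torch.Size([6]) torch.Size([6])
print(bn1d.running_mean.shape, bn1d.running_var.shape)   # torch.Size([120]) torch.Size([120])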
