cnn_layers.py

Functions implementing the forward and backward passes of a convolutional neural network.

#-*- coding: utf-8 -*-
import numpy as np
from layers import *
from bn_layers import *


def conv_forward_naive(x, w, b, conv_param):
    """
    卷积前向传播。
    Input:
    - x: 四维图片数据(N, C, H, W)分别表示(数量,色道,高,宽)
    - w: 四维卷积核(F, C, HH, WW)分别表示(下层色道,上层色道,高,宽)
    - b: 偏置项(F,)
    - conv_param: 字典型参数表,其键值为:
        - 'stride':跳跃数据卷积的跨幅数量
        - 'pad':输入数据的零填充数量

    Returns 元组型:
    - out: 输出数据(N, F, H', W') ,其中 H' 和 W' 分别为:
        H' = 1 + (H + 2 * pad - HH) / stride
        W' = 1 + (W + 2 * pad - WW) / stride
    - cache: (x, w, b, conv_param)
    """
    out = None
    #############################################################################
    #           TASK: implement the forward pass of the convolutional layer    #
    #           HINT: you can use np.pad for zero padding                      #
    #############################################################################
    # Unpack the input dimensions: batch size, channels, height, width
    N, C, H, W = x.shape
    # Number of filters and filter height/width
    F, HH, WW = w.shape[0], w.shape[2], w.shape[3]
    # Amount of zero padding applied to the input
    pad = conv_param['pad']
    # Stride between adjacent receptive fields
    stride = conv_param['stride']
    # Zero-pad the spatial dimensions only
    x_pad = np.pad(x, ((0,), (0,), (pad,), (pad,)), 'constant')
    
    # Output spatial dimensions
    Hhat = int(1 + (H + 2 * pad - HH) / stride)
    What = int(1 + (W + 2 * pad - WW) / stride)
    # Allocate the output
    out = np.zeros([N, F, Hhat, What])
    # Loop over every sample, filter, and output location
    for n in range(N):
        for f in range(F):
            for i in range(Hhat):
                for j in range(What):
                    xx = x_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
                    out[n, f, i, j] = np.sum(xx * w[f]) + b[f]
    #############################################################################
    #                              END OF YOUR CODE                            #
    #############################################################################
    cache = (x, w, b, conv_param)
    return out, cache
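
A minimal sanity check for conv_forward_naive, run as a separate script (it assumes the file above is saved as cnn_layers.py on the Python path); the output shape should match the formula in the docstring:

import numpy as np
from cnn_layers import conv_forward_naive

x = np.random.randn(2, 3, 8, 8)        # 2 images, 3 channels, 8x8 pixels
w = np.random.randn(4, 3, 3, 3)        # 4 filters of size 3x3
b = np.zeros(4)
conv_param = {'stride': 1, 'pad': 1}   # "same" padding for a 3x3 filter
out, _ = conv_forward_naive(x, w, b, conv_param)
assert out.shape == (2, 4, 8, 8)       # H' = 1 + (8 + 2*1 - 3)/1 = 8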


def conv_forward_fast(x, w, b, conv_param):
    '''
    A fast version of the convolution forward pass.

    Parameters
    ----------
    x : Input data of shape (N, C, H, W): (batch size, channels, height, width)
    w : Filter weights of shape (F, C, HH, WW): (number of filters, input channels, height, width)
    b : Biases of shape (F,)
    conv_param : A dictionary with the following keys:
        - 'stride': number of pixels the filter moves between adjacent receptive fields
        - 'pad': number of zero-padding pixels applied to the input

    Returns
    -------
    out : Output data of shape (N, F, H', W') where H' and W' are given by
        H' = 1 + (H + 2 * pad - HH) / stride
        W' = 1 + (W + 2 * pad - WW) / stride
    cache : (x, w, b, conv_param)

    '''
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']
    assert (W + 2 * pad - WW) % stride == 0, 'invalid width'
    assert (H + 2 * pad - HH) % stride == 0, 'invalid height'
    # Zero padding
    p = pad
    x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)),
                      mode='constant')
    # Compute the output dimensions
    H += 2 * pad
    W += 2 * pad
    out_h = int((H - HH) / stride + 1)
    out_w = int((W - WW) / stride + 1)
    shape = (C, HH, WW, N, out_h, out_w)
    strides = (H * W, W, 1, C * H * W, stride * W, stride)
    strides = x.itemsize * np.array(strides)
    x_stride = np.lib.stride_tricks.as_strided(x_padded,
                                shape=shape, strides=strides)
    x_cols = np.ascontiguousarray(x_stride)
    x_cols.shape = (C * HH * WW, N * out_h * out_w)
    # Reshape each filter into a row and multiply by the column matrix
    res = w.reshape(F, -1).dot(x_cols) + b.reshape(-1, 1)
    # Reshape the result back to (N, F, out_h, out_w)
    res.shape = (F, N, out_h, out_w)
    out = res.transpose(1, 0, 2, 3)
    out = np.ascontiguousarray(out)
    cache = (x, w, b, conv_param)
    return out, cache
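
A quick consistency check between the fast and the naive forward pass, again as a separate sketch assuming cnn_layers.py is importable; the two results should agree to floating-point precision:

import numpy as np
from cnn_layers import conv_forward_naive, conv_forward_fast

x = np.random.randn(2, 3, 7, 7)
w = np.random.randn(5, 3, 3, 3)
b = np.random.randn(5)
conv_param = {'stride': 2, 'pad': 1}
out_naive, _ = conv_forward_naive(x, w, b, conv_param)
out_fast, _ = conv_forward_fast(x, w, b, conv_param)
print(np.max(np.abs(out_naive - out_fast)))   # ~1e-14, i.e. numerically identical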



def conv_backward_naive1(dout, cache):
    """
    卷积层反向传播显式循环版本

    Inputs:
    - dout:上层梯度.
    - cache: 前向传播时的缓存元组 (x, w, b, conv_param) 

    Returns 元组:
    - dx:    x梯度
    - dw:    w梯度
    - db:    b梯度
    """
    dx, dw, db = None, None, None
    #############################################################################
    #           TASK: implement the backward pass of the convolutional layer   #
    #############################################################################
    x, w, b, conv_param = cache
    P = conv_param['pad']
    x_pad = np.pad(x,((0,),(0,),(P,),(P,)),'constant')
    N, C, H, W = x.shape
    F, C, HH, WW = w.shape
    N, F, Hh, Hw = dout.shape
    S = conv_param['stride']
    dw = np.zeros((F, C, HH, WW))
    for fprime in range(F):
        for cprime in range(C):
            for i in range(HH):
                for j in range(WW):
                    sub_xpad =x_pad[:,cprime,i:i+Hh*S:S,j:j+Hw*S:S]
                    dw[fprime,cprime,i,j] = np.sum(
                            dout[:,fprime,:,:]*sub_xpad)

    
    db = np.zeros((F))
    for fprime in range(F):
                db[fprime] = np.sum(dout[:,fprime,:,:])
    dx = np.zeros((N, C, H, W))
    
    for nprime in range(N):
        for i in range(H):
            for j in range(W):
                for f in range(F):
                    for k in range(Hh):
                        for l in range(Hw):
                            mask1 = np.zeros_like(w[f,:,:,:])
                            mask2 = np.zeros_like(w[f,:,:,:])
                            if (i+P-k*S)<HH and (i+P-k*S)>= 0:
                                    mask1[:,i+P-k*S,:] = 1.0
                            if (j+P-l* S) < WW and (j+P-l*S)>= 0:
                                    mask2[:,:,j+P-l*S] = 1.0
                            w_masked=np.sum(w[f,:,:,:]*mask1*mask2,axis=(1,2))
                            dx[nprime,:,i,j] +=dout[nprime,f,k,l]*w_masked
    #############################################################################
    #                              END OF YOUR CODE                            #
    #############################################################################
    return dx, dw, db


def conv_backward_naive(dout, cache):
    """
    卷积层反向传播

    Inputs:
    - dout:上层梯度.
    - cache: 前向传播时的缓存元组 (x, w, b, conv_param) 

    Returns 元组:
    - dx:    x梯度
    - dw:    w梯度
    - db:    b梯度
    """
    dx, dw, db = None, None, None
    #############################################################################
    #           TASK: implement the backward pass of the convolutional layer   #
    #############################################################################
    x, w, b, conv_param = cache
    # Unpack the parameters
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']
    # Output spatial dimensions
    H_out = int(1+(H+2*pad-HH)/stride)
    W_out = int(1+(W+2*pad-WW)/stride)
    # Zero-pad the input
    x_pad = np.pad(x, ((0,), (0,), (pad,), (pad,)),
                   mode='constant', constant_values=0)
    # Allocate the gradients
    dx = np.zeros_like(x)
    dx_pad = np.zeros_like(x_pad)
    dw = np.zeros_like(w)
    db = np.zeros_like(b)
    # The bias gradient sums dout over everything except the filter axis
    db = np.sum(dout, axis=(0, 2, 3))
    for i in range(H_out):
        for j in range(W_out):
            x_pad_masked = x_pad[:, :, i*stride:i*stride+HH, 
                                 j*stride:j*stride+WW]
            # Accumulate dw
            for k in range(F):
                dw[k, :, :, :] += np.sum(x_pad_masked*(dout[:, k, i, j])[:,
                            None, None, None], axis=0)
            
            # Accumulate dx_pad
            for n in range(N):
                dx_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW] += \
                    np.sum((w[:, :, :, :]*(dout[n, :, i, j])[:, None, None, None]),
                           axis=0)
                    
    dx = dx_pad[:, :, pad:-pad, pad:-pad]
    #############################################################################
    #                              END OF YOUR CODE                            #
    #############################################################################
    return dx, dw, db
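
The backward pass can be verified against a numerical gradient. The sketch below uses a small central-difference helper written just for this check (num_grad is not part of the file above) and assumes cnn_layers.py is importable:

import numpy as np
from cnn_layers import conv_forward_naive, conv_backward_naive

def num_grad(f, x, dout, h=1e-5):
    # Central-difference approximation of d(sum(f(x) * dout)) / dx
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = x[idx]
        x[idx] = old + h
        pos = f(x)
        x[idx] = old - h
        neg = f(x)
        x[idx] = old
        grad[idx] = np.sum((pos - neg) * dout) / (2 * h)
        it.iternext()
    return grad

x = np.random.randn(2, 3, 5, 5)
w = np.random.randn(2, 3, 3, 3)
b = np.random.randn(2)
conv_param = {'stride': 1, 'pad': 1}
out, cache = conv_forward_naive(x, w, b, conv_param)
dout = np.random.randn(*out.shape)
dx, dw, db = conv_backward_naive(dout, cache)
dx_num = num_grad(lambda v: conv_forward_naive(v, w, b, conv_param)[0], x, dout)
print(np.max(np.abs(dx - dx_num)))   # should be tiny, around 1e-8 or smaller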


def max_pool_forward_naive(x, pool_param):
    """
    最大池化前向传播

    Inputs:
    - x: 数据 (N, C, H, W)
    - pool_param: 键值:
        - 'pool_height': 池化高
        - 'pool_width': 池化宽
        - 'stride': 步幅

    Returns 元组型:
    - out: 输出数据
    - cache: (x, pool_param)
    """
    out = None
    #############################################################################
    #              TASK: implement the forward pass of max pooling             #
    #############################################################################
    # Unpack the parameters
    N, C, H, W = x.shape
    HH = pool_param['pool_height']
    WW = pool_param['pool_width']
    stride = pool_param['stride']
    
    # Output spatial dimensions
    H_out = int((H-HH)/stride+1) 
    W_out = int((W-WW)/stride+1)
    out = np.zeros((N, C, H_out, W_out))
    for i in range(H_out):
        for j in range(W_out):
            # Slice out the pooling window
            x_masked = x[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
            # Take the maximum over the window
            out[:, :, i, j] = np.max(x_masked, axis=(2,3))
    #############################################################################
    #                              END OF YOUR CODE                            #
    #############################################################################
    cache = (x, pool_param)
    return out, cache
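
A small worked example (a sketch, assuming cnn_layers.py is importable): 2x2 pooling with stride 2 keeps the largest value in each non-overlapping 2x2 block and halves the spatial size:

import numpy as np
from cnn_layers import max_pool_forward_naive

x = np.arange(16, dtype=float).reshape(1, 1, 4, 4)
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
out, _ = max_pool_forward_naive(x, pool_param)
print(out[0, 0])   # [[ 5.  7.]
                   #  [13. 15.]]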


def max_pool_forward_fast(x, pool_param):
    '''
    A fast version of the max pooling forward pass.

    Parameters
    ----------
    x : Input data of shape (N, C, H, W): (batch size, channels, height, width)
    pool_param : dictionary with keys:
        - 'pool_height': height of the pooling window
        - 'pool_width': width of the pooling window
        - 'stride': stride between pooling windows

    Returns
    -------
    out : Output data
    cache : (x, x_reshaped, out)
    '''
    # Unpack the parameters
    N, C, H, W = x.shape
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']
    
    assert pool_height == pool_width == stride, 'Invalid pool params'
    assert H % pool_height == 0
    assert W % pool_height == 0
    
    x_reshaped = x.reshape(N, C, int(H / pool_height), pool_height,
                                                int(W / pool_width), pool_width)
    out = x_reshaped.max(axis=3).max(axis=4)

    cache = (x, x_reshaped, out)
    return out, cache


def max_pool_backward_naive(dout, cache):
    """
    最大池化反向传播.

    Inputs:
    - dout: 上层梯度
    - cache: 缓存 (x, pool_param)
    Returns:
    - dx:    x梯度
    """
    dx = None
    #############################################################################
    #              TASK: implement the backward pass of max pooling            #
    #############################################################################
    x, pool_param = cache
    N, C, H, W = x.shape
    HH = pool_param['pool_height']
    WW = pool_param['pool_width']
    stride = pool_param['stride']
    H_out = int((H-HH)/stride+1)
    W_out = int((W-WW)/stride+1)
    dx = np.zeros_like(x)
    for i in range(H_out):
        for j in range(W_out):
            x_masked = x[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
            max_x_masked = np.max(x_masked, axis=(2, 3))
            temp_binary_mask = (x_masked == (max_x_masked)[:, :, None, None])
            dx[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW] += \
                temp_binary_mask*(dout[:, :, i, j])[:, :, None, None]
    #############################################################################
    #                              END OF YOUR CODE                            #
    #############################################################################
    return dx


def max_pool_backward_fast(dout, cache):
    x, x_reshaped, out = cache
    dx_reshaped = np.zeros_like(x_reshaped)
    out_newaxis = out[:, :, :, np.newaxis, :, np.newaxis]
    mask = (x_reshaped == out_newaxis)
    dout_newaxis = dout[:, :, :, np.newaxis, :, np.newaxis]
    dout_broadcast, _ = np.broadcast_arrays(dout_newaxis, dx_reshaped)
    dx_reshaped[mask] = dout_broadcast[mask]
    dx_reshaped /= np.sum(mask, axis=(3, 5), keepdims=True)
    dx = dx_reshaped.reshape(x.shape)
    return dx


def spatial_batchnorm_forward(x, gamma, beta, bn_param):
    """
    空间批量归一化前向传播
    
    Inputs:
    - x: 数据 (N, C, H, W)
    - gamma: 缩放因子 (C,)
    - beta: 偏移因子 (C,)
    - bn_param: 参数字典:
        - mode: 'train' or 'test';
        - eps: 数值稳定常数
        - momentum: 运行平均值衰减因子
        - running_mean: 形状为(D,) 的运行均值
        - running_var :形状为 (D,) 的运行方差
        
    Returns 元组:
    - out:输出 (N, C, H, W)
    - cache: 用于反向传播的缓存
    """
    out, cache = None, None
    #############################################################################
    #        TASK: implement the forward pass of spatial batch normalization   #
    #        HINT: you only need to reshape the data and call batchnorm_forward#
    #############################################################################
    N, C, H, W = x.shape
    temp_output, cache = batchnorm_forward(
        x.transpose(0, 3, 2, 1).reshape(N*H*W, C), gamma, beta, bn_param)
    out = temp_output.reshape(N, W, H, C).transpose(0, 3, 2, 1)
    #############################################################################
    #                              END OF YOUR CODE                            #
    #############################################################################

    return out, cache
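
After spatial batch normalization with gamma=1 and beta=0, each channel should have roughly zero mean and unit variance over the (N, H, W) axes. A sketch, assuming cnn_layers.py and bn_layers.py are importable:

import numpy as np
from cnn_layers import spatial_batchnorm_forward

x = 10 + 4 * np.random.randn(2, 3, 4, 5)
gamma, beta = np.ones(3), np.zeros(3)
out, _ = spatial_batchnorm_forward(x, gamma, beta, {'mode': 'train'})
print(out.mean(axis=(0, 2, 3)))   # ~[0. 0. 0.]
print(out.std(axis=(0, 2, 3)))    # ~[1. 1. 1.]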


def spatial_batchnorm_backward(dout, cache):
    """
        空间批量归一化反向传播
    
    Inputs:
    - dout: 上层梯度 (N, C, H, W)
    - cache: 前向传播缓存
    
    Returns 元组:
    - dx:输入梯度 (N, C, H, W)
    - dgamma: gamma梯度 (C,)
    - dbeta: beta梯度 (C,)
    """
    dx, dgamma, dbeta = None, None, None
    #############################################################################
    #        TASK: implement the backward pass of spatial batch normalization  #
    #        HINT: reshape the data and call batchnorm_backward_alt            #
    #############################################################################
    N, C, H, W = dout.shape
    dx_temp, dgamma, dbeta = batchnorm_backward_alt(
        dout.transpose(0, 3 , 2, 1).reshape((N*H*W, C)), cache)
    dx = dx_temp.reshape(N, W, H, C).transpose(0, 3, 2, 1)
    #############################################################################
    #                              END OF YOUR CODE                            #
    #############################################################################
    return dx, dgamma, dbeta
    

def conv_relu_forward(x, w, b, conv_param):
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    out, relu_cache = relu_forward(a)
    cache = (conv_cache, relu_cache)
    return out, cache


def conv_relu_backward(dout, cache):
    conv_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = conv_backward_naive(da, conv_cache)
    return dx, dw, db


def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    s, relu_cache = relu_forward(a)
    out, pool_cache = max_pool_forward_fast(s, pool_param)
    cache = (conv_cache, relu_cache, pool_cache)
    return out, cache


def conv_relu_pool_backward(dout, cache):
    '''
    Backward pass for the conv-relu-pool layer.

    Parameters
    ----------
    dout : Upstream gradients of shape (N, C, H, W)
    cache : (conv_cache, relu_cache, pool_cache)

    Returns
    -------
    dx : gradient with respect to x
    dw : gradient with respect to w
    db : gradient with respect to b
    '''
    conv_cache, relu_cache, pool_cache = cache
    ds = max_pool_backward_fast(dout, pool_cache)
    da = relu_backward(ds, relu_cache)
    dx, dw, db = conv_backward_naive(da, conv_cache)
    return dx, dw, db
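
An end-to-end shape check for the conv-relu-pool sandwich layer (a sketch, assuming cnn_layers.py and the layers.py listed below are importable):

import numpy as np
from cnn_layers import conv_relu_pool_forward, conv_relu_pool_backward

x = np.random.randn(2, 3, 16, 16)
w = np.random.randn(3, 3, 3, 3)
b = np.random.randn(3)
conv_param = {'stride': 1, 'pad': 1}
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
out, cache = conv_relu_pool_forward(x, w, b, conv_param, pool_param)
print(out.shape)                       # (2, 3, 8, 8)
dx, dw, db = conv_relu_pool_backward(np.random.randn(*out.shape), cache)
print(dx.shape, dw.shape, db.shape)    # (2, 3, 16, 16) (3, 3, 3, 3) (3,)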

layers.py

The forward and backward passes written earlier, plus the softmax loss function.

#-*- coding: utf-8 -*-
import numpy as np

def affine_forward(x, w, b):
    """
    计算神经网络当前层的前馈传播。该方法计算在全连接情况下的得分函数
    注:如果不理解affine仿射变换,简单的理解为在全连接情况下的得分函数即可

    输入数据x的形状为(N, d_1, ..., d_k),其中N表示数据量,(d_1, ..., d_k)表示
    每一通道的数据维度。如果是图片数据就为(长,宽,色道),数据的总维度就为
    D = d_1 * ... * d_k,因此我们需要数据整合成完整的(N,D)形式再进行仿射变换。
    
    Inputs:
    - x: 输入数据,其形状为(N, d_1, ..., d_k)的numpy array
    - w: 权重矩阵,其形状为(D,M)的numpy array,D表示输入数据维度,M表示输出数据维度
             可以将D看成输入的神经元个数,M看成输出神经元个数
    - b: 偏置向量,其形状为(M,)的numpy array
    
    Returns 元组:
    - out: 形状为(N, M)的输出结果
    - cache: 将输入进行缓存(x, w, b)
    """
    out = None
    # Task: implement the fully connected forward pass.
    # Note: reshape the input into rows first.
    N = x.shape[0]
    x_new = x.reshape(N, -1)  # flatten each sample into a row
    out = np.dot(x_new, w) + b
    cache = (x, w, b)
    return out, cache
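
affine_forward flattens each sample into a row before the matrix multiply, so a batch of images works directly. A minimal sketch, assuming layers.py is importable:

import numpy as np
from layers import affine_forward

x = np.random.randn(4, 3, 8, 8)     # e.g. 4 images of shape (3, 8, 8)
w = np.random.randn(3 * 8 * 8, 10)  # D = 3*8*8 = 192 inputs, M = 10 outputs
b = np.zeros(10)
out, _ = affine_forward(x, w, b)
print(out.shape)                    # (4, 10)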


def affine_backward(dout, cache):
    """
 计算仿射层的反向传播.

    Inputs:
    - dout: 形状为(N, M)的上层梯度
    - cache: 元组:
        - x: (N, d_1, ... d_k)的输入数据
        - w: 形状为(D, M)的权重矩阵

    Returns 元组:
    - dx: 输入数据x的梯度,其形状为(N, d1, ..., d_k)
    - dw: 权重矩阵w的梯度,其形状为(D,M)
    - db: 偏置项b的梯度,其形状为(M,)
    """
    x, w, b = cache
    dx, dw, db = None, None, None
    # Note: reshape x to (N, D) before computing the gradients,
    # then reshape dx back to the original shape of x.
    db = np.sum(dout, axis=0)
    xx = x.reshape(x.shape[0], -1)
    dw = np.dot(xx.T, dout)
    dx = np.dot(dout, w.T)
    dx = np.reshape(dx, x.shape)
    return dx, dw, db


def relu_forward(x):
    """
    计算tified linear units (ReLUs)激活函数的前向传播,并保存相应缓存

    Input:
    - x: 输入数据

    Returns 元组:
    - out: 和输入数据x形状相同
    - cache: x
    """
    out = None
    # 实现ReLU 的前向传播.                                                                        #
    out =np.maximum(0,x)
    cache = x
    return out, cache


def relu_backward(dout, cache):
    """
    计算 rectified linear units (ReLUs)激活函数的反向传播.

    Input:
    - dout: 上层误差梯度
    - cache: 输入 x,其形状应该和dout相同

    Returns:
    - dx: x的梯度
    """
    dx, x = None, cache
    # 实现 ReLU 反向传播.    
    dx=dout
    dx[x<=0]=0
    return dx

def affine_relu_forward(x, w, b):
    """
     ReLU神经元前向传播

    Inputs:
    - x: 输入到 affine层的数据
    - w, b:    affine层的权重矩阵和偏置向量

    Returns 元组:
    - out: Output from the ReLU的输出结果
    - cache: 前向传播的缓存
    """
    # 你需要调用affine_forward以及relu_forward函数,并将各自的缓存保存在cache中                                                                    #
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache


def affine_relu_backward(dout, cache):
    """
     ReLU神经元的反向传播
     
    Input:
    - dout: 上层误差梯度
    - cache: affine缓存,以及relu缓存

    Returns:
    - dx: 输入数据x的梯度
    - dw: 权重矩阵w的梯度
    - db: 偏置向量b的梯度
    """
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db




def softmax_loss(x, y):
    """
    Softmax loss and its gradient with respect to the scores x.

    Inputs:
    - x: scores of shape (N, C)
    - y: labels of shape (N,)

    Returns a tuple of (loss, dx).
    """
    # Shift the scores for numerical stability before exponentiating
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N

    return loss, dx
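
For near-uniform random scores over C classes the softmax loss should be close to log(C), which is a handy sanity check for newly initialized networks. A sketch, assuming layers.py is importable:

import numpy as np
from layers import softmax_loss

N, C = 100, 10
scores = 0.001 * np.random.randn(N, C)
y = np.random.randint(C, size=N)
loss, dx = softmax_loss(scores, y)
print(loss, np.log(C))   # both ~2.3
print(dx.shape)          # (100, 10)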

trainer.py

A decoupled trainer implementation.

#-*- coding: utf-8 -*-
import numpy as np

import updater


class Trainer(object):
    """
    使用形式:
    
    data = {
        'X_train': # 训练数据
        'y_train': # 训练类标
        'X_val': # 验证数据
        'X_train': # 验证类标
    }
    model = MyAwesomeModel(hidden_size=100, reg=10)
    Trainer = Trainer(model, data,
                                    update_rule='sgd',
                                    updater_config={
                                        'learning_rate': 1e-3,
                                    },
                                    lr_decay=0.95,
                                    num_epochs=10, batch_size=100,
                                    print_every=100)
    Trainer.train()
    """

    def __init__(self, model, data, **kwargs):
        """
        构造一个新的Trainer实例
        必须参数:
        - model: 网络模型
        - data: 数据字典,其中:
            'X_train':    形状为(N_train, d_1, ..., d_k)训练数据
            'X_val':    形状为(N_val, d_1, ..., d_k) 验证数据
            'y_train':    形状为(N_train,) 训练数据类标
            'y_val':    形状为(N_val,) 验证数据类标
            
        可选参数:
        - update_rule: 更新规则,其存放在updater.py文件中,默认选项为'sgd'。
        - updater_config: 字典类型的,更新规则所对应的超参数配置,同见updater.py文件。
        - lr_decay: 学习率衰减系数。
        - batch_size: 批量数据大小
        - num_epochs: 训练周期
        - print_every: 整数型; 每迭代多少次进行打印一次中间结果
        - verbose: 布尔型; 是否在训练期间打印中间结果
        """
        self.model = model
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_val = data['X_val']
        self.y_val = data['y_val']
        
        # Pop the optional arguments and apply the configuration
        self.update_rule = kwargs.pop('update_rule', 'sgd')
        self.updater_config = kwargs.pop('updater_config', {})
        self.lr_decay = kwargs.pop('lr_decay', 1.0)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.num_epochs = kwargs.pop('num_epochs', 10)

        self.print_every = kwargs.pop('print_every', 10)
        self.verbose = kwargs.pop('verbose', True)

        # Raise an error for any unrecognized optional argument
        if len(kwargs) > 0:
            extra = ', '.join('"%s"' % k for k in kwargs.keys())
            raise ValueError('Unrecognized arguments %s' % extra)


        # Make sure the update rule exists in updater
        if not hasattr(updater, self.update_rule):
            raise ValueError('Invalid update_rule "%s"' % self.update_rule)
        self.update_rule = getattr(updater, self.update_rule)

        # Initialize bookkeeping variables
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Make a deep copy of updater_config for each parameter
        self.updater_configs = {}
        for p in self.model.params:
            d = {k: v for k, v in self.updater_config.items()}
            self.updater_configs[p] = d


    def _step(self):
        """
        执行单步梯度更新
        """
        # 采样批量数据
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        # Compute the loss and the gradients
        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        # Update the parameters
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.updater_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.updater_configs[p] = next_config


    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        """
     根据提供的数据检验精度,若数据集过大,可进行采样测试。
        
        Inputs:
        - X: 形状为(N, d_1, ..., d_k)的数据
        - y: 形状为 (N,)的数据类标
        - num_samples: 采样次数
        - batch_size:批量数据大小
            
        Returns:
        - acc: 测试数据正确率
        """
        
        # 对数据进行采样
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        # Compute the accuracy in batches
        num_batches = int(N / batch_size)
        if N % batch_size != 0:
            num_batches += 1
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)

        return acc


    def train(self):
        """
        根据配置训练模型
        """
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train / self.batch_size, 1)
        num_iterations = int(self.num_epochs * iterations_per_epoch)
    
        for t in range(num_iterations):
            self._step()
    
            # Print the loss
            if self.verbose and t % self.print_every == 0:
                print('(Iteration %d / %d) loss: %f' % (
                             t + 1, num_iterations, self.loss_history[-1]))

            # Decay the learning rate at the end of each epoch
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.updater_configs:
                    self.updater_configs[k]['learning_rate'] *= self.lr_decay


            # Check accuracy at the start, at the end, and after every epoch
            first_it = (t == 0)
            last_it = (t == num_iterations - 1)
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(self.X_train, self.y_train,
                                                                                num_samples=1000)
                val_acc = self.check_accuracy(self.X_val, self.y_val)
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)

                if self.verbose:
                    print('(Epoch %d / %d) train acc: %f; val acc: %f' % (
                                 self.epoch, self.num_epochs, train_acc, val_acc))

                # Keep track of the best model
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k, v in self.model.params.items():
                        self.best_params[k] = v.copy()
    
        # After training has finished, swap in the best parameters
        self.model.params = self.best_params


updater.py

A decoupled updater responsible for updating the network weights. Each update rule takes the weights w, the gradient dw of the current weights, and the corresponding update configuration.

#-*- coding: utf-8 -*-
import numpy as np

"""
频繁使用在训练神经网络中的一阶梯度更新规则。每次更新接受当前的权重,
对应的梯度,以及相关配置进行权重更新。
def update(w, dw, config=None):
Inputs:
    - w:当前权重.
    - dw: 和权重形状相同的梯度.
    - config: 字典型超参数配置,比如学习率,动量值等。如果更新规则需要用到缓存,
        在配置中需要保存相应的缓存。

Returns:
    - next_w: 更新后的权重.
    - config: 更新规则相应的配置.
"""


def sgd(w, dw, config=None):
    """
    随机梯度下降更新规则.

    config 格式:
    - learning_rate: 学习率.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)

    w -= config['learning_rate'] * dw
    return w, config



def sgd_momentum(w, dw, config=None):
    """
    动量随机梯度下降更新规则

    config 格式:
    - learning_rate: 学习率.
    - momentum: [0,1]的动量,0表示不使用动量,退化为SGD
    - velocity: 和w,dw同形的速度
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)
    v = config.setdefault('velocity', np.zeros_like(w))
    
    next_w = None
    v =config['momentum']*config['velocity'] - config['learning_rate'] * dw
    next_w = w + v
    config['velocity'] = v

    return next_w, config



def rmsprop(w, dw, config=None):
    """
    RMSProp 更新规则

    config 格式:
    - learning_rate: 学习率.
    - decay_rate:用于衰减历史梯度值的衰减率,取值为[0,1]
    - epsilon: 避免除零异常的小数.
    - cache:历史梯度缓存.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('decay_rate', 0.99)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('cache', np.zeros_like(w))

    next_w = None
    config['cache'] = config['decay_rate'] * config['cache'] + (1 - config['decay_rate']) * dw**2
    next_w = w - config['learning_rate'] * dw /(np.sqrt(config['cache'] + config['epsilon']))

    return next_w, config


def adam(w, dw, config=None):
    """
    使用 Adam更新规则 ,融合了“热身”更新

    config 格式:
    - learning_rate: 学习率.
    - beta1: 动量衰减率.
    - beta2: 学习步长衰减率.
    - epsilon: 防除0小数.
    - m: 梯度.
    - v: 梯度平方.
    - t: 迭代次数.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(w))
    config.setdefault('v', np.zeros_like(w))
    config.setdefault('t', 0)
    
    next_w = None
    # Store the updated weights in next_w and keep m, v, t in config.
    config['t'] += 1
    beta1 = config['beta1']
    beta2 = config['beta2']
    epsilon = config['epsilon']
    learning_rate = config['learning_rate']
    config['m'] = beta1 * config['m'] + (1-beta1) * dw
    config['v'] = beta2 * config['v'] + (1-beta2) * dw**2
    mb = config['m']/(1 - beta1**config['t'])
    vb = config['v']/(1 - beta2**config['t'])
    next_w = w - learning_rate * mb / (np.sqrt(vb)+epsilon)
    return next_w, config
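
A single Adam step on a toy parameter (a sketch, assuming updater.py is importable). Because of the bias correction, the very first step moves each weight by roughly the learning rate, independent of the gradient scale:

import numpy as np
from updater import adam

w = np.zeros(5)
dw = np.ones(5)
config = {'learning_rate': 1e-1}
w, config = adam(w, dw, config)
print(w)              # each entry is approximately -0.1
print(config['t'])    # 1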


bn_layers.py

Forward and backward passes for the batch normalization (BN) algorithm.

#-*- coding: utf-8 -*-
import numpy as np
from layers import *
from dropout_layers import *

def batchnorm_forward(x, gamma, beta, bn_param):
    """

    使用使用类似动量衰减的运行时平均,计算总体均值与方差 例如:
    
    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var
    Input:
    - x: 数据(N, D)
    - gamma: 缩放参数 (D,)
    - beta: 平移参数 (D,)
    - bn_param: 字典型,使用下列键值:
        - mode: 'train' 或'test'; 
        - eps: 保证数值稳定
        - momentum: 运行时平均衰减因子 
        - running_mean: 形状为(D,)的运行时均值
        - running_var : 形状为 (D,)的运行时方差

    Returns 元组:
    - out: 输出(N, D)
    - cache: 用于反向传播的缓存
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        # Forward pass
        # Step 1 - shape of mu (D,)
        mu = 1 / float(N) * np.sum(x, axis=0)
        # Step 2 - shape of var (N,D)
        xmu = x - mu
        # Step 3 - shape of carre (N,D)
        carre = xmu**2
        # Step 4 - shape of var (D,)
        var = 1 / float(N) * np.sum(carre, axis=0)
        # Step 5 - Shape sqrtvar (D,)
        sqrtvar = np.sqrt(var + eps)
        # Step 6 - Shape invvar (D,)
        invvar = 1. / sqrtvar
        # Step 7 - Shape va2 (N,D)
        va2 = xmu * invvar
        # Step 8 - Shape va3 (N,D)
        va3 = gamma * va2
        # Step 9 - Shape out (N,D)
        out = va3 + beta
        running_mean = momentum * running_mean + (1.0 - momentum) * mu
        running_var = momentum * running_var + (1.0 - momentum) * var
        cache = (mu, xmu, carre, var, sqrtvar, invvar,va2, va3, gamma, beta, x, bn_param)
    elif mode == 'test':
        # Normalize the data with the running mean and variance
        mu = running_mean
        var = running_var
        xhat = (x - mu) / np.sqrt(var + eps)
        # Scale and shift the data with gamma and beta.
        out = gamma * xhat + beta
        cache = (mu, var, gamma, beta, bn_param)
    else:
        raise ValueError('Unrecognized batchnorm mode: "%s"' % mode)

    # Update the running mean and variance
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache
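
In 'train' mode the batch statistics are used (and the running averages are updated); in 'test' mode only the stored running averages are used. A sketch, assuming bn_layers.py is importable:

import numpy as np
from bn_layers import batchnorm_forward

x = 5 + 2 * np.random.randn(200, 3)
gamma, beta = np.ones(3), np.zeros(3)
bn_param = {'mode': 'train'}
out, _ = batchnorm_forward(x, gamma, beta, bn_param)
print(out.mean(axis=0), out.std(axis=0))   # ~[0 0 0], ~[1 1 1]
bn_param['mode'] = 'test'
out_test, _ = batchnorm_forward(x, gamma, beta, bn_param)   # uses running_mean/var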


def batchnorm_backward(dout, cache):
    """
    BN反向传播 
    Inputs:
    - dout: 上层梯度 (N, D)
    - cache: 前向传播时的缓存.
    
    Returns 元组:
    - dx: 数据梯度 (N, D)
    - dgamma: gamma梯度 (D,)
    - dbeta: beta梯度 (D,)
    """
    dx, dgamma, dbeta = None, None, None
    
    mu, xmu, carre, var, sqrtvar, invvar, va2, va3, gamma, beta, x, bn_param = cache
    eps = bn_param.get('eps', 1e-5)
    N, D = dout.shape
    # Backprop Step 9
    dva3 = dout
    dbeta = np.sum(dout, axis=0)
    # Backprop step 8
    dva2 = gamma * dva3
    dgamma = np.sum(va2 * dva3, axis=0)
    # Backprop step 7
    dxmu = invvar * dva2
    dinvvar = np.sum(xmu * dva2, axis=0)
    # Backprop step 6
    dsqrtvar = -1. / (sqrtvar**2) * dinvvar
    # Backprop step 5
    dvar = 0.5 * (var + eps)**(-0.5) * dsqrtvar
    # Backprop step 4
    dcarre = 1 / float(N) * np.ones((carre.shape)) * dvar
    # Backprop step 3
    dxmu += 2 * xmu * dcarre
    # Backprop step 2
    dx = dxmu
    dmu = - np.sum(dxmu, axis=0)
    # Basckprop step 1
    dx += 1 / float(N) * np.ones((dxmu.shape)) * dmu
    
    return dx, dgamma, dbeta


def batchnorm_backward_alt(dout, cache):
    """
    可选的BN反向传播
    """
    dx, dgamma, dbeta = None, None, None
    mu, xmu, carre, var, sqrtvar, invvar, va2, va3, gamma, beta, x, bn_param = cache
    eps = bn_param.get('eps', 1e-5)
    N, D = dout.shape
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum((x - mu) * (var + eps)**(-1. / 2.) * dout, axis=0)
    dx = (1./N) * gamma * (var + eps)**(-1./2.)*(N*dout-np.sum(
                        dout, axis=0)-(x-mu)*(var+eps)**(-1.0)*np.sum(dout*(x-mu),axis=0))
 
    return dx, dgamma, dbeta


def affine_bn_relu_forward(x,w,b,gamma, beta,bn_param):
    x_affine,cache_affine= affine_forward(x,w,b)
    x_bn,cache_bn = batchnorm_forward(x_affine,gamma, beta,bn_param)
    out,cache_relu = relu_forward(x_bn)
    cache = (cache_affine,cache_bn,cache_relu)
    return out,cache


def affine_bn_relu_backward(dout,cache):
    cache_affine,cache_bn,cache_relu = cache
    drelu = relu_backward(dout,cache_relu)
    dbn,dgamma, dbeta= batchnorm_backward_alt(drelu,cache_bn)
    dx,dw,db = affine_backward(dbn,cache_affine)
    return dx,dw,db,dgamma,dbeta


dropout_layers.py

Forward and backward passes for dropout, plus a combined affine-ReLU-dropout layer.

#-*- coding: utf-8 -*-
import numpy as np
from layers import *


def dropout_forward(x, dropout_param):
    """
    执行dropout前向传播
    Inputs:
    - x: 输入数据
    - dropout_param: 字典类型,使用下列键值:
        - p: dropout参数。每个神经元的激活概率p
        - mode: 'test'或'train'. 训练模式使用dropout;测试模式仅仅返回输入值。
        - seed: 随机数生成种子. 

    Outputs:
    - out: 和输入数据相同形状
    - cache:元组(dropout_param, mask). 
                  训练模式时,掩码mask用于激活该层神经元,测试模式时不使用
    """
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask = None
    out = None

    if mode == 'train':
        # Inverted dropout: dividing by p keeps the expected activation unchanged
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    elif mode == 'test':
        out = x

    cache = (dropout_param, mask)
    out = out.astype(x.dtype, copy=False)

    return out, cache
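
Because the mask is divided by p (inverted dropout), the expected activation is unchanged, so the train-mode output has roughly the same mean as the input. A sketch, assuming dropout_layers.py is importable:

import numpy as np
from dropout_layers import dropout_forward

x = 10 + np.random.randn(500, 500)
out_train, _ = dropout_forward(x, {'mode': 'train', 'p': 0.4})
out_test, _ = dropout_forward(x, {'mode': 'test', 'p': 0.4})
print(x.mean(), out_train.mean(), out_test.mean())   # all ~10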


def dropout_backward(dout, cache):
    """
    dropout反向传播

    Inputs:
    - dout: 上层梯度
    - cache: dropout_forward中的缓存(dropout_param, mask)。
    """
    dropout_param, mask = cache
    mode = dropout_param['mode']
    
    dx = None
    if mode == 'train':
        dx =dout*mask
    elif mode == 'test':
        dx = dout
    return dx

def affine_relu_dropout_forward(x,w,b,dropout_param):
    """
    组合affine_relu_dropout前向传播
    Inputs:
    - x: 输入数据
    - w: 权重参数
    - b: 偏置项
    - dropout_param: 字典类型,使用下列键值:
        - p: dropout参数。每个神经元的激活概率p
        - mode: 'test'或'train'. 训练模式使用dropout;测试模式仅仅返回输入值。
        - seed: 随机数生成种子. 

    Outputs:
    - out: 和输入数据相同形状
    - cache:缓存包含(cache_affine,cache_relu,cache_dropout)
    """ 
    out_dropout = None
    cache =None
    out_affine, cache_affine = affine_forward(x,w,b)
    out_relu,cache_relu =relu_forward(out_affine)
    out_dropout,cache_dropout =dropout_forward(out_relu,dropout_param)
    cache = (cache_affine,cache_relu,cache_dropout)
    return out_dropout,cache

def affine_relu_dropout_backward(dout,cache):
    """
     affine_relu_dropout神经元的反向传播
     
    Input:
    - dout: 上层误差梯度
    - cache: 缓存(cache_affine,cache_relu,cache_dropout)

    Returns:
    - dx: 输入数据x的梯度
    - dw: 权重矩阵w的梯度
    - db: 偏置向量b的梯度
    """    
    cache_affine,cache_relu,cache_dropout = cache
    dx,dw,db=None,None,None
    ddropout = dropout_backward(dout,cache_dropout)
    drelu = relu_backward(ddropout,cache_relu)
    dx,dw,db = affine_backward(drelu,cache_affine)
    return dx,dw,db

cnn.py

Next we implement a simple, shallow convolutional network consisting of one convolutional layer and two fully connected layers: input - conv - relu - 2x2 max pool - affine - relu - affine - softmax.

#-*- coding: utf-8 -*-
import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))

import numpy as np
from cnn_layers import *

class ThreeLayerConvNet(object):
    """ 
    conv - relu - 2x2 max pool - affine - relu - affine - softmax
    """
    
    def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
                             hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,):
        """
        初始化网络.
        
        Inputs:
        - input_dim: 输入数据形状 (C, H, W)
        - num_filters: 卷积核个数
        - filter_size: 卷积核尺寸
        - hidden_dim: 全连接层隐藏层个数
        - num_classes: 分类个数
        - weight_scale: 权重规模(标准差)
        - reg:权重衰减因子
        """
        self.params = {}
        self.reg = reg
        ############################################################################
        #                     TASK: initialize the weights                         #
        # 'W1' holds the conv weights, shape (num_filters, C, filter_size, filter_size)
        # 'W2' maps the conv output to the hidden layer, shape ((H/2)*(W/2)*num_filters, hidden_dim)
        # 'W3' maps the hidden layer to the output layer                            #
        ############################################################################
        C, H, W = input_dim
        self.params['W1'] = weight_scale*np.random.randn(num_filters, C, 
                    filter_size, filter_size)
        self.params['b1'] = np.zeros(num_filters)
        self.params['W2'] = weight_scale*np.random.randn(int((H/2)*(W/2)*num_filters), 
                    hidden_dim)
        self.params['b2'] = np.zeros(hidden_dim)
        self.params['W3'] = weight_scale*np.random.randn(hidden_dim, num_classes)
        self.params['b3'] = np.zeros(num_classes)
        ############################################################################
        #                              END OF YOUR CODE                            #
        ############################################################################
         
 
    def loss(self, X, y=None):
        
        # Unpack the parameters
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']
        # The convolutional layer
        filter_size = W1.shape[2]
        # Parameters required by the conv and pooling layers
        conv_param = {'stride': 1, 'pad': int((filter_size - 1) / 2)}
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        scores = None
        ############################################################################
        #                     TASK: implement the forward pass                     #
        #              Compute the class scores and store them in scores           #
        ############################################################################
        # Combined conv layer: convolution, ReLU, pooling
        conv_forward_out_1, cache_forward_1 = conv_relu_pool_forward(X, 
                self.params['W1'], self.params['b1'], conv_param, pool_param)
        # Affine layer
        affine_forward_out_2, cache_forward_2 = affine_forward(conv_forward_out_1,
                self.params['W2'], self.params['b2'])
        # ReLU layer
        affine_relu_2, cache_relu_2 = relu_forward(affine_forward_out_2)
        # Affine layer
        scores, cache_forward_3 = affine_forward(affine_relu_2, self.params['W3'],
                self.params['b3'])
        ############################################################################
        #                              END OF YOUR CODE                            #
        ############################################################################
        if y is None:
            return scores
            
        loss, grads = 0, {}
        ############################################################################
        #                    TASK: implement the backward pass                     #
        #               Note: do not forget the weight decay term                  #
        ############################################################################
        loss, dout = softmax_loss(scores, y)
        loss += self.reg*0.5*(np.sum(self.params['W1']**2)
                              +np.sum(self.params['W2']**2)
                              +np.sum(self.params['W3']**2))
        dX3, grads['W3'], grads['b3'] = affine_backward(dout, cache_forward_3)
        dX2 = relu_backward(dX3, cache_relu_2)
        dX2, grads['W2'], grads['b2'] = affine_backward(dX2, cache_forward_2)
        dX1, grads['W1'], grads['b1'] = conv_relu_pool_backward(dX2, cache_forward_1)
        grads['W3'] = grads['W3']+self.reg*self.params['W3']
        grads['W2'] = grads['W2']+self.reg*self.params['W2']
        grads['W1'] = grads['W1']+self.reg*self.params['W1']
        ############################################################################
        #                              END OF YOUR CODE                            #
        ############################################################################
        return loss, grads
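
A quick sanity check for the network (a sketch, assuming cnn.py and the other files in this post are importable): with small random weights and reg=0, the initial softmax loss should be close to log(10) ≈ 2.3 for 10 classes:

import numpy as np
from cnn import ThreeLayerConvNet

model = ThreeLayerConvNet(input_dim=(3, 32, 32), num_filters=3, filter_size=3,
                          hidden_dim=7, num_classes=10, reg=0.0)
X = np.random.randn(5, 3, 32, 32)
y = np.random.randint(10, size=5)
loss, grads = model.loss(X, y)
print(loss)   # ~2.3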
    
