import torch
from torch import nn
from d2l import torch as d2l
# Mini-batch size and number of time steps per sequence.
batch_size = 32
num_steps = 35
# Load the time-machine data iterator and its vocabulary through d2l.
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
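# The next step is to initialize the model parameters. Weights are drawn from
# a Gaussian distribution with standard deviation 0.01, and biases are set to 0.
# The hyperparameter num_hidden defines the number of hidden units. We
# instantiate all weights and biases for the update gate, the reset gate, the
# candidate hidden state, and the output layer.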
def get_params(vocab_size, num_hidden, device):
    """Create the trainable parameters of a GRU with an output layer.

    Weights are sampled from a standard normal distribution scaled by 0.01;
    biases are initialized to zero. Gradient tracking is enabled on every
    returned tensor.

    Args:
        vocab_size: Used as both the input and the output dimension.
        num_hidden: Number of hidden units.
        device: Device on which all tensors are allocated.

    Returns:
        A list of 11 tensors in the order
        ``[w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q]``.
    """
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # Small random weights: N(0, 1) samples scaled by 0.01.
        return torch.randn(size=shape, device=device) * 0.01

    def three():
        # One (input->hidden weight, hidden->hidden weight, bias) triple.
        return (normal((num_inputs, num_hidden)),
                normal((num_hidden, num_hidden)),
                torch.zeros(num_hidden, device=device))

    w_xz, w_hz, b_z = three()  # update gate parameters
    w_xr, w_hr, b_r = three()  # reset gate parameters
    w_xh, w_hh, b_h = three()  # candidate hidden state parameters
    # Output layer parameters.
    w_hq = normal((num_hidden, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Enable gradient tracking on every parameter.
    params = [w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params
def init_gru_state(batch_size, num_hidden, device):
    """Return the initial GRU hidden state.

    Args:
        batch_size: Number of rows of the hidden-state tensor.
        num_hidden: Number of hidden units (columns of the tensor).
        device: Device on which the tensor is allocated.

    Returns:
        A 1-tuple containing an all-zero tensor of shape
        ``(batch_size, num_hidden)``.
    """
    return (torch.zeros((batch_size, num_hidden), device=device),)
def gru(inputs, state, params):
    """Run the GRU recurrence over a sequence of inputs.

    Args:
        inputs: Iterable of per-step inputs; each element is multiplied by
            the input-to-hidden weights (assumed to be a
            (batch, vocab_size) tensor -- confirm against the caller).
        state: 1-tuple holding the current hidden state ``h``.
        params: The 11 parameter tensors in the order
            ``[w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q]``.

    Returns:
        A pair ``(outputs, (h,))`` where ``outputs`` is the per-step output
        concatenated along dimension 0 and ``h`` is the final hidden state.
    """
    w_xz, w_hz, b_z, w_xr, w_hr, b_r, w_xh, w_hh, b_h, w_hq, b_q = params
    h, = state
    outputs = []
    for x in inputs:
        # `@` is matrix multiplication.
        z = torch.sigmoid((x @ w_xz) + (h @ w_hz) + b_z)  # update gate
        r = torch.sigmoid((x @ w_xr) + (h @ w_hr) + b_r)  # reset gate
        # Candidate hidden state; the reset gate scales the previous state.
        h_tilda = torch.tanh((x @ w_xh) + ((r * h) @ w_hh) + b_h)
        # Blend the previous state and the candidate using the update gate.
        h = z * h + (1 - z) * h_tilda
        y = h @ w_hq + b_q
        outputs.append(y)
    return torch.cat(outputs, dim=0), (h,)
# Model size and the device returned by d2l.try_gpu().
vocab_size = len(vocab)
num_hidden = 256
device = d2l.try_gpu()
# Training schedule.
num_epochs = 500
lr = 1
# Build the from-scratch GRU model out of the functions defined above,
# then train it on the loaded data.
model = d2l.RNNModelScratch(
    vocab_size, num_hidden, device, get_params, init_gru_state, gru)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)