LSTM on the Ascend Platform: A Full-Framework Guide [2]
Covers LSTM models across multiple frameworks: Keras, TensorFlow, PyTorch, and MindSpore (ms).
4 PyTorch + NPU AI Core Model Training
From this point on we actually invoke the Ascend NPU AI Core to accelerate training.
The changes below are made on top of the code from Section 3.
# 1. Create the model, with .npu() appended at the end to move it onto the NPU
model = AttentionLSTM(input_size=input_size, hidden_size=hidden_size, output_size=output_size, batch_size=batch_size).npu()
# 2. Move each batch onto the NPU as the data loader yields it
for i, (inputs, labels) in enumerate(train_loader):
    inputs, labels = inputs.npu(), labels.npu()
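Putting the two changes together, a minimal end-to-end sketch of the training loop looks like the following. It assumes the Ascend PyTorch adapter (torch_npu) is installed (older adapter versions patch torch directly), and the names criterion, num_epochs, and lr are placeholders standing in for the definitions from Section 3, not part of the original code.

import torch
import torch_npu  # Ascend adapter; registers the .npu() device methods

model = AttentionLSTM(input_size=input_size, hidden_size=hidden_size,
                      output_size=output_size, batch_size=batch_size).npu()
criterion = torch.nn.MSELoss()  # assumed regression loss, as in the earlier sections
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.npu(), labels.npu()  # host -> NPU copy per batch
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()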
5 MindSpore + NPU HBM + NPU AI Core Model Training
import sys, os, time
import pickle
import random
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from scipy import signal
import mindspore as ms
import mindspore.nn as nn
import mindspore.dataset as ds
from mindspore import context, ParameterTuple
from mindspore.context import ParallelMode
from mindspore.nn.wrap.cell_wrapper import WithLossCell
from mindspore.ops import grad
from mindspore.ops.function import broadcast_to
from mindspore.communication import init
from mindspore.train import Model, CheckpointConfig, ModelCheckpoint, LossMonitor
from mindspore.train.callback import EarlyStopping
5.1 Configuring the Processor Used by MindSpore
# Add this as global code in the .py file. Use the Ascend target; if device_id=0 does not
# work, look up your processor's logical ID (note: the logical ID, not the physical ID)
# On older AI Core hardware the target used to be 'DaVinci', but that value is now
# deprecated; 'Ascend' selects HBM and AI Core automatically
ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend", device_id=0)
init()  # initializes the communication backend (needed for multi-device setups)
# To look up the device IDs, run in a terminal:
ls /dev/davinci*
# The number in place of * in the printed names is the physical ID
# Substitute the ID you obtained for * below, then run in a terminal:
npu-smi info -t phyid-remap -p *
# This prints the mapping details
| Field | Description |
|---|---|
| Chip Physical ID | Physical ID of the chip |
| Chip Logic ID | Logical ID of the chip |
| NPU ID | Device ID |
| Chip ID | Chip ID |
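If you prefer to do the lookup from Python rather than the shell, the following small helper (my own sketch, not from the original) lists the davinci device nodes and extracts the physical IDs from their names:

import glob, re

# Each /dev/davinci<N> node corresponds to one NPU; <N> is the physical ID
for dev in sorted(glob.glob("/dev/davinci[0-9]*")):
    phy_id = re.search(r"davinci(\d+)$", dev).group(1)
    print(f"{dev} -> physical ID {phy_id}")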
5.2 Adjusting the Network Definition
class AttentionLSTM(nn.Cell):
    def __init__(self, input_size, hidden_size, output_size, batch_size):
        super(AttentionLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.attention = nn.Dense(hidden_size * 2, hidden_size)
        self.out = nn.Dense(hidden_size * 2, output_size)
        # instantiate the operators once here rather than inside construct()
        self.softmax = ms.ops.Softmax(axis=1)
        self.transpose = ms.ops.Transpose()
        self.batmatmul = ms.ops.BatchMatMul()
        self.relu = nn.ReLU()

    def construct(self, input):
        # LSTM encoding
        output, (hidden, cell) = self.lstm(input)
        # compute the attention weights
        attn_weights = self.softmax(self.attention(output))
        # compute the attention vectors
        attn_vectors = self.batmatmul(self.transpose(attn_weights, (0, 2, 1)), output)
        # pass the attention vectors through ReLU and the output layer for the final result
        output = self.relu(attn_vectors)
        output = self.out(output)
        return output
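As a quick sanity check (my own snippet, with the illustrative 16×400×32 sizes from the Summary), you can push a dummy batch through the network and inspect the output shape. Note that with this architecture the output comes out as (batch, hidden_size, output_size), which is why the prediction step in 5.5 averages over axis 1.

net = AttentionLSTM(input_size=32, hidden_size=64, output_size=1, batch_size=16)
x = ms.Tensor(np.random.randn(16, 400, 32), ms.float32)  # (batch, seq_len, features)
y = net(x)
print(y.shape)  # (16, 64, 1): (batch, hidden_size, output_size)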
5.3 Adjusting the Data Loader
# The model has requirements on the input data format; the setup below follows the
# MindSpore examples and was verified by testing. The conclusion:
# train_x, train_y
# val_x, val_y
# test_x, test_y -- these three pairs can start out in any format
# Define the iterable source for the dataset
class MyIterable:
    def __init__(self, data, label):
        self._index = 0
        self._data = data
        self._label = label

    def __next__(self):
        if self._index >= len(self._data):
            raise StopIteration
        item = (self._data[self._index], self._label[self._index])
        self._index += 1
        return item

    def __iter__(self):
        self._index = 0
        return self

    def __len__(self):
        return len(self._data)
train_generator = ds.GeneratorDataset(source=MyIterable(train_x.asnumpy(), train_y.asnumpy()), column_names=["data", "label"], shuffle=True)
val_generator = ds.GeneratorDataset(source=MyIterable(val_x.asnumpy(), val_y.asnumpy()), column_names=["data", "label"], shuffle=True)
test_generator = ds.GeneratorDataset(source=MyIterable(test_x.asnumpy(), test_y.asnumpy()), column_names=["data", "label"])
train_loader = train_generator.batch(batch_size=batch_size)
val_loader = val_generator.batch(batch_size=batch_size)
test_loader = test_generator.batch(batch_size=batch_size)
train_loader = train_loader.repeat(1)
val_loader = val_loader.repeat(1)
test_loader = test_loader.repeat(1)
# Although repeat(1) looks like a no-op, it actually adjusts the dataset's type: the
# repeated dataset can be fed straight into training, while the bare batch() output cannot
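To confirm the loaders produce what the network expects, a short check (my own snippet) pulls one batch and prints its shapes:

for data, label in train_loader.create_tuple_iterator():
    print(data.shape, label.shape)  # e.g. (16, 400, 32) and the matching label shape
    break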
5.4 Training the Network
input_size = ...
hidden_size = ...
output_size = ...
batch_size = ...
lr = ...
network = AttentionLSTM(input_size=input_size, hidden_size=hidden_size, output_size=output_size, batch_size=batch_size)
# Define the optimizer
optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
net_loss = nn.MSELoss()
# Initialize the model; the metrics are what gets reported on the validation set
model = Model(network, loss_fn=net_loss, optimizer=optimizer, metrics={'loss', 'mae', 'mse'})
model.fit(
    epoch=50,
    train_dataset=train_loader,
    valid_dataset=val_loader,
    callbacks=[LossMonitor(train_x.shape[0] // batch_size)]
)
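Since EarlyStopping is already imported above, wiring it in looks roughly like the sketch below. The parameter values are illustrative, not from the original; see the caveats in the Summary before relying on it.

early_stop = EarlyStopping(monitor='eval_loss', patience=20)  # stop if eval loss stalls for 20 epochs
model.fit(
    epoch=500,
    train_dataset=train_loader,
    valid_dataset=val_loader,
    callbacks=[LossMonitor(train_x.shape[0] // batch_size), early_stop]
)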
5.5 Model Prediction
y_pred = model.predict(test_x)
y_pred = y_pred.mean(axis=1)  # collapse axis 1 of the (batch, hidden_size, output_size) output
y_true = test_y.mean(axis=1)
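With sklearn's mean_squared_error already imported as mse, evaluating the prediction takes one more line (my own addition; remember to move the Tensors back to NumPy first):

rmse = mse(y_true.asnumpy(), y_pred.asnumpy(), squared=False)  # squared=False gives RMSE
print("test RMSE:", rmse)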
6 Summary
Do not use early stopping at the very start of a project. Train for a fixed budget first (a few hundred epochs, say), and only bring early stopping in once repeated runs suggest the model overfits. In the experiments above, slow loss decline combined with too small a patience value left the overall training behavior unclear: it was hard to tell overfitting from underfitting. So use it with caution.
The batch size also interacts with the size of the network. With batch_size = 16, for example, each step here processes a 16×400×32 tensor (batch × sequence length × features). If the network does not pin down its input and output shapes, choosing the batch size becomes a design decision of its own.
