Custom optimizers in MindSpore
Motivation
Some models rely on optimizers that MindSpore does not provide out of the box, so a custom optimizer has to be implemented.
Both MindSpore and PyTorch let users build custom optimizers from basic Python syntax and the frameworks' operators. In PyTorch, a custom optimizer is written by overriding __init__ and step; see the corresponding PyTorch tutorial for details. MindSpore supports the same pattern by overriding __init__ and construct. Taking Momentum as an example, built from basic small operators:

```python
from mindspore import Parameter, Tensor, nn, ops
from mindspore import dtype as mstype


class MomentumOpt(nn.Optimizer):
    def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0,
                 use_nesterov=False):
        super(MomentumOpt, self).__init__(learning_rate, params, weight_decay, loss_scale)
        self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
        self.moments = self.parameters.clone(prefix="moments", init="zeros")
        self.assign = ops.Assign()

    def construct(self, gradients):
        params = self.parameters
        moments = self.moments
        success = None
        for param, mom, grad in zip(params, moments, gradients):
            # Expressed with small operators:
            #   moment <- momentum * moment + grad
            #   param  <- param - lr * moment
            new_mom = self.momentum * mom + grad
            self.assign(mom, new_mom)
            success = self.assign(param, param - self.learning_rate * new_mom)
        return success
```
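Once defined, the custom optimizer plugs into a training graph exactly like a built-in one. A minimal usage sketch, assuming a hypothetical toy network, loss, and data (none of these names come from the original post):

```python
import mindspore.nn as nn

net = nn.Dense(16, 1)            # placeholder network
loss_fn = nn.MSELoss()
opt = MomentumOpt(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# Wrap model, loss, and the custom optimizer the same way as with nn.Momentum.
train_net = nn.TrainOneStepCell(nn.WithLossCell(net, loss_fn), opt)
train_net.set_train()
# loss = train_net(data, label)  # one optimization step per call
```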
Shortcoming of the official documentation
The MindSpore documentation does not show how to keep track of the optimization step. Recording the step in a plain Python int works in PyNative (dynamic graph) mode, but fails once the optimizer runs in graph (static) mode; see the sketch after the solution below.
Solution
Record the step count in a Parameter.
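A minimal sketch of the idea; the class and attribute names are illustrative, not from the original post. The counter lives in a Parameter and is updated with ops.AssignAdd, so the compiled graph can persist it across calls:

```python
import mindspore.ops as ops
from mindspore import Parameter, Tensor, nn
from mindspore import dtype as mstype


class StepAwareOpt(nn.Optimizer):
    def __init__(self, params, learning_rate):
        super(StepAwareOpt, self).__init__(learning_rate, params)
        # A plain `self.cur_step = 0` would only update in PyNative mode;
        # storing the counter in a Parameter also works under graph compilation.
        self.cur_step = Parameter(Tensor(0, mstype.int32), name="cur_step")
        self.assign_add = ops.AssignAdd()

    def construct(self, gradients):
        # Advance the persistent counter inside the compiled graph
        # (the actual parameter updates are omitted in this sketch).
        step = self.assign_add(self.cur_step, Tensor(1, mstype.int32))
        return step
```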
My own custom optimizer
```python
import numpy as np

from mindspore import Parameter, Tensor
from mindspore import dtype as mstype
from mindspore.nn.optim.optimizer import Optimizer, opt_init_args_register
from mindspore.ops import Add, Assign, Mul, Pow, ReduceSum, Sqrt


class RiemannianAdam(Optimizer):
    """RiemannianAdam optimizer."""

    @opt_init_args_register
    def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8,
                 weight_decay=0.0):
        """Initialize hyper-parameters and per-parameter state."""
        super(RiemannianAdam, self).__init__(learning_rate=learning_rate, parameters=params,
                                             weight_decay=weight_decay)
        self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
        self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
        self.eps = Tensor(np.array([eps]).astype(np.float32))
        self.sum = ReduceSum(keep_dims=True)
        self.sumFalse = ReduceSum(keep_dims=False)
        self.sqrt = Sqrt()
        self.add = Add()
        # First and second moment estimates, one clone per trainable parameter.
        self.exp_avg = self.parameters.clone(prefix='exp_avg', init='zeros')
        self.exp_avg_sq = self.parameters.clone(prefix='exp_avg_sq', init='zeros')
        # Step counter kept in a Parameter so it also works in graph mode.
        self.step = Parameter(Tensor(0, mstype.int32), name='step')
        self.assign = Assign()
        self.pow = Pow()
        self.mul = Mul()

    def construct(self, gradients):
        """Apply one optimization step to all parameters."""
        beta1 = self.beta1
        beta2 = self.beta2
        eps = self.eps
        learning_rate = self.get_lr()
        params = self.parameters
        success = None
        # Advance the step counter once per call (not once per parameter).
        step = self.step + 1
        for exp_avg, exp_avg_sq, param, grad in zip(self.exp_avg, self.exp_avg_sq,
                                                    params, gradients):
            point = param
            if grad is None:
                continue
            # Update the biased first and second moment estimates.
            exp_avg_update = self.add(self.mul(exp_avg, beta1), (1 - beta1) * grad)
            exp_avg_sq_update = self.add(self.mul(exp_avg_sq, beta2),
                                         (1 - beta2) * self.sum(grad * grad, -1))
            denom = self.add(self.sqrt(exp_avg_sq_update), eps)
            # Bias-corrected step size.
            bias_cor1 = 1 - self.pow(beta1, step)
            bias_cor2 = 1 - self.pow(beta2, step)
            step_size = learning_rate * bias_cor2 ** 0.5 / bias_cor1
            direction = exp_avg_update / denom
            new_point = point - step_size * direction
            self.assign(exp_avg, exp_avg_update)
            self.assign(exp_avg_sq, exp_avg_sq_update)
            success = self.assign(param, new_point)
        self.assign(self.step, step)
        return success
```
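To confirm that the Parameter-based counter really advances under graph compilation, the optimizer can be exercised like any built-in one. A hedged sketch; the toy network, data, and label names are placeholders, not from the original project:

```python
from mindspore import context, nn

context.set_context(mode=context.GRAPH_MODE)   # an int-based counter would break here

net = nn.Dense(16, 1)                          # placeholder network
opt = RiemannianAdam(net.trainable_params(), learning_rate=1e-3)
train_net = nn.TrainOneStepCell(nn.WithLossCell(net, nn.MSELoss()), opt)
train_net.set_train()
# After a few calls such as `train_net(data, label)`, opt.step.asnumpy()
# reflects the number of optimizer invocations, even in graph mode.
```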