PyTorch Activation Functions & Softmax
Activation Functions
The Sigmoid/Logistic function can suffer from vanishing gradients (when its derivative approaches 0).
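A quick check of this claim (my sketch, not part of the original notes): for large |x| the sigmoid saturates at 0 or 1, so its derivative sigmoid(x)*(1-sigmoid(x)) collapses toward zero.
import torch
# sigmoid saturation demo: gradients vanish for large |x|
x = torch.tensor([-100., -10., 0., 10., 100.], requires_grad=True)
y = torch.sigmoid(x)
y.sum().backward()   # d(sum sigmoid(x))/dx_i = sigmoid(x_i)*(1-sigmoid(x_i))
x.grad               # ~0 everywhere except near x=0 (0.25 at x=0)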
MSE
\[\begin{aligned}
&loss = \sum[y-(xw+b)]^2\\
&\text{L2-norm} = ||y-(xw+b)||_2\\
&loss = ||y-(xw+b)||_2^2
\end{aligned}
\]
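A small verification (my addition; the values below are arbitrary) that the sum form equals the squared L2 norm, and how F.mse_loss relates to it (it averages by default):
import torch
import torch.nn.functional as F
x, y = torch.rand(5), torch.rand(5)
w, b = 2., 0.5
pred = x * w + b
loss_sum = ((y - pred) ** 2).sum()                               # sum[y-(xw+b)]^2
loss_norm = torch.norm(y - pred) ** 2                            # ||y-(xw+b)||_2^2
torch.allclose(loss_sum, loss_norm)                              # True
torch.allclose(loss_sum, F.mse_loss(pred, y, reduction='sum'))   # True; default reduction is 'mean'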
Derivative
\[\begin{aligned}
&loss = \sum[y-f_{\theta}(x)]^2\\
&\frac{\nabla loss}{\nabla\theta}=-2\sum[y-f_{\theta}(x)]\cdot\frac{\nabla f_{\theta}(x)}{\nabla\theta}
\end{aligned}
\]
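Plugging in the values used in the code below (x = 1, w = 2, target y = 1) matches the outputs that follow:
\[\begin{aligned}
&loss = [y - xw]^2 = (1-2)^2 = 1\\
&\frac{\nabla loss}{\nabla w} = -2[y - xw]\cdot x = -2(1-2)\cdot 1 = 2
\end{aligned}
\]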
import torch
import torch.nn.functional as F
# autograd.grad
x = torch.ones(1)
w = torch.full([1], 2.)                  # note: requires_grad is False by default
mse = F.mse_loss(torch.ones(1), x*w)     # target = 1, pred = x*w = 2 -> (1-2)^2 = 1
mse
tensor(1.)
torch.autograd.grad(y, [x1, x2, ..., xn])   # returns (dy/dx1, ..., dy/dxn)
# torch.autograd.grad(mse, [w])   # fails: w was created with requires_grad=False
"""
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_2932/274727774.py in <module>
----> 1 torch.autograd.grad(mse, [w])
~\AppData\Local\Programs\Python\Python37\lib\site-packages\torch\autograd\__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
234 return Variable._execution_engine.run_backward(
235 outputs, grad_outputs_, retain_graph, create_graph,
--> 236 inputs, allow_unused, accumulate_grad=False)
237
238
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
"""
w.requires_grad_(), w, w.shape
(tensor([2.], requires_grad=True),
tensor([2.], requires_grad=True),
torch.Size([1]))
# torch.autograd.grad(mse, [w])   # still fails: mse was built before w required grad, so the graph must be rebuilt
"""
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
......
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
"""
mse = F.mse_loss(torch.ones(1), x*w)   # rebuild the graph now that w requires grad
torch.autograd.grad(mse, [w])
(tensor([2.]),)
# loss.backward
mse = F.mse_loss(torch.ones(1), x*w)
mse.backward()
w.grad, w.grad.norm()
(tensor([2.]), tensor(2.))
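As a cross-check (my addition, reusing x and w from above), the analytic gradient from the derivative formula agrees with what autograd stored in w.grad:
with torch.no_grad():
    manual_grad = -2 * (torch.ones(1) - x * w) * x   # -2*(y - xw)*x with y = 1
manual_grad, w.grad    # both tensor([2.])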
Softmax
\[\begin{aligned}
\left[
\begin{array}{c}
y_1 \\
y_2 \\
\vdots \\
y_n
\end{array}
\right]
&\rightarrow \left[S(y_i)=\frac{e^{y_i}}{\sum_{j}e^{y_j}}\right]\rightarrow
\left[
\begin{array}{c}
p_1 \\
p_2 \\
\vdots \\
p_n
\end{array}
\right]\\
&\sum_{i} p_i = p_1 + p_2 + \dots + p_n = 1
\end{aligned}
\]
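A short check (my addition) that F.softmax matches the formula above and that the outputs sum to 1:
logits = torch.rand(4)
p_manual = torch.exp(logits) / torch.exp(logits).sum()
p = F.softmax(logits, dim=0)
torch.allclose(p, p_manual), p.sum()   # True; the probabilities sum to (numerically) 1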
Derivative (when i = j)
\[\begin{aligned}
&p_i=\frac{e^{a_i}}{\sum^{N}_{k=1}e^{a_k}} \qquad & \text{when } i=j\\
&\frac{\partial p_i}{\partial a_j}=\frac{\partial}{\partial a_j}\frac{e^{a_i}}{\sum^{N}_{k=1}e^{a_k}} \qquad &\frac{\partial}{\partial a_j}\frac{e^{a_i}}{\sum^{N}_{k=1}e^{a_k}} =\frac{e^{a_i}\sum_{k=1}^{N}e^{a_k}-e^{a_j}e^{a_i}}{(\sum_{k=1}^{N}e^{a_k})^2}\\
&f(x)=\frac{g(x)}{h(x)} \qquad &=\frac{e^{a_i}(\sum^{N}_{k=1}e^{a_k}-e^{a_j})}{(\sum_{k=1}^{N}e^{a_k})^2}\\
&f'(x)=\frac{g'(x)h(x)-h'(x)g(x)}{h(x)^2} \qquad &=\frac{e^{a_i}}{\sum_{k=1}^{N}e^{a_k}} \cdot \frac{\sum^{N}_{k=1}e^{a_k}-e^{a_j}}{\sum_{k=1}^{N}e^{a_k}}\\
&g(x)=e^{a_i} \qquad &=p_i(1-p_j)\\
&h(x)=\sum^{N}_{k=1}e^{a_k}
\end{aligned}
\]
Derivative (when i ≠ j)
\[\begin{aligned}
&p_i=\frac{e^{a_i}}{\sum^{N}_{k=1}e^{a_k}} \qquad & \text{when } i\neq j\\
&\frac{\partial p_i}{\partial a_j}=\frac{\partial}{\partial a_j}\frac{e^{a_i}}{\sum^{N}_{k=1}e^{a_k}} \qquad &\frac{\partial}{\partial a_j}\frac{e^{a_i}}{\sum^{N}_{k=1}e^{a_k}} =\frac{0-e^{a_j}e^{a_i}}{(\sum_{k=1}^{N}e^{a_k})^2}\\
&f(x)=\frac{g(x)}{h(x)} \qquad &=\frac{-e^{a_j}}{\sum_{k=1}^{N}e^{a_k}} \cdot \frac{e^{a_i}}{\sum_{k=1}^{N}e^{a_k}}\\
&f'(x)=\frac{g'(x)h(x)-h'(x)g(x)}{h(x)^2} \qquad &=-p_j p_i\\
&g(x)=e^{a_i}\\
&h(x)=\sum^{N}_{k=1}e^{a_k}
\end{aligned}
\]
a = torch.rand(3)
a.requires_grad_()
tensor([0.2645, 0.5876, 0.9008], requires_grad=True)
#p = F.softmax(a, dim=0)
#p.backward()   # fails: p is a vector; backward() is only implicit for scalar outputs
'''
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
......
RuntimeError: grad can be implicitly created only for scalar outputs
'''
p = F.softmax(a, dim=0)
torch.autograd.grad(p[1], [a], retain_graph=True)   # one row of the Jacobian: dp_1/da = [-p_1 p_0, p_1(1-p_1), -p_1 p_2]
(tensor([-0.0757, 0.2188, -0.1431]),)
torch.autograd.grad(p[2], [a])                      # dp_2/da = [-p_2 p_0, -p_2 p_1, p_2(1-p_2)]
(tensor([-0.1036, -0.1431, 0.2467]),)
Derivative (combined)
\[\begin{aligned}
&\frac{\partial p_i}{\partial a_j}=
\begin{cases}
p_i(1-p_j) & i=j\\
-p_j p_i & i\neq j
\end{cases}\\
&\text{or, using the Kronecker delta } \delta_{ij}=
\begin{cases}
1 & \text{if } i=j\\
0 & \text{if } i\neq j
\end{cases}\\
&\frac{\partial p_i}{\partial a_j}=p_i(\delta_{ij}-p_j)
\end{aligned}
\]
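To tie the closed form back to autograd (my sketch; assumes a PyTorch version that provides torch.outer and torch.autograd.functional.jacobian): dp_i/da_j = p_i(δ_ij − p_j) is exactly the matrix diag(p) − p pᵀ, and it matches the Jacobian computed numerically.
from torch.autograd.functional import jacobian
a = torch.rand(3)
p = F.softmax(a, dim=0)
analytic = torch.diag(p) - torch.outer(p, p)           # p_i*(delta_ij - p_j) as a matrix
numeric = jacobian(lambda t: F.softmax(t, dim=0), a)   # full 3x3 Jacobian via autograd
torch.allclose(analytic, numeric)                      # True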
Written by Carrawayang