# 1 Softmax（10分）

## （a）（5分）

\begin{align*}
\mbox{softmax(x + c)}_i &= \frac{e^{x_i + c}}{\sum_j e^{x_j + c}} \\
&= \frac{e^c e^{x_i}}{\sum_j e^c e^{x_j}} \\
&= \frac{e^c e^{x_i}}{e^c \sum_j e^{x_j}} \\
&= \frac{e^{x_i}}{\sum_j e^{x_j}} \\
&= \mbox{softmax(x)}_i
\end{align*}

## （b）（5分）

# Numerically stable softmax over `x` (a NumPy array), using the shift
# invariance shown in part (a): softmax(x) == softmax(x + c).
if len(x.shape) > 1:
    # Matrix: treat each row as one sample. Subtract the row-wise maximum
    # (axis 1) from every element so that np.exp cannot overflow.
    x = x - np.max(x, axis=1, keepdims=True)
    x_exp = np.exp(x)
    # keepdims=True keeps the row sums as shape (N, 1) so the division
    # broadcasts per row. A plain (N,) sum would either raise for N != D
    # or, when N == D, silently divide column j by the sum of row j.
    x = x_exp / np.sum(x_exp, axis=1, keepdims=True)
else:
    # Vector: a single sample, normalize over all entries.
    x = x - np.max(x)
    x_exp = np.exp(x)
    x = x_exp / np.sum(x_exp)


# 2 神经网络基础（30分）

## （a）（3分）

\begin{align*}
\frac{\partial }{\partial x} \sigma(x) &= \frac{\partial \frac{1}{1+\exp(-x)}}{\partial (1+\exp(-x))}\cdot \frac{\partial (1+\exp(-x))}{\partial x} \\
&= -\frac{1}{(1+\exp(-x))^2} \cdot (-\exp(-x)) \\
&= \frac{1}{1+\exp(-x)} \cdot \frac{\exp(-x)}{1+\exp(-x)} \\
&= \sigma(x)(1-\sigma(x))
\end{align*}

## （b）（3分）

\begin{align*}
\hat{y}_k = \frac{\exp(\theta_k)}{\sum_j \exp(\theta_j)}
\end{align*}

\begin{align*}
\frac{\partial \hat{y}_k}{\partial \theta_k} &= \frac{\partial }{\partial \theta_k} \frac{\exp(\theta_k)}{\sum_j \exp(\theta_j)} \\
&= \frac{\exp(\theta_k)(\sum_j \exp(\theta_j)) - \exp(\theta_k)\exp(\theta_k)}{(\sum_j \exp(\theta_j))^2} \\
&= \frac{\exp(\theta_k)}{\sum_j \exp(\theta_j)}(1 - \frac{\exp(\theta_k)}{\sum_j \exp(\theta_j)}) \\
&= \hat{y}_k(1 - \hat{y}_k)
\end{align*}

\begin{align*}
\frac{\partial \hat{y}_k}{\partial \theta_i} &= \frac{\partial }{\partial \theta_i} \frac{\exp(\theta_k)}{\sum_j \exp(\theta_j)} \\
&= \frac{0 \cdot \sum_j \exp(\theta_j) - \exp(\theta_k)\exp(\theta_i)}{(\sum_j \exp(\theta_j))^2} \\
&= -\hat{y}_k\hat{y}_i
\end{align*}

\begin{align*}
CE(y,\hat{y}) &= -\sum_iy_i\log(\hat{y}_i) \\
&=-\log(\hat{y}_k)
\end{align*}

\begin{align*}
\frac{\partial }{\partial \theta} CE(y,\hat{y}) = -\frac{\partial }{\partial \theta} \log(\hat{y}_k)
\end{align*}

\begin{align*}
\frac{\partial }{\partial \theta_k} CE(y,\hat{y}) &= -\frac{\partial }{\partial \theta_k} \log(\hat{y}_k) \\
&= -\frac{1}{\hat{y}_k} \cdot \hat{y}_k(1 - \hat{y}_k) \\
&= \hat{y}_k - 1
\end{align*}

\begin{align*}
\frac{\partial }{\partial \theta_i} CE(y,\hat{y}) &= -\frac{\partial }{\partial \theta_i} \log(\hat{y}_k) \\
&= -\frac{1}{\hat{y}_k} \cdot (-\hat{y}_k\hat{y}_i) \\
&= \hat{y}_i
\end{align*}

# 3 word2vec（40分）

## （a）（3分）

\begin{align*}
CE(y,\hat{y}) &= -\sum_iy_i\log(\hat{y}_i) \\ &=-\log(\hat{y}_o)
\end{align*}

\begin{align*}
\frac{\partial }{\partial v_c} CE(y,\hat{y}) &= -\frac{\partial }{\partial v_c} \log(\hat{y}_o) \\
&= -\frac{1}{\hat{y}_o} \frac{\partial }{\partial v_c} \frac{\exp(u_o^Tv_c)}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} \\
&= -\frac{1}{\hat{y}_o} \frac{1}{(\sum_{w=1}^{V}\exp(u_w^Tv_c))^2} ((\sum_{w=1}^{V}\exp(u_w^Tv_c))\exp(u_o^Tv_c)u_o -\exp(u_o^Tv_c)\sum_{w=1}^V \exp(u_w^Tv_c)u_w) \\
&= -\frac{1}{\hat{y}_o} \frac{\exp(u_o^Tv_c)}{(\sum_{w=1}^{V}\exp(u_w^Tv_c))^2} ((\sum_{w=1}^{V}\exp(u_w^Tv_c))u_o - \sum_{w=1}^V \exp(u_w^Tv_c)u_w) \\
&= - \frac{1}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} ((\sum_{w=1}^{V}\exp(u_w^Tv_c))u_o - \sum_{w=1}^V \exp(u_w^Tv_c)u_w) \\
&= -(u_o - \frac{\sum_{w=1}^V \exp(u_w^Tv_c)u_w}{\sum_{w=1}^{V}\exp(u_w^Tv_c)}) \\
&= \frac{\sum_{w=1}^V \exp(u_w^Tv_c)u_w}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} - u_o
\end{align*}

## （b）（3分）

当 $k = o$ 时：

\begin{align*}
\frac{\partial }{\partial u_k} CE(y,\hat{y}) &= -\frac{\partial }{\partial u_k} \log(\hat{y}_o) \\
&= -\frac{1}{\hat{y}_o} \frac{\partial }{\partial u_k} \frac{\exp(u_o^Tv_c)}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} \\
&= -\frac{1}{\hat{y}_o} \frac{1}{(\sum_{w=1}^{V}\exp(u_w^Tv_c))^2} ((\sum_{w=1}^{V}\exp(u_w^Tv_c))\exp(u_o^Tv_c)v_c - \exp(u_o^Tv_c)\exp(u_k^Tv_c)v_c) \\
&= -\frac{1}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} ((\sum_{w=1}^{V}\exp(u_w^Tv_c))v_c - \exp(u_k^Tv_c)v_c) \\
&= -(v_c - \hat{y}_k v_c) \\
&= (\hat{y}_k - 1)v_c
\end{align*}

当 $k \neq o$ 时：

\begin{align*}
\frac{\partial }{\partial u_k} CE(y,\hat{y}) &= -\frac{\partial }{\partial u_k} \log(\hat{y}_o) \\
&= -\frac{1}{\hat{y}_o} \frac{\partial }{\partial u_k} \frac{\exp(u_o^Tv_c)}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} \\
&= -\frac{1}{\hat{y}_o} \frac{1}{(\sum_{w=1}^{V}\exp(u_w^Tv_c))^2} (0 - \exp(u_o^Tv_c)\exp(u_k^Tv_c)v_c) \\
&= -\frac{1}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} (- \exp(u_k^Tv_c)v_c) \\
&= \hat{y}_kv_c
\end{align*}

\begin{align*}
\frac{\partial }{\partial u_k} CE(y,\hat{y}) = (\hat{y}_k - y_k)v_c
\end{align*}

## （c） （3分）

\begin{align*}
\frac{\partial }{\partial u_o} J_{neg-sample} &= -\frac{\partial }{\partial u_o} \log(\sigma(u_o^Tv_c)) \\
&= -\frac{1}{\sigma(u_o^Tv_c)} \frac{\partial }{\partial u_o} \sigma(u_o^Tv_c) \\
&= -\frac{1}{\sigma(u_o^Tv_c)} \sigma(u_o^Tv_c)(1-\sigma(u_o^Tv_c))v_c \\
&= (\sigma(u_o^Tv_c)-1)v_c
\end{align*}

\begin{align*}
\frac{\partial }{\partial u_k} J_{neg-sample} &= -\frac{\partial }{\partial u_k} \sum_{i=1}^{K}\log(\sigma(-u_i^Tv_c)) \\
&= -\frac{\partial }{\partial u_k} \log(\sigma(-u_k^Tv_c))  \\
&= -\frac{1}{\sigma(-u_k^Tv_c)} \frac{\partial }{\partial u_k} \sigma(-u_k^Tv_c) \\
&= (1 - \sigma(-u_k^Tv_c))v_c
\end{align*}

\begin{align*}
\frac{\partial }{\partial v_c} J_{neg-sample} &= -\frac{\partial }{\partial v_c}\{\log(\sigma(u_o^Tv_c)) + \sum_{i=1}^{K}\log(\sigma(-u_i^Tv_c))\} \\
&= (\sigma(u_o^Tv_c)-1)u_o + \sum_{i=1}^{K} (1 - \sigma(-u_i^Tv_c))u_i
\end{align*}

## （d）（8分）

1、使用softmax损失函数，再根据3（a）、3（b）算得的结果：

\begin{align*}
\frac{\partial }{\partial v_c} J_{softmax}(w_{t-m \cdots t+m}) &= \frac{\partial }{\partial v_c} \sum_{-m \leq j \leq m, j \neq 0} F(w_{t+j}, v_c) \\
&= \sum_{-m \leq j \leq m, j \neq 0} (\frac{\sum_{w=1}^V \exp(u_w^Tv_c)u_w}{\sum_{w=1}^{V}\exp(u_w^Tv_c)} - u_{t+j}) \\
\end{align*}

\begin{align*}
\frac{\partial }{\partial u_{t+j}} CE(y,\hat{y}) = (\hat{y}_{t+j} - y_{t+j})v_c
\end{align*}

2、并使用negative sampling损失函数，再根据3（c）算得的结果：

\begin{align*}
\frac{\partial }{\partial v_c} J_{skip-gram}(w_{t-m \cdots t+m}) &= \frac{\partial }{\partial v_c} \sum_{-m \leq j \leq m, j \neq 0} F(w_{t+j}, v_c) \\
&= \frac{\partial }{\partial v_c} \sum_{-m \leq j \leq m, j \neq 0} \{ -\log(\sigma(u_{t+j}^Tv_c)) - \sum_{i=1}^{K_{j}} \log(\sigma(-u_{ji}^Tv_c))\} \\
&= \sum_{-m \leq j \leq m, j \neq 0} \{ (\sigma(u_{t+j}^Tv_c)-1)u_{t+j} + \sum_{i=1}^{K_{j}} (1 - \sigma(-u_{ji}^Tv_c))u_{ji} \}
\end{align*}

$\frac{\partial }{\partial u_{t+j}} J_{skip-gram}(w_{t-m \cdots t+m})$、$\frac{\partial }{\partial u_{ji}} J_{skip-gram}(w_{t-m \cdots t+m})$与上题中，对3（b）的改写是一样的。

# 4 情感分析（20分）

## d（3分）

1、GloVe的训练语料（Wikipedia data）要大得多

2、我的模型训练时，基本没有进行参数的调优。比如，词向量维度只有10，而GloVe词向量的维度至少是50

3、GloVe模型本身就要优于word2vec

## g（4分）

1. “dull , lifeless , and amateurishly assembled .”这基本都是负面词汇，这不知道为什么预测得这么离谱。难道是标点符号所占比例太大导致的？
2. “a lackluster , unessential sequel to the classic disney adaptation of j.m. barrie 's peter pan .”这句话出错情有可原吧，毕竟逻辑回归没那么智能。人类一眼就能看出，关键词是前两个形容词lackluster和unessential，极其负面。但无奈的是，后面跟真实情感无关的一些词比较正面，比如classic。

“chilling but uncommercial look into the mind of jeffrey dahmer , serial killer” 转折之后是重点，可惜逻辑回归算法不知道这个常识。

posted on 2018-06-19 11:38  royhoo  阅读(1044)  评论(0)  编辑  收藏  举报