GMM 模型
GMM 模型 EM 算法推导
设 \(\boldsymbol{x}_i|\boldsymbol{c}_i\sim\mathcal{N}(\boldsymbol{\mu}\boldsymbol{c}_i,\boldsymbol{\Sigma}_{\boldsymbol{c}_i}),\boldsymbol{c}_i\sim \text{Categorical}(\pi_1,\cdots,\pi_K)\),其中 \(\boldsymbol{x}=(\boldsymbol{x}_1,\cdots,\boldsymbol{x}_n)\in\mathbb{R}^{d\times n}\) 为观测变量,隐变量 \(\boldsymbol{c}=(\boldsymbol{c}_1,\cdots,\boldsymbol{c}_n)\in\mathbb{R}^{K\times n}\)(\(\boldsymbol{c}_i\) 为 one-hot 向量), \(\boldsymbol{\mu}=(\boldsymbol{\mu}_1,\cdots,\boldsymbol{\mu}_K)\in\mathbb{R}^{d\times K}\),因此模型参数为 \(\boldsymbol{\theta}=(\pi_k,\boldsymbol{\mu}_k,\boldsymbol{\Sigma}_k,k=1,\cdots,K)\)。
对于密度函数 \(q(\boldsymbol{c})\),观测对数似然 \(I_o(\boldsymbol{\theta},\boldsymbol{x})=\log p(\boldsymbol{x};\boldsymbol{\theta})=\log\int p(\boldsymbol{x},\boldsymbol{c};\boldsymbol{\theta})\mathrm{d}\boldsymbol{c}\geqslant\mathbb{E}_q\log\{\frac{p(\boldsymbol{c},\boldsymbol{x};\boldsymbol{\theta})}{q(\boldsymbol{c})}\}:=\operatorname{ELBO}(\boldsymbol{\theta},q)=\mathbb{E}_q\log p(\boldsymbol{c},\boldsymbol{x};\boldsymbol{\theta})-\mathbb{E}_q\log q(\boldsymbol{c})=-\mathrm{KL(q(\boldsymbol{c}),p(\boldsymbol{c}|\boldsymbol{x};\boldsymbol{\theta}))}+\log p(\boldsymbol{x};\boldsymbol{\theta})\)。
记 \(\boldsymbol{c}\) 的后验概率 \(q(\boldsymbol{c}_i=\boldsymbol{1}_k)=p(\boldsymbol{c}_i=\boldsymbol{1}_k|\boldsymbol{x};\boldsymbol{\theta})=p(\boldsymbol{c}_i=\boldsymbol{1}_k|\boldsymbol{x}_i;\boldsymbol{\theta}):=q_{ik}\),那么 \(\displaystyle\operatorname{ELBO}(\boldsymbol{\theta},q)=\sum_i\sum_k q_{ik}\log p(\boldsymbol{x}_i,\boldsymbol{c}_i=\boldsymbol{1}_k;\boldsymbol{\theta})-\sum_i\sum_k q_{ik}\log q_{ik}=\sum_i\sum_k q_{ik}\log (\pi_k\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_k,\boldsymbol{\Sigma}_k))-\sum_i\sum_k q_{ik}\log q_{ik},(\hat{\boldsymbol{\theta}},\hat{q})=\argmax_{\sum_k \pi_k=1,\sum_k q_{ik}=1}\operatorname{ELBO}(\boldsymbol{\theta},q)\)。
EM 算法即使用块坐标上升算法求解该优化问题:
- E 步:\(\displaystyle q^r=\argmax_{\sum_k q_{ik}=1}\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},q)\)
- 设 \(L(q,\boldsymbol{\lambda})=-\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},q)+\sum_i \lambda_i(\sum_k q_{ik}-1)\),令 \(\frac{\partial L(q,\boldsymbol{\lambda})}{\partial q_{ik}}=-\log p(\boldsymbol{x}_i,\boldsymbol{c}_i=\boldsymbol{1}_k;\boldsymbol{\theta}^{r-1})+1+\log q_{ik}+\lambda_i=0\),联立 \(\sum_k q_{ik}=1\) 得 \(\displaystyle q_{ik}^r=\frac{p(\boldsymbol{x}_i,\boldsymbol{c}_i=\boldsymbol{1}_k;\boldsymbol{\theta}^{r-1})}{\sum_j p(\boldsymbol{x}_i,\boldsymbol{c}_i= \boldsymbol{1}_{j};\boldsymbol{\theta}^{r-1})}=\frac{\pi_k^{r-1}\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_k^{r-1},\boldsymbol{\Sigma}_k^{r-1})}{\sum_j \pi_j^{r-1}\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_j^{r-1},\boldsymbol{\Sigma}_j^{r-1})}\)。
- M 步:\(\displaystyle\boldsymbol{\theta}^r=\argmax_{\sum_k \pi_k=1}\operatorname{ELBO}(\boldsymbol{\theta},q^r)\)
- 设 \(L(\boldsymbol{\theta},\boldsymbol{\lambda})=-\operatorname{ELBO}(\boldsymbol{\theta},q^r)+ \lambda(\sum_k \pi_{k}-1)\),令 \(\displaystyle\frac{\partial L(\boldsymbol{\theta},\boldsymbol{\lambda})}{\partial \pi_k}=-\sum_i \frac{q_{ik}^r}{\pi_k}+\lambda=0,\ \frac{\partial L(\boldsymbol{\theta},\boldsymbol{\lambda})}{\partial \boldsymbol{\mu}_k}=-\sum_i q_{ik}^r\boldsymbol{\Sigma}_k^{-1}(\boldsymbol{x}_i-\boldsymbol{\mu}_k)=\boldsymbol{0},\ \frac{\partial L(\boldsymbol{\theta},\boldsymbol{\lambda})}{\partial \boldsymbol{\Sigma}_k}=\frac{1}{2}\sum_i q_{ik}^r\big[\boldsymbol{\Sigma}_k^{-1}-\boldsymbol{\Sigma}_k^{-1}(\boldsymbol{x}_i-\boldsymbol{\mu}_k)(\boldsymbol{x}_i-\boldsymbol{\mu}_k)^\top\boldsymbol{\Sigma}_k^{-1}\big]=\boldsymbol{O}\),
联立 \(\sum_k \pi_k=1\) 解得 \(\displaystyle\pi_k^r=\frac{\sum_i q_{ik}^r}{n},\ \boldsymbol{\mu}_k^r=\frac{\sum_i q_{ik}^r\boldsymbol{x}_i}{\sum_i q_{ik}^r},\ \boldsymbol{\Sigma}_k^r=\frac{\sum_i q_{ik}^r(\boldsymbol{x}_i-\boldsymbol{\mu}_k^r)(\boldsymbol{x}_i-\boldsymbol{\mu}_k^r)^\top}{\sum_i q_{ik}^r}\)。
代码实现
# Load required packages: mvtnorm (multivariate normal density), ggplot2
# (plotting), umap (2-D embedding for visualization), mclust (adjusted
# Rand index).
library(mvtnorm)
library(ggplot2)
library(umap)
library(mclust)
data(iris)
X <- as.matrix(iris[, 1:4])
n <- nrow(X) # sample size
d <- ncol(X) # feature dimension
K <- 3 # number of clusters (iris has 3 species)
true_labels <- as.integer(iris$Species)
# GMM clustering via the EM algorithm (block coordinate ascent on the ELBO).
gmm_em <- function(X, K, max_iter = 100, tol = 1e-6) {
  # Fit a Gaussian mixture model with EM.
  #
  # Args:
  #   X:        n x d numeric data matrix (rows = observations).
  #   K:        number of mixture components.
  #   max_iter: maximum number of EM iterations.
  #   tol:      stop once the ELBO changes by less than this amount.
  #
  # Returns a list:
  #   cluster: hard assignments (argmax of the responsibilities).
  #   elbo:    ELBO trace up to the stopping iteration.
  #   params:  list(pi, mu, Sigma) of fitted parameters.
  n <- nrow(X)
  d <- ncol(X)

  # Row-wise log N(x; mean, Sigma) via the Cholesky factor: numerically
  # stable and avoids forming Sigma^{-1} explicitly (base R only).
  log_dmvnorm <- function(X, mean, Sigma) {
    R <- chol(Sigma)                               # Sigma = R'R, R upper tri
    z <- backsolve(R, t(X) - mean, transpose = TRUE)  # z = R'^{-1}(x - mean)
    -0.5 * (d * log(2 * pi) + colSums(z^2)) - sum(log(diag(R)))
  }

  # Initialization: uniform weights, K random data points as means,
  # identity covariances.
  Pi <- rep(1 / K, K)
  mu <- X[sample(n, K), , drop = FALSE]
  Sigma <- array(diag(d), dim = c(d, d, K))
  elbo <- numeric(max_iter)

  for (r in seq_len(max_iter)) {
    # E-step: responsibilities q_ik computed in log space with the
    # log-sum-exp trick. (The original multiplied raw densities, which
    # can underflow to 0 and yield NaN after row normalization.)
    log_joint <- matrix(0, n, K)
    for (k in seq_len(K)) {
      log_joint[, k] <- log(Pi[k] + 1e-300) + log_dmvnorm(X, mu[k, ], Sigma[, , k])
    }
    row_max <- apply(log_joint, 1, max)
    q <- exp(log_joint - row_max)   # subtract row max before exponentiating
    q <- q / rowSums(q)             # normalize responsibilities

    # M-step: closed-form updates of pi_k, mu_k, Sigma_k.
    Nk <- colSums(q)
    Pi <- Nk / n
    for (k in seq_len(K)) {
      mu[k, ] <- colSums(q[, k] * X) / Nk[k]
      Xc <- sweep(X, 2, mu[k, ], "-")
      # Weighted scatter; a small ridge keeps Sigma_k positive definite.
      # (The original ridge of 1e-16 is below double precision relative to
      # typical variances and therefore ineffective.)
      Sigma[, , k] <- crossprod(Xc, Xc * q[, k]) / Nk[k] + diag(1e-6, d)
    }

    # ELBO with the freshly updated parameters:
    # sum_ik q_ik log(pi_k N(x_i; mu_k, Sigma_k)) - sum_ik q_ik log q_ik.
    for (k in seq_len(K)) {
      log_joint[, k] <- log(Pi[k] + 1e-300) + log_dmvnorm(X, mu[k, ], Sigma[, , k])
    }
    elbo[r] <- sum(q * log_joint) - sum(q * log(q + 1e-16))

    # Convergence check on the ELBO gain.
    if (r > 1 && abs(elbo[r] - elbo[r - 1]) < tol) break
  }

  list(cluster = apply(q, 1, which.max),
       elbo = elbo[1:r],
       params = list(pi = Pi, mu = mu, Sigma = Sigma))
}
# Run EM and align cluster labels with the true classes.
set.seed(1)
result <- gmm_em(X, K)
conf_matrix <- table(true_labels, result$cluster)
# Map each cluster to the majority true class it overlaps with.
mapping <- apply(conf_matrix, 2, which.max)
result$cluster <- mapping[result$cluster]
cat("Accuracy:", mean(true_labels == result$cluster), "\n")
# Adjusted Rand Index (invariant to label permutations).
ari <- adjustedRandIndex(true_labels, result$cluster)
cat("Adjusted Rand Index (ARI):", ari, "\n")
# Normalized mutual information between two label vectors, computed from
# the contingency table: NMI = I(pred, true) / sqrt(H(pred) * H(true)).
nmi <- function(true_labels, predicted_labels) {
  tab <- table(predicted_labels, true_labels)
  total <- sum(tab)
  # Shannon entropy (base 2) of a probability vector; zero cells contribute 0.
  shannon <- function(p) {
    p <- p[p > 0]
    -sum(p * log2(p))
  }
  joint <- tab / total              # joint distribution p(i, j)
  p_pred <- rowSums(tab) / total    # marginal over predicted labels
  p_true <- colSums(tab) / total    # marginal over true labels
  indep <- outer(p_pred, p_true)    # product of marginals
  nonzero <- joint > 0
  # Mutual information over the non-empty cells only.
  mutual_info <- sum(joint[nonzero] * log2(joint[nonzero] / indep[nonzero]))
  mutual_info / sqrt(shannon(p_pred) * shannon(p_true))
}
# Report NMI for the aligned clustering.
nmi_value <- nmi(true_labels, result$cluster)
cat("Normalized Mutual Information (NMI):", nmi_value, "\n")
# Plot the ELBO trace over iterations.
elbo_df <- data.frame(iteration = seq_along(result$elbo), elbo = result$elbo)
ggplot(elbo_df, aes(x = iteration, y = elbo)) +
  geom_line(color = "steelblue") +
  geom_point(color = "darkred") +
  theme_bw()
# UMAP embedding colored by fitted cluster, shaped by true species.
set.seed(1)
umap_fit <- umap(X)
umap_df <- data.frame(
  UMAP1 = umap_fit$layout[, 1],
  UMAP2 = umap_fit$layout[, 2],
  Cluster = as.factor(result$cluster),
  Species = iris$Species
)
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_shape_manual(values = c(16, 17, 15)) +
  labs(title = "GMM Clustering on Iris Data") +
  theme_minimal()
GMM 模型 VB 算法推导
设 \(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu}\sim\mathcal{N}(\boldsymbol{\mu}\boldsymbol{c}_i,\boldsymbol{I}),\boldsymbol{c}_i\overset{i.i.d.}{\sim} \text{Categorical}(1/K,\cdots,1/K),\boldsymbol{\mu}_k\overset{i.i.d.}{\sim}\mathcal{N}(\boldsymbol{0},\sigma^2\boldsymbol{I}),i=1,\cdots,n,k=1,\cdots,K\),其中 \(\boldsymbol{x}=(\boldsymbol{x}_1,\cdots,\boldsymbol{x}_n)\in\mathbb{R}^{d\times n}\) 为观测变量,随机参数 \(\boldsymbol{c}=(\boldsymbol{c}_1,\cdots,\boldsymbol{c}_n)\in\mathbb{R}^{K\times n}, \boldsymbol{\mu}=(\boldsymbol{\mu}_1,\cdots,\boldsymbol{\mu}_K)\in\mathbb{R}^{d\times K}\),\(\sigma^2,K\) 已知,下面致力于求解随机参数的后验分布 \(p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x})\)。
联合分布 \(p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})=p(\boldsymbol{x}|\boldsymbol{c},\boldsymbol{\mu})p(\boldsymbol{c})p(\boldsymbol{\mu})=\prod_ip(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})p(\boldsymbol{c}_i)\prod_kp(\boldsymbol{\mu}_k)\),
证据 \(p(\boldsymbol{x})=\int p(\boldsymbol{x}|\boldsymbol{c},\boldsymbol{\mu})p(\boldsymbol{c})p(\boldsymbol{\mu}) \mathrm{d}\boldsymbol{c}\mathrm{d}\boldsymbol{\mu}\) 难以求解,使用变分推断方法近似后验分布 \(p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x})\),即 \(\displaystyle q^*(\boldsymbol{c},\boldsymbol{\mu})=\argmin_{q(\boldsymbol{c},\boldsymbol{\mu})\in\mathcal{Q}}\operatorname{KL}(q(\boldsymbol{c},\boldsymbol{\mu}),p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x}))\)。
由于 \(\operatorname{KL}(q(\boldsymbol{c},\boldsymbol{\mu}),p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x}))=-\mathbb{E}_q\log p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})+\mathbb{E}_q \log q(\boldsymbol{c},\boldsymbol{\mu})+\log p(\boldsymbol{x})\),因此问题转化为优化变分下界,即 \(\displaystyle q^*(\boldsymbol{c},\boldsymbol{\mu})=\argmax_{q(\boldsymbol{c},\boldsymbol{\mu})\in\mathcal{Q}}\operatorname{ELBO}(q)=\mathbb{E}_q \log p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})-\mathbb{E}_q \log q(\boldsymbol{c},\boldsymbol{\mu})\)。
设变分分布为 \(q(\boldsymbol{c},\boldsymbol{\mu})=q(\boldsymbol{c})q(\boldsymbol{\mu})\),其中 \(q(\boldsymbol{c})=\prod_i q(\boldsymbol{c}_i;\boldsymbol{\phi}_i)=\prod_i\prod_k\phi_{ik}^{c_{ik}}\),\(q(\boldsymbol{\mu})=\prod_k q(\boldsymbol{\mu}_k;\boldsymbol{m}_k,s_k^2\boldsymbol{I})=\prod_k \mathcal{N}(\boldsymbol{\mu}_k;\boldsymbol{m}_k,s_k^2\boldsymbol{I})\),贝叶斯参数 \(\boldsymbol{\gamma}=(\boldsymbol{m}_k,s_k^2,\phi_{ik},i=1,\cdots,n,k=1,\cdots,K)\)。
注意到 \(\displaystyle\log p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})=\sum_i \big[\log p(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})+\log p(\boldsymbol{c}_i)\big]+\sum_k \log p(\boldsymbol{\mu}_k),\log q(\boldsymbol{c},\boldsymbol{\mu})=\sum_i\log q(\boldsymbol{c}_i)+\sum_k \log q(\boldsymbol{\mu}_k)\),因此 \(\operatorname{ELBO}(q)=\operatorname{ELBO}(\boldsymbol{\gamma})=\mathbb{E}_q\sum_i \log p(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})+\mathbb{E}_q\big\{\sum_i\big(\log p(\boldsymbol{c}_i)-\log q(\boldsymbol{c}_i)\big)+\sum_k\big(\log p(\boldsymbol{\mu}_k)-\log q(\boldsymbol{\mu}_k)\big)\big\}:=I_1+I_2\)。
另外,\(\log p(\boldsymbol{c}_i)=\sum_k c_{ik}\log(1/K),\log q(\boldsymbol{c}_i)=\sum_k c_{ik}\log(\phi_{ik}),\log p(\boldsymbol{\mu}_k)=-\frac{1}{2}(d\log 2\pi+d\log \sigma^2+\boldsymbol{\mu}_k^\top\boldsymbol{\mu}_k/\sigma^2),\log q(\boldsymbol{\mu}_k)=-\frac{1}{2}(d\log 2\pi+d\log s_k^2+(\boldsymbol{\mu}_k-\boldsymbol{m}_k)^\top(\boldsymbol{\mu}_k-\boldsymbol{m}_k)/s_k^2),\log p(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})=-\frac{1}{2}(\boldsymbol{x}_i-\boldsymbol{\mu}\boldsymbol{c}_i)^\top(\boldsymbol{x}_i-\boldsymbol{\mu}\boldsymbol{c}_i)+\operatorname{const}=-\frac{1}{2}\sum_kc_{ik}(\boldsymbol{x}_i-\boldsymbol{\mu}_k)^\top(\boldsymbol{x}_i-\boldsymbol{\mu}_k)+\operatorname{const}\),因此 \(I_2=\sum_k\sum_i \phi_{ik}(\log(1/K)-\log\phi_{ik})+\frac{1}{2}\sum_k(d\log s_k^2+d-d\log \sigma^2-(\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)/\sigma^2),I_1=-\frac{1}{2}\sum_i\big[\boldsymbol{x}_i^\top\boldsymbol{x}_i+\sum_k\phi_{ik}(-2\boldsymbol{x}_i^\top\boldsymbol{m}_k+\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)\big]\)。
至此,问题转化为 \(\displaystyle q^*(\boldsymbol{c},\boldsymbol{\mu})=\argmax_{\boldsymbol{\boldsymbol{\gamma}}}\operatorname{ELBO}(\boldsymbol{\gamma})=-\frac{1}{2}\sum_i\big[\boldsymbol{x}_i^\top\boldsymbol{x}_i+\sum_k\phi_{ik}(-2\boldsymbol{x}_i^\top\boldsymbol{m}_k+\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)\big]+\sum_k\sum_i \phi_{ik}(\log(1/K)-\log\phi_{ik})+\frac{1}{2}\sum_k(d\log s_k^2+d-d\log \sigma^2-(\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)/\sigma^2)\)。
记 \(\boldsymbol{\theta}=(\boldsymbol{m}_k,{s_k^2},k=1,\cdots,K)\),使用块坐标上升算法求解该优化问题:
- \(\boldsymbol{c}\):\(\displaystyle \phi_{ik}^r=\argmax_{\sum_k \phi_{ik}=1}\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},\boldsymbol{\phi})\)
- 设 \(L(\boldsymbol{\phi},\boldsymbol{\lambda})=-\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},\boldsymbol{\phi})+\sum_i \lambda_i(\sum_k \phi_{ik}-1)\),令 \(\displaystyle\frac{\partial L(\boldsymbol{\phi},\boldsymbol{\lambda})}{\partial \phi_{ik}}=\frac{1}{2}(-2\boldsymbol{x}_i^\top\boldsymbol{m}_k^{r-1}+{\boldsymbol{m}_k^{r-1}}^\top\boldsymbol{m}_k^{r-1}+d{s_k^2}^{r-1})-\log(1/K)+1+\log\phi_{ik}+\lambda_i=0\),联立 \(\sum_k \phi_{ik}=1\) 得 \(\displaystyle \phi_{ik}^r=\frac{\operatorname{exp}\{\boldsymbol{x}_i^\top\boldsymbol{m}_k^{r-1}-0.5({\boldsymbol{m}_k^{r-1}}^\top\boldsymbol{m}_k^{r-1}+d{s_k^2}^{r-1})\}}{\sum_j \operatorname{exp}\{\boldsymbol{x}_i^\top\boldsymbol{m}_j^{r-1}-0.5({\boldsymbol{m}_j^{r-1}}^\top\boldsymbol{m}_j^{r-1}+d{s_j^2}^{r-1})\}}\)。
- \(\boldsymbol{\mu}\):\(\displaystyle\boldsymbol{\theta}^r=\argmax_{\boldsymbol{\theta}}\operatorname{ELBO}(\boldsymbol{\theta},\boldsymbol{\phi}^r)\)
- 令 \(\displaystyle\frac{\partial \operatorname{ELBO}(\boldsymbol{\theta},\boldsymbol{\phi}^r)}{\partial \boldsymbol{m}_k}=\sum_i \phi_{ik}^r(\boldsymbol{x}_i-\boldsymbol{m}_k)-\boldsymbol{m}_k/\sigma^2=\boldsymbol{0},\ \frac{\partial \operatorname{ELBO}(\boldsymbol{\theta},\boldsymbol{\phi}^r)}{\partial s_k^2}=\frac{d}{2s_k^2}-\frac{d}{2}\Big(\sum_i \phi_{ik}^r+\frac{1}{\sigma^2}\Big)=0\),
解得 \(\displaystyle\boldsymbol{m}_k^r=\frac{\sum_i \phi_{ik}^r\boldsymbol{x}_i}{1/\sigma^2+\sum_i \phi_{ik}^r},\ {s_k^2}^r=\frac{1}{1/\sigma^2+\sum_i \phi_{ik}^r}\)。
代码实现
# Load required packages: mvtnorm (multivariate normal density), ggplot2
# (plotting), umap (2-D embedding for visualization), mclust (adjusted
# Rand index).
library(mvtnorm)
library(ggplot2)
library(umap)
library(mclust)
data(iris)
X <- as.matrix(iris[, 1:4])
n <- nrow(X) # sample size
d <- ncol(X) # feature dimension
K <- 3 # number of clusters (iris has 3 species)
true_labels <- as.integer(iris$Species)
# GMM clustering via mean-field variational Bayes (CAVI updates).
gmm_vb <- function(X, K, max_iter = 1000, tol = 1e-6) {
  # Variational inference for a GMM with known unit covariance and a
  # N(0, sigma^2 I) prior on each component mean.
  #
  # Args:
  #   X:        n x d numeric data matrix.
  #   K:        number of mixture components.
  #   max_iter: maximum number of coordinate-ascent sweeps.
  #   tol:      stop once the ELBO changes by less than this amount.
  #             (The original default of -1e-6 made `abs(...) < tol`
  #             always FALSE, so early stopping never triggered.)
  #
  # Returns list(cluster, elbo, params = list(phi, m, s_k2)).
  n <- nrow(X)
  d <- ncol(X)
  sigma <- 1                            # prior sd of mu_k (assumed known)
  phi <- matrix(1 / K, n, K)            # variational categorical parameters
  m <- X[sample(n, K), , drop = FALSE]  # variational means
  # Per-component variational variances. (The original kept a single
  # scalar that was overwritten inside the k-loop, so only the last
  # component's variance was ever used.)
  s_k2 <- rep(0.1, K)
  elbo <- numeric(max_iter)

  for (r in seq_len(max_iter)) {
    # Update q(c): log phi_ik is proportional to
    # x_i' m_k - (m_k'm_k + d s_k^2)/2; subtract the row max before
    # exponentiating to guard against overflow/underflow.
    log_phi <- X %*% t(m) -
      matrix(0.5 * (rowSums(m^2) + d * s_k2), n, K, byrow = TRUE)
    log_phi <- log_phi - apply(log_phi, 1, max)
    phi <- exp(log_phi)
    phi <- phi / rowSums(phi)           # normalize

    # Update q(mu): m_k = sum_i phi_ik x_i / (N_k + 1/sigma^2),
    #               s_k^2 = 1 / (N_k + 1/sigma^2).
    Nk <- colSums(phi)
    for (k in seq_len(K)) {
      m[k, ] <- colSums(phi[, k] * X) / (Nk[k] + 1 / sigma^2)
      s_k2[k] <- 1 / (Nk[k] + 1 / sigma^2)
    }

    # ELBO (up to an additive constant independent of the parameters).
    for (k in seq_len(K)) {
      elbo[r] <- elbo[r] +
        0.5 * (d * log(s_k2[k]) + d - d * log(sigma^2) -
               (sum(m[k, ]^2) + d * s_k2[k]) / sigma^2)
      Xc <- sweep(X, 2, m[k, ], "-")
      # 1e-16 guard keeps log finite when a responsibility underflows to 0.
      elbo[r] <- elbo[r] +
        sum(phi[, k] * (log(1 / K) - log(phi[, k] + 1e-16))) -
        0.5 * sum(phi[, k] * (rowSums(Xc^2) + d * s_k2[k]))
    }

    # Convergence check on the ELBO gain.
    if (r > 1 && abs(elbo[r] - elbo[r - 1]) < tol) break
  }

  list(cluster = apply(phi, 1, which.max),
       elbo = elbo[1:r],
       params = list(phi = phi, m = m, s_k2 = s_k2))
}
# Run VB inference on the iris features.
set.seed(1)
result <- gmm_vb(X, K)
# Label alignment: map each cluster to its majority true class.
conf_matrix <- table(true_labels, result$cluster)
mapping <- apply(conf_matrix, 2, which.max)
result$cluster <- mapping[result$cluster]
cat("Accuracy:", mean(true_labels == result$cluster), "\n")
# Adjusted Rand Index (invariant to label permutations).
ari <- adjustedRandIndex(true_labels, result$cluster)
cat("Adjusted Rand Index (ARI):", ari, "\n")
# Normalized mutual information (NMI) between true and predicted labels:
# NMI = I(pred, true) / sqrt(H(pred) * H(true)), logs in base 2.
nmi <- function(true_labels, predicted_labels) {
  cm <- table(predicted_labels, true_labels)
  n_total <- sum(cm)
  p_joint <- cm / n_total          # joint distribution over label pairs
  p_row <- rowSums(p_joint)        # marginal of predicted labels
  p_col <- colSums(p_joint)        # marginal of true labels
  # Mutual information: sum over non-empty cells of p log2(p / (p_r p_c)).
  mi <- 0
  for (i in seq_len(nrow(cm))) {
    for (j in seq_len(ncol(cm))) {
      if (p_joint[i, j] > 0) {
        mi <- mi + p_joint[i, j] * log2(p_joint[i, j] / (p_row[i] * p_col[j]))
      }
    }
  }
  # Shannon entropies of the two marginal label distributions.
  ent <- function(p) -sum(p[p > 0] * log2(p[p > 0]))
  mi / sqrt(ent(p_row) * ent(p_col))
}
# Report NMI for the aligned clustering.
nmi_value <- nmi(true_labels, result$cluster)
cat("Normalized Mutual Information (NMI):", nmi_value, "\n")
# Plot the ELBO trace over iterations.
elbo_df <- data.frame(iteration = seq_along(result$elbo), elbo = result$elbo)
ggplot(elbo_df, aes(x = iteration, y = elbo)) +
  geom_line(color = "steelblue") +
  geom_point(color = "darkred") +
  theme_bw()
# UMAP embedding colored by fitted cluster, shaped by true species.
set.seed(1)
umap_fit <- umap(X)
umap_df <- data.frame(
  UMAP1 = umap_fit$layout[, 1],
  UMAP2 = umap_fit$layout[, 2],
  Cluster = as.factor(result$cluster),
  Species = iris$Species
)
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_shape_manual(values = c(16, 17, 15)) +
  labs(title = "GMM Clustering on Iris Data") +
  theme_minimal()

浙公网安备 33010602011771号