Gaussian Mixture Models (GMM)

Derivation of the EM Algorithm for the GMM

\(\boldsymbol{x}_i|\boldsymbol{c}_i\sim\mathcal{N}(\boldsymbol{\mu}_{\boldsymbol{c}_i},\boldsymbol{\Sigma}_{\boldsymbol{c}_i}),\ \boldsymbol{c}_i\sim \text{Categorical}(\pi_1,\cdots,\pi_K)\), where \(\boldsymbol{x}=(\boldsymbol{x}_1,\cdots,\boldsymbol{x}_n)\in\mathbb{R}^{d\times n}\) are the observed variables and \(\boldsymbol{c}=(\boldsymbol{c}_1,\cdots,\boldsymbol{c}_n)\in\{0,1\}^{K\times n}\) are the latent one-hot indicators (we write \(\boldsymbol{c}_i=\boldsymbol{1}_k\), the \(k\)-th standard basis vector, when observation \(i\) belongs to component \(k\)); the component means are \(\boldsymbol{\mu}=(\boldsymbol{\mu}_1,\cdots,\boldsymbol{\mu}_K)\in\mathbb{R}^{d\times K}\). The model parameters are therefore \(\boldsymbol{\theta}=(\pi_k,\boldsymbol{\mu}_k,\boldsymbol{\Sigma}_k,k=1,\cdots,K)\).
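Marginalizing out \(\boldsymbol{c}_i\) gives the familiar mixture density, which is what the likelihood below sums over:

\[p(\boldsymbol{x}_i;\boldsymbol{\theta})=\sum_{k=1}^K \pi_k\,\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_k,\boldsymbol{\Sigma}_k).\]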

For any density \(q(\boldsymbol{c})\), Jensen's inequality lower-bounds the observed-data log-likelihood: \(I_o(\boldsymbol{\theta},\boldsymbol{x})=\log p(\boldsymbol{x};\boldsymbol{\theta})=\log\int p(\boldsymbol{x},\boldsymbol{c};\boldsymbol{\theta})\mathrm{d}\boldsymbol{c}\geqslant\mathbb{E}_q\log\big\{\frac{p(\boldsymbol{c},\boldsymbol{x};\boldsymbol{\theta})}{q(\boldsymbol{c})}\big\}:=\operatorname{ELBO}(\boldsymbol{\theta},q)=\mathbb{E}_q\log p(\boldsymbol{c},\boldsymbol{x};\boldsymbol{\theta})-\mathbb{E}_q\log q(\boldsymbol{c})=-\operatorname{KL}(q(\boldsymbol{c})\,\|\,p(\boldsymbol{c}|\boldsymbol{x};\boldsymbol{\theta}))+\log p(\boldsymbol{x};\boldsymbol{\theta})\)
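The last identity shows the bound is tight exactly when the KL term vanishes, i.e. when \(q\) equals the posterior; this is what makes the E-step below optimal:

\[\operatorname{ELBO}\big(\boldsymbol{\theta},p(\boldsymbol{c}|\boldsymbol{x};\boldsymbol{\theta})\big)=\log p(\boldsymbol{x};\boldsymbol{\theta}).\]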

Take \(q\) to be the posterior of \(\boldsymbol{c}\): \(q(\boldsymbol{c}_i=\boldsymbol{1}_k)=p(\boldsymbol{c}_i=\boldsymbol{1}_k|\boldsymbol{x};\boldsymbol{\theta})=p(\boldsymbol{c}_i=\boldsymbol{1}_k|\boldsymbol{x}_i;\boldsymbol{\theta}):=q_{ik}\). Then \(\displaystyle\operatorname{ELBO}(\boldsymbol{\theta},q)=\sum_i\sum_k q_{ik}\log p(\boldsymbol{x}_i,\boldsymbol{c}_i=\boldsymbol{1}_k;\boldsymbol{\theta})-\sum_i\sum_k q_{ik}\log q_{ik}=\sum_i\sum_k q_{ik}\log (\pi_k\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_k,\boldsymbol{\Sigma}_k))-\sum_i\sum_k q_{ik}\log q_{ik}\), and we solve \((\hat{\boldsymbol{\theta}},\hat{q})=\argmax_{\sum_k \pi_k=1,\sum_k q_{ik}=1}\operatorname{ELBO}(\boldsymbol{\theta},q)\).

The EM algorithm solves this optimization problem by block coordinate ascent:

  • E-step: \(\displaystyle q^r=\argmax_{\sum_k q_{ik}=1}\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},q)\)
    • \(L(q,\boldsymbol{\lambda})=-\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},q)+\sum_i \lambda_i(\sum_k q_{ik}-1)\); setting \(\frac{\partial L(q,\boldsymbol{\lambda})}{\partial q_{ik}}=-\log p(\boldsymbol{x}_i,\boldsymbol{c}_i=\boldsymbol{1}_k;\boldsymbol{\theta}^{r-1})+1+\log q_{ik}+\lambda_i=0\) and combining with \(\sum_k q_{ik}=1\) gives \(\displaystyle q_{ik}^r=\frac{p(\boldsymbol{x}_i,\boldsymbol{c}_i=\boldsymbol{1}_k;\boldsymbol{\theta}^{r-1})}{\sum_j p(\boldsymbol{x}_i,\boldsymbol{c}_i= \boldsymbol{1}_{j};\boldsymbol{\theta}^{r-1})}=\frac{\pi_k^{r-1}\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_k^{r-1},\boldsymbol{\Sigma}_k^{r-1})}{\sum_j \pi_j^{r-1}\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_j^{r-1},\boldsymbol{\Sigma}_j^{r-1})}\) (a numerically stable way to compute this softmax is sketched after the derivation)
  • M-step: \(\displaystyle\boldsymbol{\theta}^r=\argmax_{\sum_k \pi_k=1}\operatorname{ELBO}(\boldsymbol{\theta},q^r)\)
    • \(L(\boldsymbol{\theta},\boldsymbol{\lambda})=-\operatorname{ELBO}(\boldsymbol{\theta},q^r)+ \lambda(\sum_k \pi_{k}-1)\); setting

\[\frac{\partial L(\boldsymbol{\theta},\boldsymbol{\lambda})}{\partial\pi_{k}}=-\frac{\sum_i q_{ik}^r}{\pi_k}+\lambda=0,\\ \frac{\partial L(\boldsymbol{\theta},\boldsymbol{\lambda})}{\partial\boldsymbol{\mu}_k}=-\sum_iq_{ik}^r\boldsymbol{\Sigma}_k^{-1}(\boldsymbol{x}_i-\boldsymbol{\mu}_k)=\boldsymbol{0},\\ \frac{\partial L(\boldsymbol{\theta},\boldsymbol{\lambda})}{\partial\boldsymbol{\Sigma}_k}=-\frac{1}{2}\sum_iq_{ik}^r\left[\boldsymbol{\Sigma}_k^{-1}(\boldsymbol{x}_i-\boldsymbol{\mu}_k)(\boldsymbol{x}_i-\boldsymbol{\mu}_k)^\top\boldsymbol{\Sigma}_k^{-1} -\boldsymbol{\Sigma}_k^{-1}\right]=\boldsymbol{0}.\]

and combining with \(\sum_k \pi_k=1\) yields

\[\lambda=n,\quad \pi_k^r=\frac{\sum_iq_{ik}^r}{n},\\ \boldsymbol{\mu}^r_k=\frac{\sum_iq_{ik}^r\boldsymbol{x}_i}{\sum_iq_{ik}^r},\\ \boldsymbol{\Sigma}_k^r=\frac{\sum_iq_{ik}^r(\boldsymbol{x}_i-\boldsymbol{\mu}_k^r)(\boldsymbol{x}_i-\boldsymbol{\mu}^r_k)^\top}{\sum_iq_{ik}^r}.\\ \]
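In floating point, the weights \(\pi_k\mathcal{N}(\boldsymbol{x}_i;\boldsymbol{\mu}_k,\boldsymbol{\Sigma}_k)\) can underflow for points far from every component, so the E-step is often evaluated in log space with the log-sum-exp trick. A minimal sketch (the helper and its `log_dens` input are hypothetical, not part of the implementation below, which uses the direct form):

# Hypothetical helper: E-step responsibilities via the log-sum-exp trick.
# log_dens: n x K matrix holding log(pi_k) + log N(x_i; mu_k, Sigma_k),
# e.g. filled with mvtnorm::dmvnorm(..., log = TRUE).
e_step_stable <- function(log_dens) {
  m <- apply(log_dens, 1, max)   # row-wise maximum
  w <- exp(log_dens - m)         # subtract per row before exponentiating (R recycles m down the rows)
  w / rowSums(w)                 # normalized responsibilities q_ik
}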

Code Implementation

# Load required packages
library(mvtnorm)
library(ggplot2)
library(umap)
library(mclust)

data(iris)
X <- as.matrix(iris[, 1:4])
n <- nrow(X)       # sample size
d <- ncol(X)       # dimension
K <- 3             # number of clusters
true_labels <- as.integer(iris$Species)

# GMM clustering via the EM algorithm
gmm_em <- function(X, K, max_iter = 100, tol = 1e-6) {
  n <- nrow(X)
  d <- ncol(X)
  
  # Initialize parameters
  Pi <- rep(1/K, K)                          # uniform mixing weights
  mu <- X[sample(n, K), ]                    # K random rows of X as initial means
  Sigma <- array(diag(d), dim = c(d, d, K))  # identity covariances
  elbo <- numeric(max_iter)
  
  for (r in 1:max_iter) {
    # E-step: compute responsibilities q^r
    q <- matrix(1/K, n, K)
    for (k in 1:K) {
      q[, k] <- Pi[k] * dmvnorm(X, mu[k, ], Sigma[, , k])
    }
    q <- q / rowSums(q)  # normalize over components
    
    # M-step: update parameters
    # update pi_k
    Nk <- colSums(q)
    Pi <- Nk / n
    
    # update mu_k
    for (k in 1:K) {
      mu[k, ] <- colSums(q[, k] * X) / Nk[k]
    }
    
    # update Sigma_k
    for (k in 1:K) {
      X_centered <- sweep(X, 2, mu[k, ], "-")
      Sigma[, , k] <- (t(X_centered) %*% (X_centered * q[, k])) / Nk[k]
      Sigma[, , k] <- Sigma[, , k] + diag(1e-6, d)  # small ridge to keep Sigma_k positive definite
    }
    
    # Compute the ELBO for monitoring
    for (i in 1:n){
      for (k in 1:K){
        elbo[r] <- elbo[r] + q[i, k] * (log(Pi[k]) + mvtnorm::dmvnorm(X[i, ], mu[k, ], Sigma[, , k], log = TRUE)) - q[i, k] * log(q[i, k] + 1e-16)
      }
    }
    
    # Convergence check
    if (r > 1 && abs(elbo[r] - elbo[r-1]) < tol) break
  }
  
  list(cluster = apply(q, 1, which.max),
       elbo = elbo[1:r],
       params = list(pi = Pi, mu = mu, Sigma = Sigma))
}

# Run the algorithm
set.seed(1)
result <- gmm_em(X, K)

# Align cluster labels with the true classes via a majority-vote mapping
conf_matrix <- table(true_labels, result$cluster)
mapping <- apply(conf_matrix, 2, which.max)
result$cluster <- mapping[result$cluster]

cat("Accuracy:", sum(true_labels==result$cluster)/n, "\n")
# Compute the Adjusted Rand Index (ARI)
ari <- adjustedRandIndex(true_labels, result$cluster)
cat("Adjusted Rand Index (ARI):", ari, "\n")

# Compute the Normalized Mutual Information (NMI) by hand
nmi <- function(true_labels, predicted_labels) {
  # Confusion matrix of predicted vs. true labels
  conf_matrix <- table(predicted_labels, true_labels)
  
  # Entropy of a discrete distribution
  entropy <- function(p) {
    p <- p[p > 0]
    return(-sum(p * log2(p)))
  }
  
  # Mutual information from the confusion matrix
  mutual_information <- function(conf_matrix) {
    n <- sum(conf_matrix)
    I <- 0
    for (i in 1:nrow(conf_matrix)) {
      for (j in 1:ncol(conf_matrix)) {
        if (conf_matrix[i, j] > 0) {
          I <- I + conf_matrix[i, j] / n * log2(n * conf_matrix[i, j] / (rowSums(conf_matrix)[i] * colSums(conf_matrix)[j]))
        }
      }
    }
    return(I)
  }
  
  # Mutual information
  I <- mutual_information(conf_matrix)
  
  # Marginal entropies
  H_X <- entropy(rowSums(conf_matrix) / sum(conf_matrix))
  H_Y <- entropy(colSums(conf_matrix) / sum(conf_matrix))
  
  # Normalized mutual information
  nmi_value <- I / sqrt(H_X * H_Y)
  
  return(nmi_value)
}

nmi_value <- nmi(true_labels, result$cluster)
cat("Normalized Mutual Information (NMI):", nmi_value, "\n")

# Plot the ELBO trajectory
elbo_df <- data.frame(iteration = 1:length(result$elbo), elbo = result$elbo)
ggplot(elbo_df, aes(x = iteration, y = elbo)) +
  geom_line(color = "steelblue") +
  geom_point(color = "darkred") +
  theme_bw()

# UMAP visualization of the clustering
set.seed(1)
umap_fit <- umap(X)
umap_df <- data.frame(UMAP1 = umap_fit$layout[,1], 
                      UMAP2 = umap_fit$layout[,2],
                      Cluster = as.factor(result$cluster),
                      Species = iris$Species)

ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_shape_manual(values = c(16, 17, 15)) +
  labs(title = "GMM Clustering on Iris Data") +
  theme_minimal()

Derivation of the VB Algorithm for the GMM

\(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu}\sim\mathcal{N}(\boldsymbol{\mu}\boldsymbol{c}_i,\boldsymbol{I}),\ \boldsymbol{c}_i\overset{i.i.d.}{\sim} \text{Categorical}(1/K,\cdots,1/K),\ \boldsymbol{\mu}_k\overset{i.i.d.}{\sim}\mathcal{N}(\boldsymbol{0},\sigma^2\boldsymbol{I}),\ i=1,\cdots,n,\ k=1,\cdots,K\), where \(\boldsymbol{x}=(\boldsymbol{x}_1,\cdots,\boldsymbol{x}_n)\in\mathbb{R}^{d\times n}\) are the observed variables, and the random parameters are \(\boldsymbol{c}=(\boldsymbol{c}_1,\cdots,\boldsymbol{c}_n)\in\{0,1\}^{K\times n}\) (one-hot columns, so \(\boldsymbol{\mu}\boldsymbol{c}_i=\boldsymbol{\mu}_k\) when \(\boldsymbol{c}_i=\boldsymbol{1}_k\)) and \(\boldsymbol{\mu}=(\boldsymbol{\mu}_1,\cdots,\boldsymbol{\mu}_K)\in\mathbb{R}^{d\times K}\); \(\sigma^2\) and \(K\) are known. We now seek the posterior distribution \(p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x})\).

The joint distribution is \(p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})=p(\boldsymbol{x}|\boldsymbol{c},\boldsymbol{\mu})p(\boldsymbol{c})p(\boldsymbol{\mu})=\prod_ip(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})p(\boldsymbol{c}_i)\prod_kp(\boldsymbol{\mu}_k)\).
The evidence \(p(\boldsymbol{x})=\int p(\boldsymbol{x}|\boldsymbol{c},\boldsymbol{\mu})p(\boldsymbol{c})p(\boldsymbol{\mu}) \mathrm{d}\boldsymbol{c}\mathrm{d}\boldsymbol{\mu}\) is intractable, so we use variational inference to approximate the posterior \(p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x})\), i.e. \(\displaystyle q^*(\boldsymbol{c},\boldsymbol{\mu})=\argmin_{q(\boldsymbol{c},\boldsymbol{\mu})\in\mathcal{Q}}\operatorname{KL}(q(\boldsymbol{c},\boldsymbol{\mu})\,\|\,p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x}))\).

Since \(\operatorname{KL}(q(\boldsymbol{c},\boldsymbol{\mu})\,\|\,p(\boldsymbol{c},\boldsymbol{\mu}|\boldsymbol{x}))=-\mathbb{E}_q\log p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})+\mathbb{E}_q \log q(\boldsymbol{c},\boldsymbol{\mu})+\log p(\boldsymbol{x})\), the problem is equivalent to maximizing the evidence lower bound: \(\displaystyle q^*(\boldsymbol{c},\boldsymbol{\mu})=\argmax_{q(\boldsymbol{c},\boldsymbol{\mu})\in\mathcal{Q}}\operatorname{ELBO}(q)\), where \(\operatorname{ELBO}(q)=\mathbb{E}_q \log p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})-\mathbb{E}_q \log q(\boldsymbol{c},\boldsymbol{\mu})\).

Take the mean-field variational family \(q(\boldsymbol{c},\boldsymbol{\mu})=q(\boldsymbol{c})q(\boldsymbol{\mu})\), where \(q(\boldsymbol{c})=\prod_i q(\boldsymbol{c}_i;\boldsymbol{\phi}_i)=\prod_i\prod_k\phi_{ik}^{c_{ik}}\) and \(q(\boldsymbol{\mu})=\prod_k q(\boldsymbol{\mu}_k;\boldsymbol{m}_k,s_k^2\boldsymbol{I})=\prod_k \mathcal{N}(\boldsymbol{\mu}_k;\boldsymbol{m}_k,s_k^2\boldsymbol{I})\), with variational parameters \(\boldsymbol{\gamma}=(\boldsymbol{m}_k,s_k^2,\phi_{ik},i=1,\cdots,n,k=1,\cdots,K)\).

Note that \(\displaystyle\log p(\boldsymbol{x},\boldsymbol{c},\boldsymbol{\mu})=\sum_i \big[\log p(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})+\log p(\boldsymbol{c}_i)\big]+\sum_k \log p(\boldsymbol{\mu}_k)\) and \(\log q(\boldsymbol{c},\boldsymbol{\mu})=\sum_i\log q(\boldsymbol{c}_i)+\sum_k \log q(\boldsymbol{\mu}_k)\), so \(\operatorname{ELBO}(q)=\operatorname{ELBO}(\boldsymbol{\gamma})=\mathbb{E}_q\sum_i \log p(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})+\mathbb{E}_q\big\{\sum_i\big(\log p(\boldsymbol{c}_i)-\log q(\boldsymbol{c}_i)\big)+\sum_k\big(\log p(\boldsymbol{\mu}_k)-\log q(\boldsymbol{\mu}_k)\big)\big\}:=I_1+I_2\).

Moreover, \(\log p(\boldsymbol{c}_i)=\sum_k c_{ik}\log(1/K)\), \(\log q(\boldsymbol{c}_i)=\sum_k c_{ik}\log\phi_{ik}\), \(\log p(\boldsymbol{\mu}_k)=-\frac{1}{2}(d\log 2\pi+d\log \sigma^2+\boldsymbol{\mu}_k^\top\boldsymbol{\mu}_k/\sigma^2)\), \(\log q(\boldsymbol{\mu}_k)=-\frac{1}{2}(d\log 2\pi+d\log s_k^2+(\boldsymbol{\mu}_k-\boldsymbol{m}_k)^\top(\boldsymbol{\mu}_k-\boldsymbol{m}_k)/s_k^2)\), and \(\log p(\boldsymbol{x}_i|\boldsymbol{c}_i,\boldsymbol{\mu})=-\frac{1}{2}(\boldsymbol{x}_i-\boldsymbol{\mu}\boldsymbol{c}_i)^\top(\boldsymbol{x}_i-\boldsymbol{\mu}\boldsymbol{c}_i)+\operatorname{const}=-\frac{1}{2}\sum_kc_{ik}(\boldsymbol{x}_i-\boldsymbol{\mu}_k)^\top(\boldsymbol{x}_i-\boldsymbol{\mu}_k)+\operatorname{const}\). Hence, up to additive constants, \(I_2=\sum_k\sum_i \phi_{ik}(\log(1/K)-\log\phi_{ik})+\frac{1}{2}\sum_k(d\log s_k^2+d-d\log \sigma^2-(\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)/\sigma^2)\) and \(I_1=-\frac{1}{2}\sum_i\big[\boldsymbol{x}_i^\top\boldsymbol{x}_i+\sum_k\phi_{ik}(-2\boldsymbol{x}_i^\top\boldsymbol{m}_k+\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)\big]\).

The problem thus reduces to \(\displaystyle \boldsymbol{\gamma}^*=\argmax_{\boldsymbol{\gamma}}\operatorname{ELBO}(\boldsymbol{\gamma})\), where \(\operatorname{ELBO}(\boldsymbol{\gamma})=-\frac{1}{2}\sum_i\big[\boldsymbol{x}_i^\top\boldsymbol{x}_i+\sum_k\phi_{ik}(-2\boldsymbol{x}_i^\top\boldsymbol{m}_k+\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)\big]+\sum_k\sum_i \phi_{ik}(\log(1/K)-\log\phi_{ik})+\frac{1}{2}\sum_k(d\log s_k^2+d-d\log \sigma^2-(\boldsymbol{m}_k^\top\boldsymbol{m}_k+ds_k^2)/\sigma^2)\).

\(\boldsymbol{\theta}=(\boldsymbol{m}_k,{s_k^2},k=1,\cdots,K)\),使用块坐标上升算法求解该优化问题:

  • Update for \(\boldsymbol{c}\): \(\displaystyle \boldsymbol{\phi}^r=\argmax_{\sum_k \phi_{ik}=1}\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},\boldsymbol{\phi})\)
    • \(L(\boldsymbol{\phi},\boldsymbol{\lambda})=-\operatorname{ELBO}(\boldsymbol{\theta}^{r-1},\boldsymbol{\phi})+\sum_i \lambda_i(\sum_k \phi_{ik}-1)\); setting \(\displaystyle\frac{\partial L(\boldsymbol{\phi},\boldsymbol{\lambda})}{\partial \phi_{ik}}=\frac{1}{2}(-2\boldsymbol{x}_i^\top\boldsymbol{m}_k^{r-1}+{\boldsymbol{m}_k^{r-1}}^\top\boldsymbol{m}_k^{r-1}+d{s_k^2}^{r-1})-\log(1/K)+1+\log\phi_{ik}+\lambda_i=0\) and combining with \(\sum_k \phi_{ik}=1\) gives \(\displaystyle \phi_{ik}^r=\frac{\exp\{\boldsymbol{x}_i^\top\boldsymbol{m}_k^{r-1}-0.5({\boldsymbol{m}_k^{r-1}}^\top\boldsymbol{m}_k^{r-1}+d{s_k^2}^{r-1})\}}{\sum_j \exp\{\boldsymbol{x}_i^\top\boldsymbol{m}_j^{r-1}-0.5({\boldsymbol{m}_j^{r-1}}^\top\boldsymbol{m}_j^{r-1}+d{s_j^2}^{r-1})\}}\)
  • Update for \(\boldsymbol{\mu}\): \(\displaystyle\boldsymbol{\theta}^r=\argmax_{\boldsymbol{\theta}}\operatorname{ELBO}(\boldsymbol{\theta},\boldsymbol{\phi}^r)\), setting

\[\frac{\partial \operatorname{ELBO}(\boldsymbol{\theta},\boldsymbol{\phi}^r)}{\partial\boldsymbol{m}_k}=-\frac{1}{2}\sum_i \phi_{ik}^r(-2\boldsymbol{x}_i+2\boldsymbol{m}_k)-\frac{2\boldsymbol{m}_k}{2\sigma^2}=\boldsymbol{0},\\ \frac{\partial \operatorname{ELBO}(\boldsymbol{\theta},\boldsymbol{\phi}^r)}{\partial s_k^2}=-\frac{1}{2}\sum_i\phi_{ik}^rd+\frac{1}{2}(\frac{d}{s_k^2}-\frac{d}{\sigma^2})=0.\]

Solving these gives

\[\displaystyle\boldsymbol{m}^r_k=\frac{\sum_i\phi_{ik}^r\boldsymbol{x}_i}{\sum_i\phi_{ik}^r+1/\sigma^2},\\ {s_k^2}^r=\frac{1}{\sum_i\phi_{ik}^r+1/\sigma^2}.\\ \]
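Both updates are instances of the general mean-field coordinate ascent (CAVI) identity: holding all other factors fixed, the optimal factor for a block \(z_j\) satisfies

\[q_j^*(z_j)\propto \exp\big\{\mathbb{E}_{q_{-j}}\log p(z_j,\boldsymbol{z}_{-j},\boldsymbol{x})\big\},\]

which for \(\boldsymbol{c}_i\) reproduces the softmax form of \(\phi_{ik}^r\) above. Note also that as \(\sigma^2\to\infty\) (a flat prior on the means), \(\boldsymbol{m}_k^r\to\sum_i\phi_{ik}^r\boldsymbol{x}_i\,/\sum_i\phi_{ik}^r\) and \({s_k^2}^r\to 1/\sum_i\phi_{ik}^r\), so the procedure approaches a soft k-means with unit observation covariance.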

Code Implementation

# Load required packages
library(mvtnorm)
library(ggplot2)
library(umap)
library(mclust)

data(iris)
X <- as.matrix(iris[, 1:4])
n <- nrow(X)       # sample size
d <- ncol(X)       # dimension
K <- 3             # number of clusters
true_labels <- as.integer(iris$Species)

# GMM clustering via the VB (CAVI) algorithm
gmm_vb <- function(X, K, max_iter = 1000, tol = 1e-6) {
  n <- nrow(X)
  d <- ncol(X)
  
  # Initialize parameters
  sigma <- 1                      # prior std of the component means (sigma^2 known)
  phi <- matrix(1/K, n, K)
  m <- X[sample(n, K), ]          # K random rows of X as initial means
  s_k2 <- rep(0.1, K)             # per-component variational variances (one per k)
  elbo <- numeric(max_iter)
  
  for (r in 1:max_iter) {
    # Update phi^r; work in log space and subtract the row maximum before
    # exponentiating so the softmax cannot overflow
    log_phi <- matrix(0, n, K)
    for (k in 1:K) {
      log_phi[, k] <- X %*% m[k, ] - as.numeric(0.5 * (t(m[k, ]) %*% m[k, ] + d * s_k2[k]))
    }
    phi <- exp(log_phi - apply(log_phi, 1, max))
    phi <- phi / rowSums(phi)  # normalize over components
    
    # Update m_k and s_k2
    Nk <- colSums(phi)
    for (k in 1:K) {
      m[k, ] <- colSums(phi[, k] * X) / (Nk[k] + 1/sigma^2)
      s_k2[k] <- 1 / (Nk[k] + 1/sigma^2)
    }
    
    # Compute the ELBO (up to additive constants) for monitoring
    for (k in 1:K){
      elbo[r] <- elbo[r] + 0.5*(d*log(s_k2[k]) + d - d*log(sigma^2) - ((t(m[k, ]) %*% m[k, ] + d*s_k2[k])/sigma^2))
      for (i in 1:n){
        elbo[r] <- elbo[r] + phi[i, k]*(log(1/K) - log(phi[i, k] + 1e-16)) - 0.5*(phi[i, k]*(t(X[i, ] - m[k, ]) %*% (X[i, ] - m[k, ]) + d*s_k2[k]))
      }
    }
    
    # Convergence check
    if (r > 1 && abs(elbo[r] - elbo[r-1]) < tol) break
  }
  
  list(cluster = apply(phi, 1, which.max),
       elbo = elbo[1:r],
       params = list(phi = phi, m = m, s_k2 = s_k2))
}
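# Optional (an assumption, not part of the original script): the VB model fixes
# the observation covariance to the identity, so standardizing the features
# first can make that assumption more reasonable, e.g.
#   Xs <- scale(X); result <- gmm_vb(Xs, K)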

# Run the algorithm
set.seed(1)
result <- gmm_vb(X, K)

# Align cluster labels with the true classes via a majority-vote mapping
conf_matrix <- table(true_labels, result$cluster)
mapping <- apply(conf_matrix, 2, which.max)
result$cluster <- mapping[result$cluster]

cat("Accuracy:", sum(true_labels==result$cluster)/n, "\n")
# Compute the Adjusted Rand Index (ARI)
ari <- adjustedRandIndex(true_labels, result$cluster)
cat("Adjusted Rand Index (ARI):", ari, "\n")

# Reuse the nmi() helper defined in the EM section above.

nmi_value <- nmi(true_labels, result$cluster)
cat("Normalized Mutual Information (NMI):", nmi_value, "\n")

# Plot the ELBO trajectory
elbo_df <- data.frame(iteration = 1:length(result$elbo), elbo = result$elbo)
ggplot(elbo_df, aes(x = iteration, y = elbo)) +
  geom_line(color = "steelblue") +
  geom_point(color = "darkred") +
  theme_bw()

# UMAP visualization of the clustering
set.seed(1)
umap_fit <- umap(X)
umap_df <- data.frame(UMAP1 = umap_fit$layout[,1], 
                      UMAP2 = umap_fit$layout[,2],
                      Cluster = as.factor(result$cluster),
                      Species = iris$Species)

ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_shape_manual(values = c(16, 17, 15)) +
  labs(title = "GMM Clustering on Iris Data") +
  theme_minimal()