# 主成分分析PCA数据降维原理及python应用（葡萄酒案例分析）

## 1、认识PCA

### （2）方法步骤

1. 标准化d维数据集
2. 构建协方差矩阵。
3. 将协方差矩阵分解为特征向量和特征值。
4. 对特征值进行降序排列，相应的特征向量作为整体降序。
5. 选择k个最大特征值的特征向量，$k \ll d$
6. 根据提取的k个特征向量构造投影矩阵$W$
7. d维数据经过$W$变换获得k维。

## 2、提取主成分

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the UCI wine dataset from the server.
# Column 0 is the class label (1/2/3); columns 1-13 are the 13 features.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None)

# Split the data, train:test = 7:3, stratified on the class label so the
# class proportions are preserved in both splits.
x, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=0)

# Standardize the features (zero mean, unit variance).
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train)
# BUG FIX: apply transform (not fit_transform) to the test set, so the test
# data is scaled with statistics learned from the training set only —
# re-fitting on the test set leaks test statistics into preprocessing.
x_test_std = sc.transform(x_test)

$\sigma_{jk}=\frac{1}{n}\sum_{i=1}^{n}(x_{j}^{(i)}-\mu_{j})(x_{k}^{(i)}-\mu_{k})$

$\Sigma =\begin{bmatrix} \sigma_{1}^{2} & \sigma_{12} & \sigma_{13}\\ \sigma_{21} & \sigma_{2}^{2} & \sigma_{23}\\ \sigma_{31} & \sigma_{32} & \sigma_{3}^{2} \end{bmatrix}$

# Build the covariance matrix of the standardized training features
# (rows of x_train_std.T are features) and eigendecompose it.
cov_matrix = np.cov(x_train_std.T)
# IMPROVEMENT: use eigh instead of eig — the covariance matrix is symmetric,
# so eigh guarantees real eigenvalues (returned in ascending order) and is
# numerically more stable. All downstream code sorts the eigenvalues itself,
# so the ordering difference is harmless.
eigen_val, eigen_vec = np.linalg.eigh(cov_matrix)
# print("values\n ", eigen_val, "\nvector\n ", eigen_vec)  # uncomment to inspect

## 3、主成分方差可视化

$\frac{\lambda_{j}}{\sum_{j=1}^{d}\lambda_{j}}$

# Explained-variance ratio of each principal component, largest first:
# ratio_j = lambda_j / sum(lambda).
total_var = sum(eigen_val)
ordered_vals = sorted(eigen_val, reverse=True)
var_exp = [val / total_var for val in ordered_vals]
# print(var_exp)
cum_var_exp = np.cumsum(var_exp)  # running total of the ratios

plt.rcParams['font.sans-serif'] = ['SimHei']  # enable Chinese glyphs in labels
# Bars: individual explained variance; step line: cumulative explained variance.
plt.bar(range(1, 14), var_exp, alpha=0.5, align='center', label='独立解释方差')
plt.step(range(1, 14), cum_var_exp, where='mid', label='累加解释方差')
plt.ylabel("解释方差率")
plt.xlabel("主成分索引")
plt.legend(loc='right')
plt.show()

## 4、特征变换

# Feature transformation: pair each |eigenvalue| with its eigenvector and
# sort the pairs in descending order of eigenvalue magnitude.
abs_vals = np.abs(eigen_val)
eigen_pairs = sorted(zip(abs_vals, eigen_vec.T), key=lambda pair: pair[0], reverse=True)

# Projection matrix W (13x2): the two eigenvectors with the largest eigenvalues,
# stacked as columns.
w = np.column_stack([eigen_pairs[0][1], eigen_pairs[1][1]])

得到的投影矩阵W是13×2维的矩阵，如下：

# Project the standardized 13-D training data onto the 2-D PCA subspace.
x_train_pca = x_train_std @ w

## 5、数据分类结果

# Scatter plot of the PCA-projected training data: one color/marker per class.
color = ['r', 'g', 'b']
marker = ['s', 'x', 'o']
for l, c, m in zip(np.unique(y_train), color, marker):
    # BUG FIX: the scatter call must be indented inside the for loop —
    # the original (unindented) version raises IndentationError.
    plt.scatter(x_train_pca[y_train == l, 0],
                x_train_pca[y_train == l, 1],
                c=c, label=l, marker=m)
plt.title('Result')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(loc='lower left')
plt.show()

## 6、完整代码

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def main():
    """Load the wine dataset, run PCA step by step, and plot the 2-D projection.

    Steps: load data -> stratified 7:3 split -> standardize -> covariance
    eigendecomposition -> explained-variance ratios -> project onto the two
    leading eigenvectors -> scatter plot colored by class.
    """
    # BUG FIX: the data-loading line was missing, so df_wine was undefined.
    # Column 0 is the class label; columns 1-13 are the 13 features.
    df_wine = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
        header=None)

    # Split the data, train:test = 7:3, stratified on the class label.
    x, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, stratify=y, random_state=0)

    # Standardize the features to zero mean / unit variance.
    sc = StandardScaler()
    x_train_std = sc.fit_transform(x_train)
    # BUG FIX: transform (not fit_transform) the test set with the scaler
    # fitted on the training set, to avoid test-set leakage.
    x_test_std = sc.transform(x_test)
    # print(x_train_std)

    # Covariance matrix of the standardized training data -> eigendecomposition.
    cov_matrix = np.cov(x_train_std.T)
    eigen_val, eigen_vec = np.linalg.eig(cov_matrix)
    # print("values\n ", eigen_val, "\nvector\n ", eigen_vec)

    # Explained-variance ratio of each component, descending.
    tot = sum(eigen_val)
    var_exp = [(i / tot) for i in sorted(eigen_val, reverse=True)]
    # print(var_exp)

    # Feature transformation: sort (|eigenvalue|, eigenvector) pairs descending
    # and keep the two leading eigenvectors as the projection matrix W (13x2).
    eigen_pairs = [(np.abs(eigen_val[i]), eigen_vec[:, i]) for i in range(len(eigen_val))]
    eigen_pairs.sort(key=lambda k: k[0], reverse=True)
    w = np.hstack((eigen_pairs[0][1][:, np.newaxis], eigen_pairs[1][1][:, np.newaxis]))
    # Project the training data onto the 2-D PCA subspace.
    x_train_pca = x_train_std.dot(w)

    # Scatter plot: one color/marker per class label.
    # BUG FIX: the whole function body (and this loop body) was unindented
    # in the original, which is a syntax error.
    color = ['r', 'g', 'b']
    marker = ['s', 'x', 'o']
    for l, c, m in zip(np.unique(y_train), color, marker):
        plt.scatter(x_train_pca[y_train == l, 0],
                    x_train_pca[y_train == l, 1],
                    c=c, label=l, marker=m)
    plt.title('Result')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend(loc='lower left')
    plt.show()


if __name__ == '__main__':
    main()
View Code

## 总结：

---------------------------------------------------------书上有路，学海无涯。 生活总是很忙碌，也许这才是生活真正的奥秘。--------------------------------------------------------- 作者：Charzueus 来源：博客园 本博文版权归作者所有！ 禁止商业转载等用途或联系作者授权，非商业转载请注明出处！ 版权声明：本文为博主原创文章，转载请附上原文出处链接和本声明。
posted @ 2020-08-10 22:03  Charzueus  阅读(4773)  评论(0编辑  收藏  举报