PCA降维

读取数据

import pandas as pd
import openpyxl
import numpy as np
# Load the workbook with header=None so pandas labels the columns 0..n-1.
# The path is a raw string: the original literal relied on "\j" and "\d" being
# unrecognised escapes, which raises SyntaxWarning on Python 3.12+. The
# resulting path is byte-for-byte the same.
data = pd.read_excel(
    r"E:\jupyter_root_directory\data/我国大陆经济发展状况数据.xlsx",
    header=None,
    engine="openpyxl",
)
# Drop the first two rows (presumably title/header rows - confirm against the
# workbook) and keep only the columns labelled 1 through 8.
data = data.iloc[2:]
data = data[list(range(1, 9))]
data
 1 2 3 4 5 6 7 8
2 1394.89 2505 519.01 8144 373.9 117.3 112.6 843.43
3 920.11 2720 345.46 6501 342.8 115.2 110.6 582.51
4 2849.52 1258 704.87 4839 2033.3 115.2 115.8 1234.85
5 1092.48 1250 290.9 4721 717.3 116.9 115.6 697.25
6 832.88 1387 250.23 4134 781.7 117.5 116.8 419.39
7 2793.37 2397 387.99 4911 1371.1 116.1 114 1840.55
8 1129.2 1872 320.45 4430 497.4 115.2 114.2 762.47
9 2014.53 2334 435.73 4145 824.8 116.1 114.3 1240.37
10 2462.57 5343 996.48 9279 207.4 118.7 113 1642.95
11 5155.25 1926 1434.95 5934 1025.5 115.8 114.3 2026.64
12 3524.79 2249 1006.39 6619 754.4 116.6 113.5 916.59
13 2003.58 1254 474 4609 908.3 114.8 112.7 824.14
14 2160.52 2320 553.97 5857 609.3 115.2 114.4 433.67
15 1205.1 1182 282.84 4211 411.7 116.9 115.9 571.84
16 5002.34 1527 1229.55 5145 1196.6 117.6 114.2 2207.69
17 3002.74 1034 670.35 4344 1574.4 116.5 114.9 1367.92
18 2391.42 1527 571.68 4685 849 120 116.6 1220.72
19 2195.7 1408 422.61 4797 1011.8 119 115.5 843.83
20 5381.72 2699 1639.83 8250 656.5 114 111.6 1396.35
21 1606.15 1314 382.59 5150 556 118.4 116.4 554.97
22 364.17 1814 198.35 5340 232.1 113.5 111.3 64.33
23 3534 1261 822.54 4645 902.3 118.5 117 1431.81
24 630.07 942 150.84 4475 301.1 121.4 117.2 324.72
25 1206.68 1261 334 5149 310.4 121.3 118.1 716.65
26 55.98 1110 17.87 7382 4.2 117.3 114.9 5.57
27 1000.03 1208 300.27 4396 500.9 119 117 600.98
28 553.35 1007 114.81 5493 507 119.8 116.5 468.79
29 165.31 1445 47.76 5753 61.6 118 116.3 105.8
30 169.75 1355 61.98 5079 121.8 117.1 115.3 114.4
31 834.57 1469 376.95 5348 339 119.7 116.7 428.76

 

中心化(去均值):每个数据减去其所在列的平均值

# Number of rows (samples) and columns (features) in the table.
sample, feature = data.shape
# Cells read from a sheet that also holds text rows may arrive as object dtype
# (assumption - confirm); convert once so the arithmetic below is numeric.
data = data.astype(float)
# Subtract each column's mean from that column. The axis is spelled out
# because np.mean(data) forwards axis=None to DataFrame.mean, which pandas >= 2.0
# reduces over both axes into a single scalar instead of one mean per column.
data = data - data.mean(axis=0)
data
 1 2 3 4 5 6 7 8
2 -526.202 759.067 7.50167 2685.17 -292.22 0.0133333 -2.30667 -19.568
3 -1000.98 974.067 -166.048 1042.17 -323.32 -2.08667 -4.30667 -280.488
4 928.428 -487.933 193.362 -619.833 1367.18 -2.08667 0.893333 371.852
5 -828.612 -495.933 -220.608 -737.833 51.18 -0.386667 0.693333 -165.748
6 -1088.21 -358.933 -261.278 -1324.83 115.58 0.213333 1.89333 -443.608
7 872.278 651.067 -123.518 -547.833 704.98 -1.18667 -0.906667 977.552
8 -791.892 126.067 -191.058 -1028.83 -168.72 -2.08667 -0.706667 -100.528
9 93.4377 588.067 -75.7783 -1313.83 158.68 -1.18667 -0.606667 377.372
10 541.478 3597.07 484.972 3820.17 -458.72 1.41333 -1.90667 779.952
11 3234.16 180.067 923.442 475.167 359.38 -1.48667 -0.606667 1163.64
12 1603.7 503.067 494.882 1160.17 88.28 -0.686667 -1.40667 53.592
13 82.4877 -491.933 -37.5083 -849.833 242.18 -2.48667 -2.20667 -38.858
14 239.428 574.067 42.4617 398.167 -56.82 -2.08667 -0.506667 -429.328
15 -715.992 -563.933 -228.668 -1247.83 -254.42 -0.386667 0.993333 -291.158
16 3081.25 -218.933 718.042 -313.833 530.48 0.313333 -0.706667 1344.69
17 1081.65 -711.933 158.842 -1114.83 908.28 -0.786667 -0.00666667 504.922
18 470.328 -218.933 60.1717 -773.833 182.88 2.71333 1.69333 357.722
19 274.608 -337.933 -88.8983 -661.833 345.68 1.71333 0.593333 -19.168
20 3460.63 953.067 1128.32 2791.17 -9.62 -3.28667 -3.30667 533.352
21 -314.942 -431.933 -128.918 -308.833 -110.12 1.11333 1.49333 -308.028
22 -1556.92 68.0667 -313.158 -118.833 -434.02 -3.78667 -3.60667 -798.668
23 1612.91 -484.933 311.032 -813.833 236.18 1.21333 2.09333 568.812
24 -1291.02 -803.933 -360.668 -983.833 -365.02 4.11333 2.29333 -538.278
25 -714.412 -484.933 -177.508 -309.833 -355.72 4.01333 3.19333 -146.348
26 -1865.11 -635.933 -493.638 1923.17 -661.92 0.0133333 -0.00666667 -857.428
27 -921.062 -537.933 -211.238 -1062.83 -165.22 1.71333 2.09333 -262.018
28 -1367.74 -738.933 -396.698 34.1667 -159.12 2.51333 1.59333 -394.208
29 -1755.78 -300.933 -463.748 294.167 -604.52 0.713333 1.39333 -757.198
30 -1751.34 -390.933 -449.528 -379.833 -544.32 -0.186667 0.393333 -748.598
31 -1086.52 -276.933 -134.558 -110.833 -327.12 2.41333 1.79333 -434.238
计算协方差矩阵:直接调用方法即可
# Keep the data as a float64 matrix. The original cast to float16, which holds
# only about three significant decimal digits and so quantised values such as
# 5381.72 before the covariance was computed. np.asmatrix replaces np.mat,
# which was removed in NumPy 2.0; the result is still an np.matrix.
data1 = np.asmatrix(np.asarray(data, dtype=np.float64))

# Compute the covariance matrix. Rows of data1 are samples, so transpose to
# put one variable per row, which is what np.cov expects by default.
covX = np.cov(data1.T)
covX
array([[ 2.17512816e+06,  3.39017180e+05,  5.64795310e+05,
         3.66799624e+05,  4.18740435e+05, -8.14159678e+02,
        -7.37804742e+02,  7.53426315e+05],
       [ 3.39017180e+05,  7.42673545e+05,  1.47954656e+05,
         8.10174225e+05, -5.98597476e+04, -4.10269647e+02,
        -9.69473039e+02,  1.82942939e+05],
       [ 5.64795310e+05,  1.47954656e+05,  1.62302951e+05,
         2.10470018e+05,  7.98055667e+04, -2.28857444e+02,
        -2.74499481e+02,  1.86527121e+05],
       [ 3.66799624e+05,  8.10174225e+05,  2.10470018e+05,
         1.71571948e+06, -2.14593340e+05, -3.56041630e+02,
        -1.33849453e+03,  7.91434689e+04],
       [ 4.18740435e+05, -5.98597476e+04,  7.98055667e+04,
        -2.14593340e+05,  2.11547288e+05, -2.35784325e+02,
         1.89958719e+01,  1.77085901e+05],
       [-8.14159678e+02, -4.10269647e+02, -2.28857444e+02,
        -3.56041630e+02, -2.35784325e+02,  4.10102506e+00,
         2.93249666e+00, -1.48336671e+02],
       [-7.37804742e+02, -9.69473039e+02, -2.74499481e+02,
        -1.33849453e+03,  1.89958719e+01,  2.93249666e+00,
         3.60350766e+00, -2.13093770e+02],
       [ 7.53426315e+05,  1.82942939e+05,  1.86527121e+05,
         7.91434689e+04,  1.77085901e+05, -1.48336671e+02,
        -2.13093770e+02,  3.41794042e+05]])
 求特征值和特征向量:调用专有的方法
# Eigen-decompose the covariance matrix; column i of eig_vec belongs to eig_val[i].
eig_val, eig_vec = np.linalg.eig(covX)
# Pair each eigenvalue's magnitude with its eigenvector. Iterating eig_vec.T
# walks the columns of eig_vec in order.
eig_pairs = [(np.abs(value), vector) for value, vector in zip(eig_val, eig_vec.T)]
eig_val
array([3.00989343e+06, 1.90990488e+06, 3.00900997e+05, 8.22843300e+04,
       4.08689937e+04, 5.31590865e+03, 4.22552725e+00, 4.17682352e-01])
对特征值从大到小排序
# Indices that order the eigenvalues from largest to smallest: argsort of the
# negated values is a descending argsort.
index = np.argsort(-eig_val)
# Display the descending order just computed. The original displayed
# np.argsort(eig_val), which is the ascending order and contradicted the comment.
index

降维

# Number of principal components to keep.
k = 3
# Positions of the k largest eigenvalues, taken from the descending order in index.
top_k = index[:k]
# One selected eigenvector per row (the chosen columns of eig_vec, transposed).
selectVec = np.matrix(eig_vec[:, top_k].T)
# Project the data onto the selected eigenvectors: (30, 8) * (8, 3) = (30, 3)
finalData = data1 * selectVec.T
finalData.shape
finalData
 1 matrix([[  991.02791011, -2598.05143659,  -442.07545458],
 2         [ -145.34532529, -1734.65887357,   457.16158882],
 3         [  557.61661571,  1483.60034716,   -93.14172337],
 4         [-1197.70506441,   311.50193131,    20.75043138],
 5         [-1691.41478306,   542.6983315 ,   364.01896739],
 6         [  925.58426428,   922.66694957,   961.42758107],
 7         [-1105.78714784,   282.27855119,   652.96927436],
 8         [ -230.8764751 ,   915.2909667 ,  1154.93539315],
 9         [ 3479.73362389, -3833.86882421,  1432.59322724],
10         [ 3279.28517562,  1443.82207397,  -340.60839295],
11         [ 2020.42780485,  -236.33723265,  -368.15435258],
12         [ -459.95372537,   899.6555233 ,   -51.4321421 ],
13         [  421.88446529,  -480.08094189,   143.32609056],
14         [-1424.33703514,   671.74965252,    96.75028233],
15         [ 2712.10215068,  2166.91458621,  -234.05003709],
16         [  370.55219993,  1915.41058406,  -103.89839129],
17         [   75.40542303,   988.63141429,   171.25143746],
18         [ -179.38146112,   819.30680274,   -18.32663648],
19         [ 4553.43459221,  -648.74796439,  -936.50316474],
20         [ -631.0710909 ,   148.57756477,  -260.22529535],
21         [-1543.48297854,  -931.20547131,   148.74331019],
22         [  963.19113519,  1714.01295905,  -193.78601902],
23         [-1924.84798562,   212.88195738,  -181.92664169],
24         [ -942.34945313,   -52.02992477,  -208.50404981],
25         [-1165.55041791, -2423.78738221, -1276.80765054],
26         [-1469.95701957,   456.57292109,    92.47839383],
27         [-1461.05431309,  -532.72325834,  -483.85639992],
28         [-1662.97477831, -1243.25114115,  -294.80964414],
29         [-1974.30737332,  -688.33978176,   -78.22032709],
30         [-1139.6165708 ,  -491.32802526,  -129.98141111]])

 

posted @ 2022-11-03 15:29  僵尸棋  阅读(100)  评论(0编辑  收藏  举报