from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
import jieba
import pandas as pd


def datasets_demo():
    """
    Using the sklearn built-in datasets
    :return:
    """
    # Load the dataset
    iris = load_iris()
    print("Iris dataset:\n", iris)
    print("Dataset description:\n", iris["DESCR"])
    print("Feature names:\n", iris.feature_names)
    print("Feature values:\n", iris.data, iris.data.shape)

    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("Training-set features:\n", x_train, x_train.shape)

    return None


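# A companion sketch (not one of the original demos): when a scaler is
# combined with a split, it should be fit on the training data only and then
# reused on the test data, so no test-set statistics leak into preprocessing.
def split_and_scale_demo():
    """
    Sketch: fit StandardScaler on the training split, reuse it on the test split
    :return:
    """
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)  # learn mean/std from the training set only
    x_test = scaler.transform(x_test)        # apply the same mean/std to the test set
    print("scaled shapes:\n", x_train.shape, x_test.shape)
    return None

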
def dict_demo():
    """
    Feature extraction from dicts
    :return:
    """
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}, {'city': '深圳', 'temperature': 30}]
    # 1. Instantiate a transformer
    transfer = DictVectorizer(sparse=True)

    # 2. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray(), type(data_new))
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


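# A companion sketch: DictVectorizer.inverse_transform maps the one-hot rows
# back to feature dicts, which makes the 'city=<value>' encoding visible.
def dict_inverse_demo():
    """
    Sketch: round-trip a dict through DictVectorizer and back
    :return:
    """
    data = [{'city': '北京', 'temperature': 100}]
    transfer = DictVectorizer(sparse=False)
    matrix = transfer.fit_transform(data)
    print("round-trip:\n", transfer.inverse_transform(matrix))  # e.g. [{'city=北京': 1.0, 'temperature': 100.0}]
    return None

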
def count_demo():
    """
    Text feature extraction with CountVectorizer
    :return:
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # 1. Instantiate a transformer
    transfer = CountVectorizer(stop_words=["is", "too"])

    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


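# A companion sketch: ngram_range=(1, 2) makes CountVectorizer count word
# pairs (bigrams) as well as single words, which preserves some word order.
def count_ngram_demo():
    """
    Sketch: CountVectorizer with unigrams and bigrams
    :return:
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    transfer = CountVectorizer(ngram_range=(1, 2))
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None

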
def count_chinese_demo():
    """
    Chinese text feature extraction with CountVectorizer (pre-segmented input)
    :return:
    """
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]
    # 1. Instantiate a transformer
    transfer = CountVectorizer()

    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


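# Note on the demo above: CountVectorizer's default token_pattern keeps only
# tokens of two or more characters, so the single-character words "上" and
# "升" are silently dropped. A sketch of keeping them with a looser pattern:
def count_chinese_single_char_demo():
    """
    Sketch: keep single-character Chinese tokens via a custom token_pattern
    :return:
    """
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]
    transfer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None

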
def cut_word(text):
    """
    Segment Chinese text with jieba: "我爱北京天安门" --> "我 爱 北京 天安门"
    :param text:
    :return:
    """
    # jieba.cut returns a generator, which " ".join consumes directly
    return " ".join(jieba.cut(text))


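# A companion sketch of jieba's other entry points: jieba.lcut returns a
# plain list instead of a generator, and cut_all=True switches to "full
# mode", which lists every dictionary word found in the text, overlaps included.
def cut_word_modes_demo():
    """
    Sketch: jieba's list interface and full mode
    :return:
    """
    print(jieba.lcut("我爱北京天安门"))                # accurate mode (default)
    print(jieba.lcut("我爱北京天安门", cut_all=True))  # full mode: more, possibly overlapping tokens
    return None

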
def count_chinese_demo2():
    """
    Chinese text feature extraction with automatic word segmentation
    :return:
    """
    # Segment the Chinese text first
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # print(data_new)
    # 1. Instantiate a transformer
    transfer = CountVectorizer(stop_words=["一种", "所以"])

    # 2. Call fit_transform
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


def tfidf_demo():
    """
    Text feature extraction with TF-IDF
    :return:
    """
    # Segment the Chinese text first
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # print(data_new)
    # 1. Instantiate a transformer
    transfer = TfidfVectorizer(stop_words=["一种", "所以"])

    # 2. Call fit_transform
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


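# A companion sketch of what TfidfVectorizer computes with its defaults
# (smooth_idf=True, norm='l2'): tf * (ln((1 + n) / (1 + df)) + 1), with each
# row L2-normalized afterwards. The tiny count matrix is made up for illustration.
def tfidf_by_hand_demo():
    """
    Sketch: reproduce the default TF-IDF weighting with numpy
    :return:
    """
    import numpy as np
    counts = np.array([[3, 0, 1],
                       [2, 0, 0],
                       [0, 1, 1]], dtype=float)            # rows: documents, columns: terms
    n = counts.shape[0]                                    # number of documents
    df = (counts > 0).sum(axis=0)                          # documents containing each term
    idf = np.log((1 + n) / (1 + df)) + 1                   # smoothed idf
    tfidf = counts * idf
    tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # L2-normalize each row
    print("hand-computed tf-idf:\n", tfidf)
    return None

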
def minmax_demo():
    """
    Normalization (min-max scaling)
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]
    print("data:\n", data)

    # 2. Instantiate a transformer
    transfer = MinMaxScaler(feature_range=(2, 3))

    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)

    return None


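# A companion sketch of the formula MinMaxScaler applies per column:
# X_std = (X - X.min) / (X.max - X.min), then X_std * (mx - mi) + mi for a
# target feature_range of (mi, mx). The small matrix is made up for illustration.
def minmax_by_hand_demo():
    """
    Sketch: reproduce MinMaxScaler with numpy
    :return:
    """
    import numpy as np
    X = np.array([[90., 2., 10.], [60., 4., 15.], [75., 3., 13.]])
    mi, mx = 2, 3                                                  # target feature_range
    X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    print("scaled:\n", X_std * (mx - mi) + mi)                     # should match MinMaxScaler(feature_range=(2, 3))
    return None

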
def stand_demo():
    """
    Standardization
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]
    print("data:\n", data)

    # 2. Instantiate a transformer
    transfer = StandardScaler()

    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None


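# A companion sketch of the formula StandardScaler applies per column:
# x' = (x - mean) / std, where std is the population standard deviation
# (numpy's default, ddof=0). The small matrix is made up for illustration.
def standard_by_hand_demo():
    """
    Sketch: reproduce StandardScaler with numpy
    :return:
    """
    import numpy as np
    X = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
    print("standardized:\n", (X - X.mean(axis=0)) / X.std(axis=0))
    return None

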
def variance_demo():
    """
    Filter out low-variance features
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("factor_returns.csv")
    data = data.iloc[:, 1:-2]
    print("data:\n", data)

    # 2. Instantiate a transformer
    transfer = VarianceThreshold(threshold=10)

    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new, data_new.shape)

    # Pearson correlation between two pairs of features
    r1 = pearsonr(data["pe_ratio"], data["pb_ratio"])
    print("Correlation coefficient:\n", r1)
    r2 = pearsonr(data['revenue'], data['total_expense'])
    print("Correlation between revenue and total_expense:\n", r2)

    return None


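# A companion sketch: pearsonr returns (correlation coefficient, p-value);
# the coefficient is 1.0 for a perfect positive linear relationship, as the
# made-up data below shows.
def pearson_demo():
    """
    Sketch: pearsonr on perfectly correlated data
    :return:
    """
    r = pearsonr([1, 2, 3, 4], [2, 4, 6, 8])
    print("pearsonr:\n", r)  # coefficient 1.0, since y = 2x exactly
    return None

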
def pca_demo():
    """
    Dimensionality reduction with PCA
    :return:
    """
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]

    # 1. Instantiate a transformer; a float n_components keeps enough
    #    components to explain that fraction of the variance
    transfer = PCA(n_components=0.95)

    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None


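# A companion sketch contrasting the two meanings of n_components: a float in
# (0, 1) keeps enough components to explain that fraction of the variance,
# while an int keeps exactly that many components.
def pca_int_demo():
    """
    Sketch: PCA with a fixed number of components
    :return:
    """
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    transfer = PCA(n_components=2)  # keep exactly 2 components
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    print("variance explained per component:\n", transfer.explained_variance_ratio_)
    return None

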
if __name__ == "__main__":
    # Demo 1: using the sklearn built-in datasets
    # datasets_demo()
    # Demo 2: feature extraction from dicts
    # dict_demo()
    # Demo 3: text feature extraction with CountVectorizer
    # count_demo()
    # Demo 4: Chinese text feature extraction with CountVectorizer
    # count_chinese_demo()
    # Demo 5: Chinese text feature extraction with automatic segmentation
    # count_chinese_demo2()
    # Demo 6: Chinese word segmentation
    # print(cut_word("我爱北京天安门"))
    # Demo 7: text feature extraction with TF-IDF
    # tfidf_demo()
    # Demo 8: normalization
    # minmax_demo()
    # Demo 9: standardization
    # stand_demo()
    # Demo 10: low-variance feature filtering
    # variance_demo()
    # Demo 11: PCA dimensionality reduction
    pca_demo()