【数据分析&数据挖掘】非数值型数据的哑变量转化、连续型数据离散化——等宽分组&等频分组

 1 import pandas as pd
 2 import numpy as np
 3 
 4 # 加载数据
 5 detail = pd.read_excel("../day05/meal_order_detail.xlsx")
 6 # print("detail ：", detail)
 7 print("detail 的列索引：", detail.columns)
 8 
 9 # 将dishes_name 转化为数值型数据
10 # 哑变量矩阵转化
11 res = pd.get_dummies(
12     data=detail.loc[:,"dishes_name"],
13     prefix_sep="_",
14     prefix="菜品"
15 )
16 print("转化之后的结果res:\n",res)
17 # res.to_csv("./hh.csv")
18 
19 
20 # 身高 150 - 190  每位同学 都是一个具体的身高---连续的小数
21 # 将连续型数据转化为类别数据 ----离散化
22 # 分组
23 print("菜品单价的最大值与最小值：", detail.loc[:, "amounts"].max(), detail.loc[:, "amounts"].min())
24 # 将detail 里面的amounts 数据进行离散化
25 # detail.loc[:, "amounts"] = pd.cut(detail.loc[:, "amounts"], bins=5)
26 
27 # 自定义分组
28 # # 等宽分组
29 # # 1、指定分组个数
30 group_num = 5
31 # # # 2、计算最大值与最小值的极差
32 ptp = detail.loc[:, "amounts"].max() - detail.loc[:, "amounts"].min()
33 # # # 3、确定步长
34 step = int(np.ceil(ptp / group_num))
35 # # # 4、确定分组的区间的节点
36 bins = np.arange(detail.loc[:, "amounts"].min(), detail.loc[:, "amounts"].max() + step, step)
37 print(bins)
38 # # 5、指定自定义分组
39 # # include_lowest ---指定包含最小值
40 detail.loc[:, "amounts"] = pd.cut(detail.loc[:, "amounts"], bins=bins, include_lowest=True)
41 
42 # 等频分组
43 # 1、计算分位数
44 # bins = detail.loc[:, "amounts"].quantile(q=np.arange(0, 1 + 1 / 5, 1 / 5))
45 # print(bins)
46 # # include_lowest ---指定包含最小值
47 # detail.loc[:, "amounts"] = pd.cut(detail.loc[:, "amounts"], bins=bins, include_lowest=True)
48 #
49 # print(detail.loc[:, "amounts"])
50 # #
51 # # 统计每一个组内的个数
52 # res_counts = pd.value_counts(detail.loc[:, "amounts"])
53 # print("res_counts:\n", res_counts)

发表于 2019-12-29 19:50 可西可彻阅读(840) 评论(0) 编辑收藏举报

公告