Python数据挖掘作业——商品间/商品内分析

 1 # -*- coding: utf-8 -*-
 2 
 3 # 代码8-1 查看数据特征
 4 
 5 import numpy as np
 6 import pandas as pd
 7 
 8 inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'   # 输入的数据文件
 9 data = pd.read_csv(inputfile,encoding = 'gbk')  # 读取数据
10 data .info()  # 查看数据属性
11 
12 data = data['id']
13 description = [data.count(),data.min(), data.max()]  # 依次计算总数、最小值、最大值
14 description = pd.DataFrame(description, index = ['Count','Min', 'Max']).T  # 将结果存入数据框
15 print('描述性统计结果：\n',np.round(description))  # 输出结果

# Sales volume and sales share of the ten best-selling goods.
import pandas as pd
import matplotlib.pyplot as plt

inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'  # input data file
data = pd.read_csv(inputfile, encoding='gbk')  # load the order table

# Count the order rows of every product; after count() the 'id' column holds
# the number of rows per 'Goods' value.
group = data.groupby(['Goods']).count().reset_index()
# Deliberately NOT named `sorted`: rebinding that name would hide the builtin
# sorted() from every later statement that runs in the same session.
sorted_goods = group.sort_values('id', ascending=False)
top10 = sorted_goods[:10]  # the ten rows with the highest counts
print('销量排行前10商品的销量:\n', top10)

# Horizontal bar chart of the top-10 counts.
plt.rcParams['font.sans-serif'] = 'SimHei'  # font used for the Chinese labels
plt.figure(figsize=(8, 4))  # canvas size
plt.barh(top10['Goods'], top10['id'])  # bar positions = goods, bar lengths = counts
plt.xlabel('销量')  # x-axis title
plt.ylabel('商品类别')  # y-axis title
plt.title('商品的销量TOP10--3009')  # chart title
plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//top10.png')  # save as .png
plt.show()  # display the figure

# Share of all order rows taken by each of the top-10 goods.
data_nums = data.shape[0]  # total number of order rows
for _, row in top10.iterrows():
    print(row['Goods'], row['id'], row['id'] / data_nums)

 1 # 代码8-3 各类别商品的销量及其占比
 2 
 3 import pandas as pd
 4 inputfile1 = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'
 5 inputfile2 = 'D://CourseAssignment//AI//GoodSellMod//GoodsTypes.csv'
 6 data = pd.read_csv(inputfile1,encoding = 'gbk')
 7 types = pd.read_csv(inputfile2,encoding = 'gbk')  # 读入数据
 8 
 9 group = data.groupby(['Goods']).count().reset_index()
10 sort = group.sort_values('id',ascending = False).reset_index()
11 data_nums = data.shape[0]  # 总量
12 del sort['index']
13 
14 sort_links = pd.merge(sort,types)  # 合并两个datafreame 根据type
15 # 根据类别求和，每个商品类别的总量，并排序
16 sort_link = sort_links.groupby(['Types']).sum().reset_index()
17 sort_link = sort_link.sort_values('id',ascending = False).reset_index()
18 del sort_link['index']  # 删除“index”列
19 
20 # 求百分比，然后更换列名，最后输出到文件
21 sort_link['count'] = sort_link.apply(lambda line: line['id']/data_nums,axis=1)
22 sort_link.rename(columns = {'count':'percent'},inplace = True)
23 print('各类别商品的销量及其占比:\n',sort_link)
24 outfile1 = 'D://CourseAssignment//AI//GoodSellMod//tmp//percent.csv'
25 sort_link.to_csv(outfile1,index = False,header = True,encoding='gbk')  # 保存结果

 1 # 画饼图展示每类商品销量占比
 2 import matplotlib.pyplot as plt
 3 data = sort_link['percent']
 4 labels = sort_link['Types']
 5 plt.figure(figsize=(8, 6))  # 设置画布大小   
 6 plt.pie(data,labels=labels,autopct='%1.2f%%')
 7 plt.rcParams['font.sans-serif'] = 'SimHei'
 8 plt.title('每类商品销量占比--3009')  # 设置标题
 9 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//persent.png')  # 把图片以.png格式保存
10 plt.show()

 1 # 代码8-4 非酒精饮料内部商品的销量及其占比
 2 
 3 # 先筛选“非酒精饮料”类型的商品，然后求百分比，然后输出结果到文件。
 4 selected = sort_links.loc[sort_links['Types'] == '非酒精饮料']  # 挑选商品类别为“非酒精饮料”并排序
 5 child_nums = selected['id'].sum()  # 对所有的“非酒精饮料”求和
 6 selected['child_percent'] = selected.apply(lambda line: line['id']/child_nums,axis = 1)  # 求百分比
 7 selected.rename(columns = {'id':'count'},inplace = True)
 8 print('非酒精饮料内部商品的销量及其占比:\n',selected)
 9 outfile2 = 'D://CourseAssignment//AI//GoodSellMod//tmp//child_percent.csv'
10 sort_link.to_csv(outfile2,index = False,header = True,encoding='gbk')  # 输出结果

 1 # 画饼图展示非酒精饮品内部各商品的销量占比
 2 import matplotlib.pyplot as plt
 3 data = selected['child_percent']
 4 labels = selected['Goods']
 5 plt.figure(figsize = (8,6))  # 设置画布大小 
 6 explode = (0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.08,0.3,0.1,0.3)  # 设置每一块分割出的间隙大小
 7 plt.pie(data,explode = explode,labels = labels,autopct = '%1.2f%%',
 8         pctdistance = 1.1,labeldistance = 1.2)
 9 plt.rcParams['font.sans-serif'] = 'SimHei'
10 plt.title("非酒精饮料内部各商品的销量占比--3009")  # 设置标题
11 plt.axis('equal')
12 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//child_persent.png')  # 保存图形
13 plt.show()  # 展示图形

# Sales volume and share of the goods inside the "西点" (pastry) category.

# Keep the rows of that category. .copy() makes `selected` an independent
# frame, so adding a column below does not write into a slice of sort_links.
selected = sort_links.loc[sort_links['Types'] == '西点'].copy()
child_nums = selected['id'].sum()  # total count of the whole category
selected['child_percent'] = selected['id'] / child_nums  # share inside the category
selected = selected.rename(columns={'id': 'count'})
print('西点内部商品的销量及其占比:\n', selected)
outfile3 = 'D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.csv'
# Write the per-product table computed here (the category-level table
# `sort_link` was being written to this file by mistake).
selected.to_csv(outfile3, index=False, header=True, encoding='gbk')

 1 # 画饼图展示西点内部各商品的销量占比
 2 import matplotlib.pyplot as plt
 3 data = selected['child_percent']
 4 labels = selected['Goods']
 5 plt.figure(figsize = (8,6))  # 设置画布大小
 6 explode = (0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.08,0.3,0.1,0.3)  # 设置每一块分割出的间隙大小
 7 plt.pie(data,explode = None,labels = labels,autopct = '%1.2f%%',
 8         pctdistance = 1.1,labeldistance = 1.2)
 9 plt.rcParams['font.sans-serif'] = 'SimHei'
10 plt.title("西点内部各商品的销量占比--3009")  # 设置标题
11 plt.axis('equal')
12 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.png')  # 保存图形
13 plt.show()  # 展示图形

  1 # -*- coding: utf-8 -*-
  2 import pandas as pd
  3 inputfile='D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'
  4 data = pd.read_csv(inputfile,encoding = 'gbk')
  5 
  6 # 根据id对“Goods”列合并，并使用“，”将各商品隔开
  7 data['Goods'] = data['Goods'].apply(lambda x:','+x)
  8 data = data.groupby('id').sum().reset_index()
  9 
 10 # 对合并的商品列转换数据格式
 11 data['Goods'] = data['Goods'].apply(lambda x :[x[1:]])
 12 data_list = list(data['Goods'])
 13 
 14 # 分割商品名为每个元素
 15 data_translation = []
 16 for i in data_list:
 17     p = i[0].split(',')
 18     data_translation.append(p)
 19 for i in range(9):
 20     print('数据转换结果的第 ',i+1,' 个元素：', data_translation[i])
 21 #print('数据转换结果的前9个元素：\n', data_translation[0:8])
 22 
 23 
 24 #########################################################
 25 
 26 
 27 def loadSimpleData():
 28     for i in range(3):
 29         print("********")
 30     '''
 31     simpleData = [['beer', 'milk', 'chicken'], 
 32                  ['milk', 'bread'], 
 33                  ['milk', 'diaper'],
 34                  ['beer', 'milk', 'bread'], 
 35                  ['beer', 'diaper'], 
 36                  ['milk', 'diaper'],
 37                  ['beer', 'diaper'], 
 38                  ['beer', 'milk', 'diaper', 'chicken'], 
 39                  ['beer', 'milk', 'diaper']]
 40     return simpleData
 41     '''
 42     simpleData = [data_translation[0],
 43                   data_translation[1],
 44                   data_translation[2],
 45                   data_translation[3],
 46                   data_translation[4],
 47                   data_translation[5],
 48                   data_translation[6],
 49                   data_translation[7],
 50                   data_translation[8]]
 51     return simpleData
 52 
 53 
 54 
 55 def createInitSet(dataSet: list) -> dict:
 56     returnSet = {}
 57 
 58     for item in dataSet:
 59         frozenItem = frozenset(item)
 60         returnSet[frozenItem] = returnSet.get(frozenItem, 0) + 1
 61 
 62     return returnSet
 63 
 64 
 65 class TreeNode(object):
 66     def __init__(self, nameValue: str, numOccur: int, parentNode):
 67         # 项的名字
 68         self.name = nameValue
 69         # 项在FPTree当中出现的次数
 70         self.count = numOccur
 71         # 相同项的下一个节点
 72         self.nodeLink = None
 73         # 父节点
 74         self.parentNode = parentNode
 75         # 子节点
 76         # for example, the children like 'milk': TreeNode('milk')
 77         self.children = {}
 78 
 79     def inc(self, count):
 80         self.count += count
 81 
 82     def show(self, ind=1):
 83         print('   ' * ind, self.name, ' ', self.count)
 84         for child in self.children.values():
 85             child.show(ind + 1)
 86 
 87 
 88 def getHeaderTable(dataSet, minSupport=1) -> dict:
 89     headerTable = {}
 90 
 91     for key, value in dataSet.items():
 92         for item in key:
 93             headerTable[item] = headerTable.get(item, 0) + value
 94 
 95     lessThanMinSupportList = list(filter(lambda k: headerTable[k] < minSupport, headerTable))
 96     for x in lessThanMinSupportList:
 97         del headerTable[x]
 98 
 99     return headerTable
100 
101 
def makeHeaderTable(headerTable: dict) -> dict:
    """Turn every value ``v`` of *headerTable* into ``[v, None]``, in place.

    The second slot is initialised to ``None``. The same dict object that was
    passed in is returned.
    """
    # The replacement entries are built completely before the dict is updated.
    headerTable.update({item: [value, None] for item, value in headerTable.items()})
    return headerTable
107 
108 
def updateHeaderTable(toastNode: TreeNode, targetNode: TreeNode):
    """Append *targetNode* to the end of the ``nodeLink`` chain starting at *toastNode*."""
    tail = toastNode
    while True:
        if tail.nodeLink is None:
            break
        tail = tail.nodeLink
    # `tail` is now the last node of the chain.
    tail.nodeLink = targetNode
113 
114 
class FPTree:
    """FP-tree built from counted transactions and an item header table."""

    def __init__(self, frozenDataDict: dict, headerTable: dict, minSupport: int):
        """Store the inputs and create the root node.

        Args:
            frozenDataDict: dict mapping a transaction (iterable of items) to
                its occurrence count.
            headerTable: dict mapping item -> support; it is converted in
                place by ``makeHeaderTable`` to item -> [support, nodeLink].
            minSupport: stored on the instance; not read by the methods of
                this class.
        """
        # Root of the tree: named 'null', no parent.
        self.treeNode = TreeNode('null', 1, None)
        # 'milk': [support, first node of the 'milk' link chain]
        self.headerTable = makeHeaderTable(headerTable)
        self.frozenDataDict = frozenDataDict
        self.minSupport = minSupport

    def updateTree(self, treeNode, items: list, count: int):
        """Insert the path *items* below *treeNode*, adding *count* to each node.

        Existing children are incremented; missing ones are created and
        appended to the header table's link chain for their item. An empty
        *items* list is a no-op. The path is walked iteratively, so no list
        copy or recursion level is needed per item.
        """
        node = treeNode
        for item in items:
            child = node.children.get(item)
            if child is not None:
                child.inc(count)
            else:
                child = TreeNode(item, count, node)
                node.children[item] = child
                entry = self.headerTable[item]
                if entry[1] is None:
                    # First node for this item: it becomes the chain head.
                    entry[1] = child
                else:
                    updateHeaderTable(entry[1], child)
            node = child

    def createFPTree(self):
        """Insert every transaction of ``frozenDataDict`` into the tree.

        If the header table is empty (or already ``None``), ``headerTable`` is
        set to ``None`` and nothing is inserted.
        """
        if not self.headerTable:
            self.headerTable = None
            return

        for transaction, count in self.frozenDataDict.items():
            # Support of each item of the transaction that is in the header table.
            supports = {
                item: self.headerTable[item][0]
                for item in transaction
                if item in self.headerTable
            }
            if supports:
                # Highest support first; ties are broken by item name, descending.
                orderedItems = sorted(
                    supports,
                    key=lambda item: (supports[item], item),
                    reverse=True,
                )
                self.updateTree(self.treeNode, orderedItems, count)
152 
153 
def main():
    """Build an FP-tree from the sample transactions and print it."""
    minSupport = 3  # threshold used for both the header table and the tree

    transactions = loadSimpleData()
    countedTransactions = createInitSet(transactions)
    frequentItems = getHeaderTable(countedTransactions, minSupport)

    tree = FPTree(countedTransactions, frequentItems, minSupport)
    tree.createFPTree()
    tree.treeNode.show()


main()

posted @ 2023-03-19 23:53 孤影化双皮奶 阅读(51) 评论(0) 收藏 举报

刷新页面返回顶部

Python数据挖掘作业——商品间/商品内分析

公告