python数据挖掘作业____商品间/商品内分析
1 # -*- coding: utf-8 -*- 2 3 # 代码8-1 查看数据特征 4 5 import numpy as np 6 import pandas as pd 7 8 inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv' # 输入的数据文件 9 data = pd.read_csv(inputfile,encoding = 'gbk') # 读取数据 10 data .info() # 查看数据属性 11 12 data = data['id'] 13 description = [data.count(),data.min(), data.max()] # 依次计算总数、最小值、最大值 14 description = pd.DataFrame(description, index = ['Count','Min', 'Max']).T # 将结果存入数据框 15 print('描述性统计结果:\n',np.round(description)) # 输出结果
1 # 销量排行前10商品的销量及其占比 2 import pandas as pd 3 inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv' # 输入的数据文件 4 data = pd.read_csv(inputfile,encoding = 'gbk') # 读取数据 5 group = data.groupby(['Goods']).count().reset_index() # 对商品进行分类汇总 6 sorted=group.sort_values('id',ascending=False) 7 print('销量排行前10商品的销量:\n', sorted[:10]) # 排序并查看前10位热销商品
1 # 画条形图展示出销量排行前10商品的销量 2 import matplotlib.pyplot as plt 3 x=sorted[:10]['Goods'] 4 y=sorted[:10]['id'] 5 plt.figure(figsize = (8, 4)) # 设置画布大小 6 plt.barh(x,y) 7 plt.rcParams['font.sans-serif'] = 'SimHei' 8 plt.xlabel('销量') # 设置x轴标题 9 plt.ylabel('商品类别') # 设置y轴标题 10 plt.title('商品的销量TOP10--3009') # 设置标题 11 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//top10.png') # 把图片以.png格式保存 12 plt.show() # 展示图片
1 # 销量排行前10商品的销量占比 2 data_nums = data.shape[0] 3 for idnex, row in sorted[:10].iterrows(): 4 print(row['Goods'],row['id'],row['id']/data_nums)
1 # 代码8-3 各类别商品的销量及其占比 2 3 import pandas as pd 4 inputfile1 = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv' 5 inputfile2 = 'D://CourseAssignment//AI//GoodSellMod//GoodsTypes.csv' 6 data = pd.read_csv(inputfile1,encoding = 'gbk') 7 types = pd.read_csv(inputfile2,encoding = 'gbk') # 读入数据 8 9 group = data.groupby(['Goods']).count().reset_index() 10 sort = group.sort_values('id',ascending = False).reset_index() 11 data_nums = data.shape[0] # 总量 12 del sort['index'] 13 14 sort_links = pd.merge(sort,types) # 合并两个datafreame 根据type 15 # 根据类别求和,每个商品类别的总量,并排序 16 sort_link = sort_links.groupby(['Types']).sum().reset_index() 17 sort_link = sort_link.sort_values('id',ascending = False).reset_index() 18 del sort_link['index'] # 删除“index”列 19 20 # 求百分比,然后更换列名,最后输出到文件 21 sort_link['count'] = sort_link.apply(lambda line: line['id']/data_nums,axis=1) 22 sort_link.rename(columns = {'count':'percent'},inplace = True) 23 print('各类别商品的销量及其占比:\n',sort_link) 24 outfile1 = 'D://CourseAssignment//AI//GoodSellMod//tmp//percent.csv' 25 sort_link.to_csv(outfile1,index = False,header = True,encoding='gbk') # 保存结果
1 # 画饼图展示每类商品销量占比 2 import matplotlib.pyplot as plt 3 data = sort_link['percent'] 4 labels = sort_link['Types'] 5 plt.figure(figsize=(8, 6)) # 设置画布大小 6 plt.pie(data,labels=labels,autopct='%1.2f%%') 7 plt.rcParams['font.sans-serif'] = 'SimHei' 8 plt.title('每类商品销量占比--3009') # 设置标题 9 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//persent.png') # 把图片以.png格式保存 10 plt.show()
1 # 代码8-4 非酒精饮料内部商品的销量及其占比 2 3 # 先筛选“非酒精饮料”类型的商品,然后求百分比,然后输出结果到文件。 4 selected = sort_links.loc[sort_links['Types'] == '非酒精饮料'] # 挑选商品类别为“非酒精饮料”并排序 5 child_nums = selected['id'].sum() # 对所有的“非酒精饮料”求和 6 selected['child_percent'] = selected.apply(lambda line: line['id']/child_nums,axis = 1) # 求百分比 7 selected.rename(columns = {'id':'count'},inplace = True) 8 print('非酒精饮料内部商品的销量及其占比:\n',selected) 9 outfile2 = 'D://CourseAssignment//AI//GoodSellMod//tmp//child_percent.csv' 10 sort_link.to_csv(outfile2,index = False,header = True,encoding='gbk') # 输出结果
1 # 画饼图展示非酒精饮品内部各商品的销量占比 2 import matplotlib.pyplot as plt 3 data = selected['child_percent'] 4 labels = selected['Goods'] 5 plt.figure(figsize = (8,6)) # 设置画布大小 6 explode = (0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.08,0.3,0.1,0.3) # 设置每一块分割出的间隙大小 7 plt.pie(data,explode = explode,labels = labels,autopct = '%1.2f%%', 8 pctdistance = 1.1,labeldistance = 1.2) 9 plt.rcParams['font.sans-serif'] = 'SimHei' 10 plt.title("非酒精饮料内部各商品的销量占比--3009") # 设置标题 11 plt.axis('equal') 12 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//child_persent.png') # 保存图形 13 plt.show() # 展示图形
1 # 先筛选“西点”类型的商品,然后求百分比,然后输出结果到文件。 2 selected = sort_links.loc[sort_links['Types'] == '西点'] # 挑选商品类别为“非酒精饮料”并排序 3 child_nums = selected['id'].sum() # 对所有的“非酒精饮料”求和 4 selected['child_percent'] = selected.apply(lambda line: line['id']/child_nums,axis = 1) # 求百分比 5 selected.rename(columns = {'id':'count'},inplace = True) 6 print('西点内部商品的销量及其占比:\n',selected) 7 outfile3 = 'D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.csv' 8 sort_link.to_csv(outfile3,index = False,header = True,encoding='gbk') # 输出结果
1 # 画饼图展示西点内部各商品的销量占比 2 import matplotlib.pyplot as plt 3 data = selected['child_percent'] 4 labels = selected['Goods'] 5 plt.figure(figsize = (8,6)) # 设置画布大小 6 explode = (0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.08,0.3,0.1,0.3) # 设置每一块分割出的间隙大小 7 plt.pie(data,explode = None,labels = labels,autopct = '%1.2f%%', 8 pctdistance = 1.1,labeldistance = 1.2) 9 plt.rcParams['font.sans-serif'] = 'SimHei' 10 plt.title("西点内部各商品的销量占比--3009") # 设置标题 11 plt.axis('equal') 12 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.png') # 保存图形 13 plt.show() # 展示图形
1 # -*- coding: utf-8 -*- 2 import pandas as pd 3 inputfile='D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv' 4 data = pd.read_csv(inputfile,encoding = 'gbk') 5 6 # 根据id对“Goods”列合并,并使用“,”将各商品隔开 7 data['Goods'] = data['Goods'].apply(lambda x:','+x) 8 data = data.groupby('id').sum().reset_index() 9 10 # 对合并的商品列转换数据格式 11 data['Goods'] = data['Goods'].apply(lambda x :[x[1:]]) 12 data_list = list(data['Goods']) 13 14 # 分割商品名为每个元素 15 data_translation = [] 16 for i in data_list: 17 p = i[0].split(',') 18 data_translation.append(p) 19 for i in range(9): 20 print('数据转换结果的第 ',i+1,' 个元素:', data_translation[i]) 21 #print('数据转换结果的前9个元素:\n', data_translation[0:8]) 22 23 24 ######################################################### 25 26 27 def loadSimpleData(): 28 for i in range(3): 29 print("********") 30 ''' 31 simpleData = [['beer', 'milk', 'chicken'], 32 ['milk', 'bread'], 33 ['milk', 'diaper'], 34 ['beer', 'milk', 'bread'], 35 ['beer', 'diaper'], 36 ['milk', 'diaper'], 37 ['beer', 'diaper'], 38 ['beer', 'milk', 'diaper', 'chicken'], 39 ['beer', 'milk', 'diaper']] 40 return simpleData 41 ''' 42 simpleData = [data_translation[0], 43 data_translation[1], 44 data_translation[2], 45 data_translation[3], 46 data_translation[4], 47 data_translation[5], 48 data_translation[6], 49 data_translation[7], 50 data_translation[8]] 51 return simpleData 52 53 54 55 def createInitSet(dataSet: list) -> dict: 56 returnSet = {} 57 58 for item in dataSet: 59 frozenItem = frozenset(item) 60 returnSet[frozenItem] = returnSet.get(frozenItem, 0) + 1 61 62 return returnSet 63 64 65 class TreeNode(object): 66 def __init__(self, nameValue: str, numOccur: int, parentNode): 67 # 项的名字 68 self.name = nameValue 69 # 项在FPTree当中出现的次数 70 self.count = numOccur 71 # 相同项的下一个节点 72 self.nodeLink = None 73 # 父节点 74 self.parentNode = parentNode 75 # 子节点 76 # for example, the children like 'milk': TreeNode('milk') 77 self.children = {} 78 79 def inc(self, count): 80 self.count += count 81 82 def show(self, ind=1): 83 print(' ' * ind, self.name, ' ', self.count) 84 for child in self.children.values(): 85 child.show(ind + 1) 86 87 88 def getHeaderTable(dataSet, minSupport=1) -> dict: 89 headerTable = {} 90 91 for key, value in dataSet.items(): 92 for item in key: 93 headerTable[item] = headerTable.get(item, 0) + value 94 95 lessThanMinSupportList = list(filter(lambda k: headerTable[k] < minSupport, headerTable)) 96 for x in lessThanMinSupportList: 97 del headerTable[x] 98 99 return headerTable 100 101 102 def makeHeaderTable(headerTable: dict) -> dict: 103 for item in headerTable: 104 headerTable[item] = [headerTable[item], None] 105 106 return headerTable 107 108 109 def updateHeaderTable(toastNode: TreeNode, targetNode: TreeNode): 110 while toastNode.nodeLink is not None: 111 toastNode = toastNode.nodeLink 112 toastNode.nodeLink = targetNode 113 114 115 class FPTree: 116 def __init__(self, frozenDataDict: dict, headerTable: dict, minSupport: int): 117 self.treeNode = TreeNode('null', 1, None) 118 # 'milk': [counter, nodeLink] 119 self.headerTable = makeHeaderTable(headerTable) 120 self.frozenDataDict = frozenDataDict 121 self.minSupport = minSupport 122 123 def updateTree(self, treeNode, items: list, count: int): 124 item = items[0] 125 if item in treeNode.children: 126 treeNode.children[item].inc(count) 127 else: 128 treeNode.children[item] = TreeNode(item, count, treeNode) 129 if self.headerTable[item][1] is None: 130 self.headerTable[item][1] = treeNode.children[item] 131 else: 132 updateHeaderTable(self.headerTable[item][1], treeNode.children[item]) 133 if len(items) > 1: 134 self.updateTree(treeNode.children[item], items[1::], count) 135 136 def createFPTree(self): 137 freqItems = set(self.headerTable.keys()) 138 139 if len(freqItems) == 0: 140 self.headerTable = None 141 return 142 143 for transaction, count in self.frozenDataDict.items(): 144 learnSet = {} 145 for item in transaction: 146 if item in freqItems: 147 learnSet[item] = self.headerTable[item][0] 148 149 if len(learnSet) > 0: 150 orderedItems = [item[0] for item in sorted(learnSet.items(), key=lambda k: (k[1], k[0]), reverse=True)] 151 self.updateTree(self.treeNode, orderedItems, count) 152 153 154 def main(): 155 data = loadSimpleData() 156 dataDict = createInitSet(data) 157 headerTable = getHeaderTable(dataDict, 3) 158 fpTree = FPTree(dataDict, headerTable, 3) 159 fpTree.createFPTree() 160 fpTree.treeNode.show() 161 162 main()