# Code 11-1

import os
import pandas as pd


# Change the working directory to the target folder
# (raw string so the backslashes in the Windows path are not read as escapes)
os.chdir(r"D:\shuzituxiangchuli\JupyterLab-Portable-3.1.0-3.9\notebooks\数据挖掘\电子商务")

# Second connection method
import pymysql as pm

con = pm.connect(host='localhost', user='root', password='aA111111',
                 database='test', charset='utf8')
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # close the connection

# Save the data that was read
# (the source listing truncates this line; the file name below is assumed)
data.to_csv(r"D:\shuzituxiangchuli\JupyterLab-Portable-3.1.0-3.9\notebooks"
            r"\数据挖掘\电子商务\all_gzdata.csv", index=False)
# Code 11-2

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:aA111111@localhost/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
# Analyse the page types
counts = [i['fullURLId'].value_counts() for i in sql]  # count chunk by chunk
counts = pd.concat(counts).groupby(level=0).sum()  # merge the per-chunk results (group by index label and sum)
counts = counts.reset_index()  # turn the old index into an ordinary column
counts.columns = ['index', 'num']  # rename the columns (the second one defaults to 0)
counts['type'] = counts['index'].str.extract(r'(\d{3})')  # first three digits = category id
counts_ = counts[['type', 'num']].groupby('type').sum()  # aggregate by category
counts_.sort_values(by='num', ascending=False, inplace=True)  # sort in descending order
counts_['ratio'] = counts_.iloc[:, 0] / counts_.iloc[:, 0].sum()
print(counts_)
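
# A tiny self-contained illustration of the concat-then-groupby merge used
# above: rows that share an index label are summed. The toy Series and their
# labels are hypothetical, not data from all_gzdata.
c1 = pd.Series({'101': 3, '107': 5})  # value_counts of chunk 1
c2 = pd.Series({'107': 2, '199': 4})  # value_counts of chunk 2
merged = pd.concat([c1, c2]).groupby(level=0).sum()
print(merged)  # 101 -> 3, 107 -> 7, 199 -> 4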
# Code 11-3

# Type 107001 is the only 107 type, but it can be split further into three
# sub-types: knowledge content pages, knowledge list pages and the knowledge home page
def count107(i):  # custom per-chunk counter
    j = i[['fullURL']][i['fullURLId'].str.contains('107')].copy()  # URLs whose type contains 107
    j['type'] = None  # add an empty column
    j.loc[j['fullURL'].str.contains('info/.+?/'), 'type'] = '知识首页'
    j.loc[j['fullURL'].str.contains('info/.+?/.+?'), 'type'] = '知识列表页'
    j.loc[j['fullURL'].str.contains(r'/\d+?_*\d+?\.html'), 'type'] = '知识内容页'
    return j['type'].value_counts()

# Note: the chunked sql object must be re-created before every pass over the
# table (see the illustration at the end of this listing)
#engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts2 = [count107(i) for i in sql]  # count chunk by chunk
counts2 = pd.concat(counts2).groupby(level=0).sum()  # merge the per-chunk results
print(counts2)
# Compute the share of each sub-type
res107 = pd.DataFrame(counts2)
# res107.reset_index(inplace=True)
res107.index.name = '107类型'
res107.rename(columns={'type': 'num'}, inplace=True)
res107['比例'] = res107['num'] / res107['num'].sum()
res107.reset_index(inplace=True)
print(res107)
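
# Why the re-query is needed: pd.read_sql with chunksize returns a one-shot
# iterator, so a second pass over the same object reads nothing. A minimal
# illustration, reusing the engine and table from Code 11-2:
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
first_pass = [len(chunk) for chunk in sql]   # consumes the iterator
second_pass = [len(chunk) for chunk in sql]  # [] -- nothing left to read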
# Code 11-4

def countquestion(i):  # custom per-chunk counter
    j = i[['fullURLId']][i['fullURL'].str.contains(r'\?')].copy()  # URLs containing a '?'
    return j

#engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts3 = [countquestion(i)['fullURLId'].value_counts() for i in sql]
counts3 = pd.concat(counts3).groupby(level=0).sum()
print(counts3)

# Compute each type's share and keep the result
df1 = pd.DataFrame(counts3)
df1['perc'] = df1['fullURLId'] / df1['fullURLId'].sum() * 100
df1.sort_values(by='fullURLId', ascending=False, inplace=True)
print(df1.round(4))
# Code 11-5

def page199(i):  # custom per-chunk counter
    j = i[['fullURL', 'pageTitle']][(i['fullURLId'].str.contains('199')) &
                                    (i['fullURL'].str.contains(r'\?'))].copy()
    j['pageTitle'] = j['pageTitle'].fillna('空')
    j['type'] = '其他'  # default category
    j.loc[j['pageTitle'].str.contains('法律快车-律师助手'), 'type'] = '法律快车-律师助手'
    j.loc[j['pageTitle'].str.contains('咨询发布成功'), 'type'] = '咨询发布成功'
    j.loc[j['pageTitle'].str.contains('免费发布法律咨询'), 'type'] = '免费发布法律咨询'
    j.loc[j['pageTitle'].str.contains('法律快搜'), 'type'] = '快搜'
    j.loc[j['pageTitle'].str.contains('法律快车法律经验'), 'type'] = '法律快车法律经验'
    j.loc[j['pageTitle'].str.contains('法律快车法律咨询'), 'type'] = '法律快车法律咨询'
    j.loc[(j['pageTitle'].str.contains('_法律快车')) |
          (j['pageTitle'].str.contains('-法律快车')), 'type'] = '法律快车'
    j.loc[j['pageTitle'].str.contains('空'), 'type'] = '空'
    return j

# Note: the chunked sql object must be re-created before every pass
#engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks
#sql = pd.read_sql_query('select * from all_gzdata limit 10000', con=engine)

counts4 = [page199(i) for i in sql]  # count chunk by chunk
counts4 = pd.concat(counts4)
d1 = counts4['type'].value_counts()
print(d1)
d2 = counts4[counts4['type'] == '其他']
print(d2)
# Compute each category's share and keep the result
df1_ = pd.DataFrame(d1)
df1_['perc'] = df1_['type'] / df1_['type'].sum() * 100
df1_.sort_values(by='type', ascending=False, inplace=True)
print(df1_)
# Code 11-6

def xiaguang(i):  # custom per-chunk filter
    j = i.loc[(i['fullURL'].str.contains(r'\.html')) == False,
              ['fullURL', 'fullURLId', 'pageTitle']]
    return j

# Note: the chunked sql object must be re-created before every pass
engine = create_engine('mysql+pymysql://root:aA111111@localhost/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks

counts5 = [xiaguang(i) for i in sql]
counts5 = pd.concat(counts5)

xg1 = counts5['fullURLId'].value_counts()
print(xg1)
# Compute each type's share
xg_ = pd.DataFrame(xg1)
xg_.reset_index(inplace=True)
xg_.columns = ['index', 'num']
xg_['perc'] = xg_['num'] / xg_['num'].sum() * 100
xg_.sort_values(by='num', ascending=False, inplace=True)

xg_['type'] = xg_['index'].str.extract(r'(\d{3})')  # first three digits = category id

xgs_ = xg_[['type', 'num']].groupby('type').sum()  # aggregate by category
xgs_.sort_values(by='num', ascending=False, inplace=True)  # sort in descending order
xgs_['percentage'] = xgs_['num'] / xgs_['num'].sum() * 100

print(xgs_.round(4))
# Code 11-7

# Analyse page click counts
# Count how often each IP appears
engine = create_engine('mysql+pymysql://root:aA111111@localhost/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks

counts1 = [i['realIP'].value_counts() for i in sql]  # count each IP's occurrences per chunk
counts1 = pd.concat(counts1).groupby(level=0).sum()  # merge the results; level=0 groups by index label
print(counts1)

counts1_ = pd.DataFrame(counts1)  # one row per IP; the 'realIP' column holds that IP's click count

counts1_[1] = 1  # add a column of ones
hit_count = counts1_.groupby('realIP').sum()  # for each distinct click count, the number of users with it
# equivalently: counts1_['realIP'].value_counts()
hit_count.columns = ['用户数']
hit_count.index.name = '点击次数'

# Count users with 1-7 clicks and with more than 7 clicks
hit_count.sort_index(inplace=True)
hit_count_7 = hit_count.iloc[:7, :]
time = hit_count.iloc[7:, 0].sum()  # number of users with more than 7 clicks
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job
hit_count_7 = pd.concat([hit_count_7, pd.DataFrame([{'用户数': time}])],
                        ignore_index=True)
hit_count_7.index = ['1', '2', '3', '4', '5', '6', '7', '7次以上']
hit_count_7['用户比例'] = hit_count_7['用户数'] / hit_count_7['用户数'].sum()
print(hit_count_7)
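
# As an aside, the slice-plus-concat above can be collapsed into a single
# binning step. A sketch assuming the hit_count frame built in Code 11-7;
# the pd.cut call and the bin edges are my assumption, not the book's code.
bins = [0, 1, 2, 3, 4, 5, 6, 7, float('inf')]
labels = ['1', '2', '3', '4', '5', '6', '7', '7次以上']
groups = pd.cut(hit_count.index, bins=bins, labels=labels)
hit_count_7_alt = hit_count.groupby(groups)['用户数'].sum()
print(hit_count_7_alt)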
# Code 11-8
import pandas as pd

# Analyse the behaviour of users who browsed only once
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:aA111111@localhost/test?charset=utf8')
all_gzdata = pd.read_sql_table('all_gzdata', con=engine.connect())  # read the all_gzdata table

# Count records per realIP
# and extract the users who browsed exactly one page
real_count = pd.DataFrame(all_gzdata.groupby("realIP")["realIP"].count())
real_count.columns = ["count"]
real_count["realIP"] = real_count.index.tolist()
user_one = real_count[(real_count["count"] == 1)]  # users that appear only once
# Join back to the raw data on realIP
real_one = pd.merge(user_one, all_gzdata, left_on="realIP", right_on="realIP")

# Count the page types visited by single-visit users
URL_count = pd.DataFrame(real_one.groupby("fullURLId")["fullURLId"].count())
URL_count.columns = ["count"]
URL_count.sort_values(by='count', ascending=False, inplace=True)  # sort in descending order
# Keep the four most common page types and lump the rest together
URL_count_4 = URL_count.iloc[:4, :]
time = URL_count.iloc[4:, 0].sum()  # total of the remaining types
URLindex = URL_count_4.index.values
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job
URL_count_4 = pd.concat([URL_count_4, pd.DataFrame([{'count': time}])],
                        ignore_index=True)
URL_count_4.index = [URLindex[0], URLindex[1], URLindex[2], URLindex[3], '其他']
URL_count_4['比例'] = URL_count_4['count'] / URL_count_4['count'].sum()
print(URL_count_4)
# Code 11-9

# Total times each page was viewed, restricted to single-visit users
fullURL_count = pd.DataFrame(real_one.groupby("fullURL")["fullURL"].count())
fullURL_count.columns = ["count"]
fullURL_count["fullURL"] = fullURL_count.index.tolist()
fullURL_count.sort_values(by='count', ascending=False, inplace=True)  # sort in descending order
# Code 11-10

import os
import re
import pandas as pd
import pymysql as pm
from random import sample

# Change the working directory to the target folder
os.chdir(r"D:\shuzituxiangchuli\JupyterLab-Portable-3.1.0-3.9\notebooks\数据挖掘\电子商务")

# Read the data
con = pm.connect(host='localhost', user='root', password='aA111111',
                 database='test', charset='utf8')
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # close the connection

# Extract the type-107 records
index107 = [re.search('107', str(i)) != None for i in data.loc[:, 'fullURLId']]
data_107 = data.loc[index107, :]

# Within type 107, keep only the marriage-related (hunyin) records
index = [re.search('hunyin', str(i)) != None for i in data_107.loc[:, 'fullURL']]
data_hunyin = data_107.loc[index, :]

# Keep only the needed fields (realIP, fullURL)
info = data_hunyin.loc[:, ['realIP', 'fullURL']]

# Strip '?' and everything after it from the URLs
da = [re.sub(r'\?.*', '', str(i)) for i in info.loc[:, 'fullURL']]
info.loc[:, 'fullURL'] = da  # replace the fullURL column with the cleaned URLs
# Drop URLs without .html
index = [re.search(r'\.html', str(i)) != None for i in info.loc[:, 'fullURL']]
index.count(True)  # True counts as 1, False as 0
info1 = info.loc[index, :]
# Code 11-11

# Separate paginated from non-paginated URLs
index = [re.search(r'/\d+_\d+\.html', i) != None for i in info1.loc[:, 'fullURL']]
index1 = [not i for i in index]
info1_1 = info1.loc[index, :]   # paginated URLs
info1_2 = info1.loc[index1, :]  # non-paginated URLs
# Reduce paginated URLs to their base page
da = [re.sub(r'_\d+\.html', '.html', str(i)) for i in info1_1.loc[:, 'fullURL']]
info1_1.loc[:, 'fullURL'] = da
# Recombine paginated and non-paginated URLs
frames = [info1_1, info1_2]
info2 = pd.concat(frames)
# or equivalently
info2 = pd.concat([info1_1, info1_2], axis=0)  # axis defaults to 0, i.e. row-wise concatenation
# De-duplicate rows (identical realIP and fullURL)
info3 = info2.drop_duplicates()
# Convert both columns to strings
info3.iloc[:, 0] = [str(index) for index in info3.iloc[:, 0]]
info3.iloc[:, 1] = [str(index) for index in info3.iloc[:, 1]]
len(info3)
# Code 11-12

# Keep only IPs with enough page views
IP_count = info3['realIP'].value_counts()
# Collect the IPs and their view counts
IP = list(IP_count.index)
count = list(IP_count.values)
# Store them in the IP_count data frame: first column IP, second column view count
IP_count = pd.DataFrame({'IP': IP, 'count': count})
# Select the IPs that viewed more than n pages
n = 2
index = IP_count.loc[:, 'count'] > n
IP_index = IP_count.loc[index, 'IP']
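# The listing jumps from Code 11-12 to Code 11-14, so the step that builds
# data_tr, data_te, IP_tr and url_tr is missing. A minimal reconstruction,
# assuming a random 80/20 split (the ratio is an assumption; `sample` is
# imported in Code 11-10):
data_cl = info3[info3['realIP'].isin(IP_index)]  # keep only the qualifying IPs

index_tr = sample(range(len(data_cl)), int(len(data_cl) * 0.8))  # assumed 80% training share
index_te = [i for i in range(len(data_cl)) if i not in index_tr]
data_tr = data_cl.iloc[index_tr, :]
data_te = data_cl.iloc[index_te, :]

# Distinct training IPs and URLs, used to shape the matrices in Code 11-14
IP_tr = list(set(data_tr['realIP']))
url_tr = list(set(data_tr['fullURL']))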
# Code 11-14

import pandas as pd

# Build the model from the training data
UI_matrix_tr = pd.DataFrame(0, index=IP_tr, columns=url_tr)
# Fill in the user-item matrix
for i in data_tr.index:
    UI_matrix_tr.loc[data_tr.loc[i, 'realIP'], data_tr.loc[i, 'fullURL']] = 1
sum(UI_matrix_tr.sum(axis=1))

# Item similarity matrix (Jaccard index; computationally heavy, so this takes a while)
Item_matrix_tr = pd.DataFrame(0, index=url_tr, columns=url_tr)
for i in Item_matrix_tr.index:
    for j in Item_matrix_tr.index:
        a = sum(UI_matrix_tr.loc[:, [i, j]].sum(axis=1) == 2)   # users who visited both i and j
        b = sum(UI_matrix_tr.loc[:, [i, j]].sum(axis=1) != 0)   # users who visited at least one
        Item_matrix_tr.loc[i, j] = a / b

# Zero out the diagonal of the similarity matrix
for i in Item_matrix_tr.index:
    Item_matrix_tr.loc[i, i] = 0

# Evaluate the model on the test data
IP_te = data_te.iloc[:, 0]
url_te = data_te.iloc[:, 1]
IP_te = list(set(IP_te))
url_te = list(set(url_te))

# User-item matrix for the test data
UI_matrix_te = pd.DataFrame(0, index=IP_te, columns=url_te)
for i in data_te.index:
    UI_matrix_te.loc[data_te.loc[i, 'realIP'], data_te.loc[i, 'fullURL']] = 1

# Generate recommendations for the test IPs
Res = pd.DataFrame('NaN', index=data_te.index,
                   columns=['IP', '已浏览网址', '推荐网址', 'T/F'])
Res.loc[:, 'IP'] = list(data_te.iloc[:, 0])
Res.loc[:, '已浏览网址'] = list(data_te.iloc[:, 1])

# Recommend the item most similar to the one already browsed
for i in Res.index:
    if Res.loc[i, '已浏览网址'] in list(Item_matrix_tr.index):
        # idxmax returns the label of the most similar URL
        # (Series.argmax returns a position in recent pandas)
        Res.loc[i, '推荐网址'] = Item_matrix_tr.loc[Res.loc[i, '已浏览网址'], :].idxmax()
        if Res.loc[i, '推荐网址'] in url_te:
            Res.loc[i, 'T/F'] = UI_matrix_te.loc[Res.loc[i, 'IP'],
                                                 Res.loc[i, '推荐网址']] == 1
        else:
            Res.loc[i, 'T/F'] = False

# Save the recommendation results
Res.to_csv('D:/anaconda/python-work/Three/Res.csv', index=False, encoding='utf8')
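
# Code 11-14 ends by saving Res without scoring it. A minimal evaluation
# sketch, assuming the Res frame above; the hit-rate metric is my assumption,
# not the book's own evaluation code.
scored = Res[Res['T/F'] != 'NaN']  # rows where a recommendation was actually made
hit_rate = (scored['T/F'] == True).sum() / len(scored)
print('hit rate: {:.4f}'.format(hit_rate))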