协同过滤user_cf、item_cf
1. 推荐流程:推荐引擎/召回(match)→ 物品候选集 → 过滤(业务需求、规则、不同策略的去重)→ 排序(保证公平性)→ 策略(增加解释性)→ 推荐列表
【代码】
1. 将训练数据存入字典:dict = {user_id1: {item_id1: score, ...}, user_id2: {item_id2: score, ...}, ...}
import pandas as pd

# Load a small sample of the ratings file (tab-separated, no header row).
df = pd.read_csv(
    '../raw_data/u.data',
    sep='\t',   # fields are tab-delimited
    nrows=10,   # only read the first 10 rows for this demo
    names=['usr_id', 'item_id', 'rating', 'timestamp'],  # assign column names
)

# print(df.head())
# print(df.dtypes)
# usr_id       int64
# item_id      int64
# rating       int64
# timestamp    int64
# dtype: object

# Convert to a nested dict:
#   {user_id1: {item_id1: score, ...}, user_id2: {item_id2: score, ...}, ...}
d = dict()

# iterrows() yields (index, row); the index is not needed, `row` is a Series:
#   usr_id             196
#   item_id            242
#   rating               3
#   timestamp    881250949
#   Name: 0, dtype: int64
for _, row in df.iterrows():
    user_id = str(row['usr_id'])
    item_id = str(row['item_id'])
    # The row holds numpy integers; int() stores a builtin int so that the
    # printed / str()-serialised dict shows plain numbers (3, not np.int64(3)).
    rating = int(row['rating'])

    # Create the per-user dict on first sight, then record the rating.
    d.setdefault(user_id, {})[item_id] = rating

# print(d)
# {'196': {'242': 3}, '186': {'302': 3}, '22': {'377': 1}, '244': {'51': 2}, '166': {'346': 1}, '298': {'474': 4}, '115': {'265': 2}, '253': {'465': 5}, '305': {'451': 3}, '6': {'86': 3}}
封装成一个函数
import pandas as pd

# Commonly used path settings (config)
train_data = './mid_data/train.data'
sim_user_user = './mid_data/sim_user_user.txt'


def gene_train_data(nrows, path='../raw_data/u.data'):
    """Load the ratings file into a nested ``{user_id: {item_id: rating}}`` dict.

    Args:
        nrows: Number of rows to read from the top of the file; ``None``
            reads the whole file.
        path: Location of the tab-separated ratings file with the columns
            user_id, item_id, rating, timestamp (no header row). Defaults to
            the path the function has always read from.

    Returns:
        dict mapping each user id (str) to a dict of item id (str) -> rating
        (builtin int). If a user rated the same item more than once, the last
        row wins.
    """
    df = pd.read_csv(
        path,
        sep='\t',
        nrows=nrows,
        names=['user_id', 'item_id', 'rating', 'timestamp'],
    )

    d = dict()
    # Walk the three needed columns in lockstep instead of df.iterrows(),
    # which builds a Series per row and is very slow on the full file.
    for user, item, rating in zip(df['user_id'], df['item_id'], df['rating']):
        # int() guarantees a builtin int rather than a numpy scalar, so that
        # str(d) written to disk can be parsed back (numpy >= 2 renders its
        # scalars as "np.int64(3)" inside containers).
        d.setdefault(str(user), {})[str(item)] = int(rating)
    return d
user_cf
from cf.utils import gene_train_data
import math

# d = {user_id1: {item_id1: score}, user_id2: {item_id2: score}}
# NOTE(review): this loads the complete ratings file on every import/run, but
# the __main__ section below re-reads the training data from `train_data`.
# It is only needed by the commented-out dump a few lines down.
d = gene_train_data(nrows=None)

# Shared path settings
train_data = './mid_data/train.data'
sim_user_user = './mid_data/sim_user_user.txt'
# with open(train_data, 'w') as f:
#     f.write(str(d))


# 1. User-user similarity
# 1.1 Straightforward pairwise computation
def user_normal_simmilarity(d):
    """Compute and print the similarity of every ordered pair of users.

    The score for users u and v is ``2 * |items(u) & items(v)| /
    (|items(u)| + |items(v)|)`` (a Dice-style overlap coefficient). Every pair
    of users is compared, so the cost grows quadratically with the user count.

    Args:
        d: ``{user_id: {item_id: rating}}`` training data.

    Returns:
        ``{u: {v: similarity}}`` for all u != v. (Earlier versions only
        printed the matrix; the return value is an addition.)
    """
    # Build each user's item set once instead of inside the double loop.
    item_sets = {u: set(items) for u, items in d.items()}

    w = dict()
    for u, items_u in item_sets.items():
        row = w[u] = dict()
        for v, items_v in item_sets.items():
            if u == v:
                continue
            total = len(d[u]) + len(d[v])
            # Two users with no items at all share nothing; avoid 0 / 0.
            row[v] = 2 * len(items_u & items_v) / total if total else 0.0

    print(w)
    # Demo output for user '196'; skipped when that user is not in the data
    # (e.g. when only a few rows were loaded).
    if '196' in w:
        # Similarity scores of users similar to 196
        print(w['196'])
    # Total number of users
    print('all user cnt: ', len(w.keys()))
    if '196' in w:
        # Number of users compared against user 196
        print('user_196 sim user cnt:', len(w['196']))
    return w


# 1.2 Faster user-user similarity via an inverted index: user->item => item->user
def user_sim(d):
    """Compute user-user similarity using an item -> users inverted table.

    Only users that share at least one item are paired, which avoids the
    all-pairs comparison of ``user_normal_simmilarity``.

    Args:
        d: ``{user_id: {item_id: rating}}`` training data.

    Returns:
        ``{u: {v: score}}`` where the raw score accumulated over the shared
        items is normalised as ``2 * raw / (len(d[u]) + len(d[v]))``.
    """
    # Build the item -> set(users) inverted table.
    item_user = dict()
    for u, items in d.items():
        for i in items:
            item_user.setdefault(i, set()).add(u)

    # Accumulate the per-pair co-occurrence score.
    C = dict()
    for i, users in item_user.items():
        # Popularity penalty: an item touched by many users contributes less.
        # `users` always holds at least one user, so log(1 + n) > 0.
        penalty = 1 / math.log(1 + len(users))
        for u in users:
            neighbours = C.setdefault(u, dict())
            for v in users:
                if u == v:
                    continue
                # NOTE(review): each shared item adds BOTH the plain count (1)
                # and the popularity-penalised weight. This looks like the
                # unweighted line was left in by accident, but it is kept
                # as-is (two separate additions) so existing similarity files
                # and published results stay reproducible - confirm intent.
                neighbours[v] = neighbours.get(v, 0) + 1
                neighbours[v] += penalty

    # Free the inverted table before normalising.
    del item_user

    for u, sim_users in C.items():
        for v, cuv in sim_users.items():
            # Only values are replaced, so mutating during iteration is safe.
            sim_users[v] = 2 * cuv / float(len(d[u]) + len(d[v]))

    # print(C['196'])
    # print('all use cnt :', len(C.keys()))
    # print('user_196 sim user cnt:', len(C['196']))
    return C


# C = user_sim(d)
# with open(sim_user_user, 'w') as fw:
#     fw.write(str(C))


# 3. Recommendation function
def recommend(user, d, C, k):
    """Score unseen items for `user` from the k most similar users.

    Args:
        user: Id of the user to recommend for.
        d: ``{user_id: {item_id: rating}}`` training data.
        C: ``{u: {v: similarity}}`` user-user similarity matrix.
        k: Number of most similar users to take into account.

    Returns:
        ``{item_id: score}`` where score is the sum over the selected
        neighbours of ``similarity * neighbour's rating``. Items the user has
        already interacted with are excluded. A user without history or
        without neighbours yields an empty dict instead of a KeyError.
    """
    rank = dict()
    interacted_items = d.get(user, {})
    top_neighbours = sorted(
        C.get(user, {}).items(), key=lambda x: x[1], reverse=True
    )[0:k]
    for v, cuv in top_neighbours:
        for i, rating in d[v].items():
            # Items the user has already seen are not recommended again.
            if i in interacted_items:
                continue
            # user-user similarity * the neighbour's rating of the item
            rank[i] = rank.get(i, 0) + cuv * rating
    return rank


# 4. Script entry point
if __name__ == '__main__':
    # NOTE: eval() executes whatever the files contain. That is tolerable only
    # because these are intermediate files written by this project itself;
    # never point these paths at data from an untrusted source.

    # Load the training data
    with open(train_data, 'r') as ft:
        d = eval(ft.read())

    # Load the user-user similarity matrix
    with open(sim_user_user, 'r') as fc:
        C = eval(fc.read())

    user = '196'
    k = 5
    rank = recommend(user, d, C, k)
    print(sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:10])

    # [('50', 6.115647208206193), ('237', 4.459391183529116), ('181', 4.459264873057977), ('258', 3.8173018753781003), ('748', 3.728956343112423), ('15', 3.4293798155867123), ('111', 3.1820718906018364), ('222', 3.172034861690034), ('117', 3.1627131489200453), ('151', 3.070885436402678)]
item_cf
from cf.utils import gene_train_data

# Path of the training-data dump ({user_id: {item_id: rating}} written via str())
train_data = './mid_data/train.data'
# Path of the item-item similarity matrix
sim_item_item = './mid_data/sim_item_item.txt'


def item_sim(d):
    """Compute the item-item similarity matrix (trading memory for time).

    Args:
        d: ``{user_id: {item_id: rating}}`` training data.

    Returns:
        ``{item: {other_item: score}}`` where score is
        ``2 * co_count / (N[item] + N[other_item])``; ``co_count`` is the
        number of users that have both items and ``N[x]`` the number of users
        that have item x. Items that never co-occur with another item map to
        an empty dict.
    """
    C = dict()  # item -> {co-occurring item -> number of shared users}
    N = dict()  # item -> number of users that have it

    for u, items in d.items():
        for i in items:
            N[i] = N.get(i, 0) + 1
            related = C.setdefault(i, dict())
            for j in items:
                if i == j:
                    continue
                # One more user has both i and j.
                related[j] = related.get(j, 0) + 1

    # Normalise the co-occurrence counts by the items' popularity.
    for i, related_items in C.items():
        for j, cij in related_items.items():
            # Only values are replaced, so mutating during iteration is safe.
            related_items[j] = 2 * cij / (N[i] + N[j] * 1.0)

    return C


# 2. Recommendation function
def recommendation(d, user_id, C, k):
    """Score unseen items for a user from the items they already rated.

    Args:
        d: ``{user_id: {item_id: rating}}`` training data.
        user_id: Id of the user to recommend for.
        C: ``{item: {other_item: similarity}}`` item-item similarity matrix.
        k: For each rated item, how many of its most similar items to use.

    Returns:
        ``{item_id: score}`` where score is the sum of
        ``similarity * user's rating`` over the rated items whose top-k
        neighbours include that item. Already-rated items are excluded. An
        unknown user, or a rated item missing from ``C``, contributes nothing
        instead of raising KeyError.
    """
    rank = dict()  # recommendation scores
    Ru = d.get(user_id, {})  # the user's rating history
    for i, rating in Ru.items():
        top_similar = sorted(
            C.get(i, {}).items(), key=lambda x: x[1], reverse=True
        )[0:k]
        for j, sim_score in top_similar:
            # Skip items the user has already rated.
            if j in Ru:
                continue
            rank[j] = rank.get(j, 0) + sim_score * rating
    return rank


# 3. Manual check
if __name__ == '__main__':
    # NOTE: eval() executes whatever the files contain. That is tolerable only
    # because these are intermediate files written by this project itself;
    # never point these paths at data from an untrusted source.

    # Fixed: the original passed the imported function `gene_train_data` to
    # open(), which raises TypeError; the training-data path is what is meant.
    with open(train_data, 'r') as ft:
        d = eval(ft.read())

    # Generate the similarity matrix and store it under mid_data
    # C = item_sim(d)
    # with open(sim_item_item, 'w') as wf:
    #     wf.write(str(C))

    with open(sim_item_item, 'r') as rf:
        C = eval(rf.read())  # load the similarity matrix

    # Use the 5 most similar items of every item the user rated ...
    rank = recommendation(d, user_id='196', C=C, k=5)
    # ... and show the 10 highest-scoring candidates, best first.
    print(sorted(rank.items(), key=lambda x:x[1], reverse=True)[:10])
浙公网安备 33010602011771号