简单的user-based协同过滤算法示例代码

# Sample rating data set: user name -> {movie title: rating}.
users = {
    "小明": {
        "中国合伙人": 5.0,
        "太平轮": 3.0,
        "荒野猎人": 4.5,
        "老炮儿": 5.0,
        "我的少女时代": 3.0,
        "肖洛特烦恼": 4.5,
        "火星救援": 5.0,
    },
    "小红": {
        "小时代4": 4.0,
        "荒野猎人": 3.0,
        "我的少女时代": 5.0,
        "肖洛特烦恼": 5.0,
        "火星救援": 3.0,
        "后会无期": 3.0,
    },
    "小阳": {
        "小时代4": 2.0,
        "中国合伙人": 5.0,
        "我的少女时代": 3.0,
        "老炮儿": 5.0,
        "肖洛特烦恼": 4.5,
        "速度与激情7": 5.0,
    },
    "小四": {
        "小时代4": 5.0,
        "中国合伙人": 3.0,
        "我的少女时代": 4.0,
        "匆匆那年": 4.0,
        "速度与激情7": 3.5,
        "火星救援": 3.5,
        "后会无期": 4.5,
    },
    "六爷": {
        "小时代4": 2.0,
        "中国合伙人": 4.0,
        "荒野猎人": 4.5,
        "老炮儿": 5.0,
        "我的少女时代": 2.0,
    },
    "小李": {
        "荒野猎人": 5.0,
        "盗梦空间": 5.0,
        "我的少女时代": 3.0,
        "速度与激情7": 5.0,
        "蚁人": 4.5,
        "老炮儿": 4.0,
        "后会无期": 3.5,
    },
    "隔壁老王": {
        "荒野猎人": 5.0,
        "中国合伙人": 4.0,
        "我的少女时代": 1.0,
        "Phoenix": 5.0,
        "甄嬛传": 4.0,
        "The Strokes": 5.0,
    },
    "邻村小芳": {
        "小时代4": 4.0,
        "我的少女时代": 4.5,
        "匆匆那年": 4.5,
        "甄嬛传": 2.5,
        "The Strokes": 3.0,
    },
}
# Distance functions between two rating dicts.
# A more efficient approach is to vectorize the ratings and use the distance
# functions defined in scipy.

from math import sqrt
 5 def euclidean_dis(rating1, rating2):
 6     """计算2个打分序列间的欧式距离. 输入的rating1和rating2都是打分dict
 7        格式为{'小时代4': 1.0, '疯狂动物城': 5.0}"""
 8     distance = 0
 9     commonRatings = False 
10     for key in rating1:
11         if key in rating2:
12             distance += (rating1[key] - rating2[key])^2
13             commonRatings = True
14     #两个打分序列之间有公共打分电影
15     if commonRatings:
16         return distance
17     #无公共打分电影
18     else:
19         return -1
20 
21 
def manhattan_dis(rating1, rating2):
    """Return the Manhattan (L1) distance between two rating dicts.

    Only items that appear in both dicts contribute to the distance.

    Args:
        rating1: Mapping of item name to numeric rating,
            e.g. {'小时代4': 1.0, '疯狂动物城': 5.0}.
        rating2: Mapping of the same form.

    Returns:
        The sum of absolute rating differences over the shared items,
        or -1 when the two dicts share no item.
    """
    shared = [key for key in rating1 if key in rating2]
    if not shared:
        # Sentinel used by all distance helpers: nothing to compare.
        return -1
    return sum(abs(rating1[key] - rating2[key]) for key in shared)

def cos_dis(rating1, rating2):
    """Return the cosine distance (1 - cosine similarity) of two rating dicts.

    The dot product is taken over the items rated in both dicts, while each
    vector norm is taken over all items of the respective dict.

    Args:
        rating1: Mapping of item name to numeric rating,
            e.g. {'小时代4': 1.0, '疯狂动物城': 5.0}.
        rating2: Mapping of the same form.

    Returns:
        1 minus the cosine similarity, or -1 when the two dicts share no
        item. If either dict has only zero ratings the similarity is
        undefined; it is treated as 0, so 1.0 is returned.
    """
    shared = [key for key in rating1 if key in rating2]
    if not shared:
        # Sentinel used by all distance helpers: nothing to compare.
        return -1

    # `**` is exponentiation; `^` is bitwise XOR and raises TypeError on floats.
    norm_sq_1 = sum(score ** 2 for score in rating1.values())
    norm_sq_2 = sum(score ** 2 for score in rating2.values())
    denominator = sqrt(norm_sq_1 * norm_sq_2)
    if denominator == 0:
        # Avoid ZeroDivisionError for an all-zero rating vector.
        return 1.0

    dot_product = sum(rating1[key] * rating2[key] for key in shared)
    return 1 - dot_product / denominator

def pearson_dis(rating1, rating2):
    """Return the Pearson distance (1 - Pearson correlation) of two rating dicts.

    The correlation is computed over the items rated in both dicts. Returning
    a distance (smaller means more similar) keeps this function consistent
    with the other ``*_dis`` helpers, whose results are sorted in ascending
    order to find the nearest neighbour.

    Args:
        rating1: Mapping of item name to numeric rating,
            e.g. {'小时代4': 1.0, '疯狂动物城': 5.0}.
        rating2: Mapping of the same form.

    Returns:
        A value in [0, 2]: 0 for perfectly correlated ratings, 2 for perfectly
        anti-correlated ratings. Returns 1.0 (correlation 0) when either side
        has no variance over the shared items, and -1 when the two dicts
        share no item.
    """
    shared = [key for key in rating1 if key in rating2]
    n = len(shared)
    if n == 0:
        # Sentinel used by all distance helpers; also avoids dividing by n == 0.
        return -1

    xs = [rating1[key] for key in shared]
    ys = [rating2[key] for key in shared]
    sum_x = sum(xs)
    sum_y = sum(ys)
    sum_xy = sum(x * y for x, y in zip(xs, ys))

    var_x = sum(x * x for x in xs) - sum_x ** 2 / n
    var_y = sum(y * y for y in ys) - sum_y ** 2 / n
    # Rounding can push a zero variance slightly below 0, which would make
    # sqrt() raise; treat any non-positive variance as "no correlation".
    if var_x <= 0 or var_y <= 0:
        return 1.0

    correlation = (sum_xy - sum_x * sum_y / n) / sqrt(var_x * var_y)
    # Clamp rounding noise so the distance stays inside [0, 2].
    correlation = max(-1.0, min(1.0, correlation))
    return 1 - correlation
 1 #查找最近邻
 2 def computeNearestNeighbor(username, users):
 3     """在给定username的情况下,计算其他用户和它的距离并排序"""
 4     distances = []
 5     for user in users:
 6         if user != username:
 7             #distance = manhattan_dis(users[user], users[username])
 8             distance = pearson_dis(users[user], users[username])
 9             distances.append((distance, user))
10     # 根据距离排序,距离越近,排得越靠前
11     distances.sort()
12     return distances
13 
# Recommendation
def recommend(username, users):
    """Print and return movie recommendations for ``username``.

    The recommendations are the movies that the nearest neighbour (as ranked
    by ``computeNearestNeighbor``) has rated and ``username`` has not, ordered
    by the neighbour's rating from highest to lowest.

    Args:
        username: Key in ``users`` of the user to recommend for.
        users: Mapping of user name to that user's rating dict.

    Returns:
        The list of ``(movie, rating)`` tuples that was printed; an empty
        list when no neighbour is available.
    """
    neighbors = computeNearestNeighbor(username, users)
    if not neighbors:
        # Nobody to borrow ratings from; indexing [0] would raise IndexError.
        return []
    nearest = neighbors[0][1]

    # Movies the nearest neighbour rated that this user has not rated yet.
    neighbor_ratings = users[nearest]
    user_ratings = users[username]
    recommendations = [
        (movie, rating)
        for movie, rating in neighbor_ratings.items()
        if movie not in user_ratings
    ]
    results = sorted(recommendations, key=lambda pair: pair[1], reverse=True)
    for movie, rating in results:
        print(movie, rating)
    return results
# Quick test

recommend('小阳', users)
# Output recorded in the original post:
#     后会无期 4.5
#     匆匆那年 4.0
#     火星救援 3.5
 1 #简单的张量分解进行打分和推荐
 2 #要用到numpy模块
 3 import numpy
 4 
 5 #手写矩阵分解
 6 #现在有很多很方便对高维矩阵做分解的package,比如libmf, svdfeature等
 7 def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
 8     Q = Q.T
 9     for step in xrange(steps):
10         for i in xrange(len(R)):
11             for j in xrange(len(R[i])):
12                 if R[i][j] > 0:
13                     eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
14                     for k in xrange(K):
15                         P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
16                         Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
17         eR = numpy.dot(P,Q)
18         e = 0
19         for i in xrange(len(R)):
20             for j in xrange(len(R[i])):
21                 if R[i][j] > 0:
22                     e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
23                     for k in xrange(K):
24                         e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
25         if e < 0.001:
26             break
27     return P, Q.T
 1 #读取user数据并用张量分解进行打分
 2 
 3 R = [
 4      [5,3,0,1],
 5      [4,0,3,1],
 6      [1,1,0,5],
 7      [1,0,0,4],
 8      [0,1,5,4],
 9     ]
10 
11 R = numpy.array(R)
12 
13 N = len(R)
14 M = len(R[0])
15 K = 2
16 
17 P = numpy.random.rand(N,K)
18 Q = numpy.random.rand(M,K)
19 
20 nP, nQ = matrix_factorization(R, P, Q, K)
21 nR = numpy.dot(nP, nQ.T)
nP
array([[ 0.38345373,  2.181972  ],
       [ 0.32334816,  1.56283276],
       [ 1.99170613,  0.16400981],
       [ 1.59666903,  0.14124969],
       [ 1.64308192,  1.07125805]])
nQ
array([[ 0.38946426,  2.29198167],
       [ 0.19720283,  1.18916254],
       [ 1.71589715,  1.76060186],
       [ 2.48314488,  0.03019937]])
nR
array([[ 5.15038133,  2.67033753,  4.49955112,  1.01806534],
       [ 3.70791658,  1.92222735,  3.30635845,  0.85011689],
       [ 1.15160585,  0.58780442,  3.70631887,  4.95064787],
       [ 0.94558722,  0.48283649,  2.98840431,  3.96902618],
       [ 3.0952255 ,  1.59792036,  4.70541851,  4.11236178]])
R
array([[5, 3, 0, 1],
       [4, 0, 3, 1],
       [1, 1, 0, 5],
       [1, 0, 0, 4],
       [0, 1, 5, 4]])


posted on 2018-03-06 21:59  NothingLZ  阅读(431)  评论(0)  编辑  收藏  举报

导航