# 协同过滤 Collaborative filtering 《推荐系统实践》 第二章

### 利用用户行为数据

1显性反馈：包括用户明确表示对物品喜好的行为（数据量小）

2隐性反馈：网页浏览等（数据量大）

#### 实验数据说明

U.data数据包含4列，分别是  UserID::MovieID::Rating::Time  ,本实验关心的是topN推荐，所以只关心用户是否看了某个电影，而不关心用户对电影的评分和看电影的时间。所以取数据前两列。

def SplitData(data, M=8, k=3, seed=1):
    """Randomly split (user, item) records into training and test sets.

    Each record draws ``random.randint(0, M)``; a record whose draw equals
    ``k`` goes to the test set, every other record to the training set.
    ``randint`` includes both endpoints, so about 1/(M + 1) of the records
    end up in the test set.

    Args:
        data: iterable of ``(user, item)`` pairs.
        M: inclusive upper bound of the random draw.
        k: draw value that sends a record to the test set.
        seed: value passed to ``random.seed`` so the split is reproducible
            (this reseeds the module-level generator).

    Returns:
        ``(train, test)``: two dicts mapping a user to the set of items
        assigned to that split.
    """
    test = {}
    train = {}
    random.seed(seed)

    for user, item in data:
        # Choose the destination split, then store the item in the user's
        # set (the store was missing in the mangled source, leaving every
        # set empty).
        target = test if random.randint(0, M) == k else train
        target.setdefault(user, set()).add(item)
    return train, test

#### 评测指标：

def Recall(train, test, N):
    """Compute the recall of UserCF top-N recommendations.

    Recall is the number of recommended items that appear in the users'
    test sets divided by the total number of test items of the evaluated
    users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Recall as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = UserSimilarity2(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = Recommend(user, train, W, N)
        except KeyError:
            # The recommender has no similarity data for this user; skip
            # the user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += len(te_user_item)

    if alls == 0:
        return 0.0
    return float(hit) / alls

#ItemCF_recall
def ItemRecall(train, test, N):
    """Compute the recall of ItemCF top-N recommendations.

    Recall is the number of recommended items that appear in the users'
    test sets divided by the total number of test items of the evaluated
    users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Recall as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = ItemSimilarity(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = ItemRecommendation(user, train, W, N)
        except KeyError:
            # One of the user's items has no similarity data; skip the
            # user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += len(te_user_item)

    if alls == 0:
        return 0.0
    return float(hit) / alls

def Precision(train, test, N):
    """Compute the precision of UserCF top-N recommendations.

    Precision is the number of recommended items that appear in the users'
    test sets divided by ``N`` times the number of evaluated users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Precision as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = UserSimilarity2(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = Recommend(user, train, W, N)
        except KeyError:
            # The recommender has no similarity data for this user; skip
            # the user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += N

    if alls == 0:
        return 0.0
    return float(hit) / alls

def ItemPrecision(train, test, N):
    """Compute the precision of ItemCF top-N recommendations.

    Precision is the number of recommended items that appear in the users'
    test sets divided by ``N`` times the number of evaluated users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Precision as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = ItemSimilarity(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = ItemRecommendation(user, train, W, N)
        except KeyError:
            # One of the user's items has no similarity data; skip the
            # user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += N

    if alls == 0:
        return 0.0
    return float(hit) / alls

def Coverage(train, N):
    """Compute the catalogue coverage of UserCF top-N recommendations.

    Coverage is the number of distinct items recommended to at least one
    user divided by the number of distinct items in the training data.

    Args:
        train: dict mapping user -> set of items (training split).
        N: length of each recommendation list.

    Returns:
        Coverage as a float; 0.0 when the training data has no items.
    """
    recommend_items = set()
    all_items = set()
    W = UserSimilarity2(train)

    for user, items in train.items():
        all_items.update(items)
        try:
            rank = Recommend(user, train, W, N)
        except KeyError:
            # No similarity data for this user: nothing is recommended.
            continue
        # Collect every recommended item; iterating rank[0] would walk the
        # first (item, score) tuple and fail on an empty list.
        recommend_items.update(item for item, _score in rank)

    if not all_items:
        return 0.0
    return len(recommend_items) / (len(all_items) * 1.0)

def ItemCoverage(train, N):
    """Compute the catalogue coverage of ItemCF top-N recommendations.

    Coverage is the number of distinct items recommended to at least one
    user divided by the number of distinct items in the training data.

    Args:
        train: dict mapping user -> set of items (training split).
        N: length of each recommendation list.

    Returns:
        Coverage as a float; 0.0 when the training data has no items.
    """
    recommend_items = set()
    all_items = set()
    W = ItemSimilarity(train)

    for user, items in train.items():
        all_items.update(items)
        try:
            rank = ItemRecommendation(user, train, W, N)
        except KeyError:
            # An item of this user has no similarity data: nothing is
            # recommended for the user.
            continue
        # Collect every recommended item; iterating rank[0] would walk the
        # first (item, score) tuple and fail on an empty list.
        recommend_items.update(item for item, _score in rank)

    if not all_items:
        return 0.0
    return len(recommend_items) / (len(all_items) * 1.0)

#### 基于用户的协同过滤 User_CF (Collaborative filtering)：

1）找到和目标用户兴趣相似的用户集合

2）找到这个集合中的用户喜欢的，且目标用户没有听说过的物品推荐给用户

##### 基于物品的协同过滤 Item_CF

1）计算物品之间的相似度

2）根据物品的相似度和用户的历史行为给用户生成推荐列表

若 N(i) 表示喜欢物品 i 的用户集合，则物品 i 和 j 的相似度可以用下面的公式表示：

$$w_{ij} = \frac{|N(i) \cap N(j)|}{\sqrt{|N(i)|\,|N(j)|}}$$

USER_CF,ITEM_CF计算物品i,j相似度的代码如下：

def UserSimilarity2(train, flag=1):
    """Build the user-user similarity table used by UserCF.

    The training data is first inverted into an item -> users index so
    that only user pairs sharing at least one item are visited.

    Args:
        train: dict mapping user -> set of items.
        flag: 0 counts every shared item as 1; 1 (the default) weights a
            shared item by ``1 / log(1 + number of users of that item)``
            so that popular items contribute less. Any other value
            produces an empty table.

    Returns:
        dict mapping user ``u`` to a set of ``(v, w_uv)`` tuples, one per
        user ``v`` sharing at least one item with ``u``. Users without
        such a neighbour have no key.
    """
    # Inverted index: item -> set of users who interacted with it.
    item_users = dict()
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    C = dict()  # C[(u, v)]: (weighted) count of items shared by u and v
    N = dict()  # N[u]: number of items user u interacted with

    for users in item_users.values():
        if flag == 0:
            weight = 1
        elif flag == 1:
            weight = 1 / log(1 + len(users))
        else:
            weight = None  # unknown flag: no co-occurrence is recorded
        for u in users:
            N[u] = N.get(u, 0) + 1
            if weight is None:
                continue
            for v in users:
                if u != v:
                    C[(u, v)] = C.get((u, v), 0) + weight

    W = dict()
    for (u, v), cuv in C.items():
        # Store the neighbour together with its weight as a (v, w_uv)
        # tuple. NOTE(review): this statement was lost from the mangled
        # source; it is restored as the cosine normalisation
        # cuv / sqrt(N[u] * N[v]) - confirm against the original script.
        W.setdefault(u, set()).add((v, cuv / sqrt(N[u] * N[v])))

    return W

def ItemSimilarity(train):
    """Build the item-item similarity table used by ItemCF.

    Args:
        train: dict mapping user -> set of items.

    Returns:
        dict mapping item ``i`` to a set of ``(j, w_ij)`` tuples, one per
        item ``j`` that shares at least one user with ``i``. Items that
        never co-occur with another item have no key.
    """
    C = dict()  # C[(i, j)]: number of users who interacted with both i and j
    N = dict()  # N[i]: number of users who interacted with item i

    for items in train.values():
        for i in items:
            N[i] = N.get(i, 0) + 1
            for j in items:
                if i != j:
                    C[(i, j)] = C.get((i, j), 0) + 1

    # Turn the co-occurrence counts into the final similarities.
    W = dict()
    for (i, j), val in C.items():
        # Store the neighbour together with its weight as a (j, w_ij)
        # tuple. NOTE(review): this statement was lost from the mangled
        # source; it is restored as the cosine normalisation
        # val / sqrt(N[i] * N[j]) - confirm against the original script.
        W.setdefault(i, set()).add((j, val / sqrt(N[i] * N[j])))

    return W

userCF ,Item CF 推荐topN代码如下：

def Recommend(user, train, W, N, K=20):
    """Return the top-N UserCF recommendations for ``user``.

    The K most similar users are taken from ``W[user]``; every item they
    interacted with that ``user`` has not is scored with the sum of the
    similarities of the neighbours who interacted with it.

    Args:
        user: the user to recommend for.
        train: dict mapping user -> set of items.
        W: dict mapping user -> iterable of ``(neighbour, weight)`` pairs.
        N: maximum number of recommendations to return.
        K: number of most similar neighbours to consider.

    Returns:
        List of at most N ``(item, score)`` tuples, highest score first.

    Raises:
        KeyError: if ``user`` is missing from ``train`` or ``W``.
    """
    interacted_items = train[user]
    neighbours = sorted(W[user], key=lambda pair: pair[1], reverse=True)[:K]

    scores = dict()
    for v, wuv in neighbours:
        for i in train[v]:
            # Only items the target user has not interacted with yet.
            if i in interacted_items:
                continue
            scores[i] = scores.get(i, 0) + wuv

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]

def ItemRecommendation(user, train, W, N, K=10):
    """Return the top-N ItemCF recommendations for ``user``.

    For every item the user interacted with, its K most similar items are
    taken from ``W``; those the user has not interacted with are scored
    with the sum of the similarities that led to them.

    Args:
        user: the user to recommend for.
        train: dict mapping user -> set of items.
        W: dict mapping item -> iterable of ``(other_item, weight)`` pairs.
        N: maximum number of recommendations to return.
        K: number of most similar items considered per item of the user.

    Returns:
        List of at most N ``(item, score)`` tuples, highest score first.

    Raises:
        KeyError: if ``user`` is missing from ``train`` or one of the
            user's items is missing from ``W``.
    """
    user_items = train[user]

    scores = dict()
    for i in user_items:
        # The top-K cut is applied before items already seen are dropped.
        neighbours = sorted(W[i], key=lambda pair: pair[1], reverse=True)[:K]
        for j, wij in neighbours:
            if j in user_items:
                continue
            scores[j] = scores.get(j, 0) + wij

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]

ItemCoverage:  0.601796407186
ItemRecall: 0.172728085068
ItemPrecision: 0.208972972973
Recall  0.165132695916
Precision  0.199783783784
Coverage 0.698203592814

# -*- coding: utf-8 -*-
'''

Created on 2014年4月16日

'''
import random
import pdb
from math import *
import traceback

def SplitData(data, M=8, k=3, seed=1):
    """Randomly split (user, item) records into training and test sets.

    Each record draws ``random.randint(0, M)``; a record whose draw equals
    ``k`` goes to the test set, every other record to the training set.
    ``randint`` includes both endpoints, so about 1/(M + 1) of the records
    end up in the test set.

    Args:
        data: iterable of ``(user, item)`` pairs.
        M: inclusive upper bound of the random draw.
        k: draw value that sends a record to the test set.
        seed: value passed to ``random.seed`` so the split is reproducible
            (this reseeds the module-level generator).

    Returns:
        ``(train, test)``: two dicts mapping a user to the set of items
        assigned to that split.
    """
    test = {}
    train = {}
    random.seed(seed)

    for user, item in data:
        # Choose the destination split, then store the item in the user's
        # set (the store was missing in the mangled source, leaving every
        # set empty).
        target = test if random.randint(0, M) == k else train
        target.setdefault(user, set()).add(item)
    return train, test

#USER_CF_recall

def Recall(train, test, N):
    """Compute the recall of UserCF top-N recommendations.

    Recall is the number of recommended items that appear in the users'
    test sets divided by the total number of test items of the evaluated
    users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Recall as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = UserSimilarity2(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = Recommend(user, train, W, N)
        except KeyError:
            # The recommender has no similarity data for this user; skip
            # the user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += len(te_user_item)

    if alls == 0:
        return 0.0
    return float(hit) / alls

#ItemCF_recall
def ItemRecall(train, test, N):
    """Compute the recall of ItemCF top-N recommendations.

    Recall is the number of recommended items that appear in the users'
    test sets divided by the total number of test items of the evaluated
    users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Recall as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = ItemSimilarity(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = ItemRecommendation(user, train, W, N)
        except KeyError:
            # One of the user's items has no similarity data; skip the
            # user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += len(te_user_item)

    if alls == 0:
        return 0.0
    return float(hit) / alls

# pdb.set_trace()

def Precision(train, test, N):
    """Compute the precision of UserCF top-N recommendations.

    Precision is the number of recommended items that appear in the users'
    test sets divided by ``N`` times the number of evaluated users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Precision as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = UserSimilarity2(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = Recommend(user, train, W, N)
        except KeyError:
            # The recommender has no similarity data for this user; skip
            # the user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += N

    if alls == 0:
        return 0.0
    return float(hit) / alls

def ItemPrecision(train, test, N):
    """Compute the precision of ItemCF top-N recommendations.

    Precision is the number of recommended items that appear in the users'
    test sets divided by ``N`` times the number of evaluated users.

    Args:
        train: dict mapping user -> set of items (training split).
        test: dict mapping user -> set of items (test split).
        N: length of each recommendation list.

    Returns:
        Precision as a float; 0.0 when no user could be evaluated.
    """
    hit = 0
    alls = 0
    W = ItemSimilarity(train)

    for user in train:
        # A user with no held-out items cannot be evaluated.
        if user not in test:
            continue
        te_user_item = test[user]
        try:
            recomRank = ItemRecommendation(user, train, W, N)
        except KeyError:
            # One of the user's items has no similarity data; skip the
            # user, as the previous catch-all handler did.
            continue
        hit += sum(1 for recom_item, _w in recomRank
                   if recom_item in te_user_item)
        alls += N

    if alls == 0:
        return 0.0
    return float(hit) / alls

#计算覆盖率
#USER_CF
def Coverage(train, N):
    """Compute the catalogue coverage of UserCF top-N recommendations.

    Coverage is the number of distinct items recommended to at least one
    user divided by the number of distinct items in the training data.

    Args:
        train: dict mapping user -> set of items (training split).
        N: length of each recommendation list.

    Returns:
        Coverage as a float; 0.0 when the training data has no items.
    """
    recommend_items = set()
    all_items = set()
    W = UserSimilarity2(train)

    for user, items in train.items():
        all_items.update(items)
        try:
            rank = Recommend(user, train, W, N)
        except KeyError:
            # No similarity data for this user: nothing is recommended.
            continue
        # Collect every recommended item; iterating rank[0] would walk the
        # first (item, score) tuple and fail on an empty list.
        recommend_items.update(item for item, _score in rank)

    if not all_items:
        return 0.0
    return len(recommend_items) / (len(all_items) * 1.0)

def ItemCoverage(train, N):
    """Compute the catalogue coverage of ItemCF top-N recommendations.

    Coverage is the number of distinct items recommended to at least one
    user divided by the number of distinct items in the training data.

    Args:
        train: dict mapping user -> set of items (training split).
        N: length of each recommendation list.

    Returns:
        Coverage as a float; 0.0 when the training data has no items.
    """
    recommend_items = set()
    all_items = set()
    W = ItemSimilarity(train)

    for user, items in train.items():
        all_items.update(items)
        try:
            rank = ItemRecommendation(user, train, W, N)
        except KeyError:
            # An item of this user has no similarity data: nothing is
            # recommended for the user.
            continue
        # Collect every recommended item; iterating rank[0] would walk the
        # first (item, score) tuple and fail on an empty list.
        recommend_items.update(item for item, _score in rank)

    if not all_items:
        return 0.0
    return len(recommend_items) / (len(all_items) * 1.0)

def UserSimilarity2(train, flag=1):
    """Build the user-user similarity table used by UserCF.

    The training data is first inverted into an item -> users index so
    that only user pairs sharing at least one item are visited.

    Args:
        train: dict mapping user -> set of items.
        flag: 0 counts every shared item as 1; 1 (the default) weights a
            shared item by ``1 / log(1 + number of users of that item)``
            so that popular items contribute less. Any other value
            produces an empty table.

    Returns:
        dict mapping user ``u`` to a set of ``(v, w_uv)`` tuples, one per
        user ``v`` sharing at least one item with ``u``. Users without
        such a neighbour have no key.
    """
    # Inverted index: item -> set of users who interacted with it.
    item_users = dict()
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    C = dict()  # C[(u, v)]: (weighted) count of items shared by u and v
    N = dict()  # N[u]: number of items user u interacted with

    for users in item_users.values():
        if flag == 0:
            weight = 1
        elif flag == 1:
            weight = 1 / log(1 + len(users))
        else:
            weight = None  # unknown flag: no co-occurrence is recorded
        for u in users:
            N[u] = N.get(u, 0) + 1
            if weight is None:
                continue
            for v in users:
                if u != v:
                    C[(u, v)] = C.get((u, v), 0) + weight

    W = dict()
    for (u, v), cuv in C.items():
        # Store the neighbour together with its weight as a (v, w_uv)
        # tuple. NOTE(review): this statement was lost from the mangled
        # source; it is restored as the cosine normalisation
        # cuv / sqrt(N[u] * N[v]) - confirm against the original script.
        W.setdefault(u, set()).add((v, cuv / sqrt(N[u] * N[v])))

    return W

def ItemSimilarity(train):
    """Build the item-item similarity table used by ItemCF.

    Args:
        train: dict mapping user -> set of items.

    Returns:
        dict mapping item ``i`` to a set of ``(j, w_ij)`` tuples, one per
        item ``j`` that shares at least one user with ``i``. Items that
        never co-occur with another item have no key.
    """
    C = dict()  # C[(i, j)]: number of users who interacted with both i and j
    N = dict()  # N[i]: number of users who interacted with item i

    for items in train.values():
        for i in items:
            N[i] = N.get(i, 0) + 1
            for j in items:
                if i != j:
                    C[(i, j)] = C.get((i, j), 0) + 1

    # Turn the co-occurrence counts into the final similarities.
    W = dict()
    for (i, j), val in C.items():
        # Store the neighbour together with its weight as a (j, w_ij)
        # tuple. NOTE(review): this statement was lost from the mangled
        # source; it is restored as the cosine normalisation
        # val / sqrt(N[i] * N[j]) - confirm against the original script.
        W.setdefault(i, set()).add((j, val / sqrt(N[i] * N[j])))

    return W

#给出要推荐的物品item,(并且存储于rank中)
#rank是一个字典，rank[item]=推荐力度
#返回前N个推荐
def Recommend(user, train, W, N, K=10):
    """Return the top-N UserCF recommendations for ``user``.

    The K most similar users are taken from ``W[user]``; every item they
    interacted with that ``user`` has not is scored with the sum of the
    similarities of the neighbours who interacted with it.

    Args:
        user: the user to recommend for.
        train: dict mapping user -> set of items.
        W: dict mapping user -> iterable of ``(neighbour, weight)`` pairs.
        N: maximum number of recommendations to return.
        K: number of most similar neighbours to consider.

    Returns:
        List of at most N ``(item, score)`` tuples, highest score first.

    Raises:
        KeyError: if ``user`` is missing from ``train`` or ``W``.
    """
    interacted_items = train[user]
    neighbours = sorted(W[user], key=lambda pair: pair[1], reverse=True)[:K]

    scores = dict()
    for v, wuv in neighbours:
        for i in train[v]:
            # Only items the target user has not interacted with yet.
            if i in interacted_items:
                continue
            scores[i] = scores.get(i, 0) + wuv

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]

def ItemRecommendation(user, train, W, N, K=10):
    """Return the top-N ItemCF recommendations for ``user``.

    For every item the user interacted with, its K most similar items are
    taken from ``W``; those the user has not interacted with are scored
    with the sum of the similarities that led to them.

    Args:
        user: the user to recommend for.
        train: dict mapping user -> set of items.
        W: dict mapping item -> iterable of ``(other_item, weight)`` pairs.
        N: maximum number of recommendations to return.
        K: number of most similar items considered per item of the user.

    Returns:
        List of at most N ``(item, score)`` tuples, highest score first.

    Raises:
        KeyError: if ``user`` is missing from ``train`` or one of the
            user's items is missing from ``W``.
    """
    user_items = train[user]

    scores = dict()
    for i in user_items:
        # The top-K cut is applied before items already seen are dropped.
        neighbours = sorted(W[i], key=lambda pair: pair[1], reverse=True)[:K]
        for j, wij in neighbours:
            if j in user_items:
                continue
            scores[j] = scores.get(j, 0) + wij

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]

# Load the (user, item) pairs: the first two tab-separated columns of each
# non-blank line of u.data. The context manager closes the file afterwards.
with open('u.data') as f:
    data = [line.split('\t')[:2] for line in f if line.strip()]

train, test = SplitData(data)

# Parenthesised single-argument print gives the same output on Python 2
# and Python 3.
print('ItemCoverage:  %s' % ItemCoverage(train, 10))
print('ItemRecall: %s' % ItemRecall(train, test, 10))
print('ItemPrecision: %s' % ItemPrecision(train, test, 10))

print('Recall  %s' % Recall(train, test, 10))
print('Precision  %s' % Precision(train, test, 10))
print('Coverage %s' % Coverage(train, 10))

## 总结：

### 参考书目：推荐系统实践

posted @ 2014-04-16 16:29  joey周琦  阅读(2031)  评论(0)  编辑  收藏