CF算法

generate_train_data.py

import pandas as pd
import user_cf
import operator
import item_cf

data_path = 'G:\\Bigdata_object\\u.data'

# The ratings file is tab-separated and has no header row, so the column
# names are supplied explicitly.
udata = pd.read_csv(
    data_path,
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Build the nested training mapping {user_id: {item_id: rating}}.
# Ids are stored as strings, which is why the lookups below use '1'.
train = dict()
for _, row in udata.iterrows():
    user_id, item_id = str(row['user_id']), str(row['item_id'])
    rating = row['rating']
    train.setdefault(user_id, dict())[item_id] = rating


def _top_scored(scores, limit=20):
    """Return the `limit` highest-scoring (key, score) pairs, best first."""
    return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[:limit]


# ---- user-based CF demo: recommendations for user '1' from 10 neighbours ----
W = user_cf.user_similarity(train)
rec_item_list = user_cf.recommend('1', train, W, 10)
print(_top_scored(rec_item_list))


# ---- item-based CF demo: recommendations for user '1' from 10 neighbours ----
W2 = item_cf.item_similarity(train)
item_list = item_cf.recommend(train, '1', W2, 10)
print(_top_scored(item_list))

item_cf.py

import math
import operator


def item_similarity(train):
    """Compute item-to-item cosine similarity from co-occurrence counts.

    Args:
        train: Nested mapping ``{user: {item: rating}}``. Only the item keys
            are used; the rating values are ignored.

    Returns:
        Nested dict ``{item: {other_item: similarity}}`` where similarity is
        ``co_count / sqrt(users(item) * users(other_item))``. Every item in
        ``train`` gets an entry, which is empty when the item never co-occurs
        with another one.
    """
    co_counts = {}    # numerator: users who interacted with both items
    user_counts = {}  # denominator parts: users per item
    for rated in train.values():
        for first in rated:
            user_counts[first] = user_counts.get(first, 0) + 1
            row = co_counts.setdefault(first, {})
            for second in rated:
                if second != first:
                    row[second] = row.get(second, 0) + 1

    # Normalise each co-occurrence count by the geometric mean of the two
    # items' user counts.
    return {
        first: {
            second: shared / math.sqrt(user_counts[first] * user_counts[second])
            for second, shared in row.items()
        }
        for first, row in co_counts.items()
    }


def recommend(train, user, w, k):
    """Score unseen items for ``user`` with item-based collaborative filtering.

    For every item the user has rated, the ``k`` most similar items according
    to ``w`` are looked up. Each of those that the user has not rated yet
    accumulates ``rating * similarity``.

    Args:
        train: Nested mapping ``{user: {item: rating}}``.
        user: Key of the target user in ``train``.
        w: Nested mapping ``{item: {other_item: similarity}}``, such as the
            result of ``item_similarity``.
        k: Number of most-similar neighbours considered per rated item.

    Returns:
        dict mapping each candidate item to its accumulated score.

    Raises:
        KeyError: If ``user`` is missing from ``train`` or one of the user's
            rated items is missing from ``w``.
    """
    rank = dict()
    ru = train[user]
    for i, pi in ru.items():
        neighbours = sorted(w[i].items(),
                            key=operator.itemgetter(1),
                            reverse=True)[0:k]
        for j, wj in neighbours:
            if j in ru:
                continue  # skip items the user has already rated
            # Accumulate from a 0 default. Using -1 as a "missing" sentinel
            # would reset a legitimate running score of exactly -1.
            rank[j] = rank.get(j, 0) + pi * wj
    return rank

 

user_cf.py

import operator
import math


# Format of train: {user: {item: rating}}


def user_similarity(train):
    """Compute user-to-user cosine similarity from shared items.

    Args:
        train: Nested mapping ``{user: {item: rating}}``. Only the item keys
            are used; the rating values are ignored.

    Returns:
        Nested dict ``{user: {other_user: similarity}}`` where similarity is
        ``shared_items / sqrt(items(user) * items(other_user))``. Every user
        with at least one item gets an entry, which is empty when the user
        shares no item with anyone else.
    """
    # Inverted index item -> set of users, so only users that actually share
    # an item are ever paired up.
    users_by_item = {}
    for user, rated in train.items():
        for item in rated:
            users_by_item.setdefault(item, set()).add(user)

    shared = {}       # numerator: number of items both users interacted with
    item_counts = {}  # denominator parts: number of items per user
    for audience in users_by_item.values():
        for user in audience:
            item_counts[user] = item_counts.get(user, 0) + 1
            row = shared.setdefault(user, {})
            for other in audience:
                if other != user:
                    row[other] = row.get(other, 0) + 1

    # Normalise the overlap counts into the final similarity matrix.
    return {
        user: {
            other: overlap / math.sqrt(item_counts[user] * item_counts[other] * 1.0)
            for other, overlap in row.items()
        }
        for user, row in shared.items()
    }


# Recommend from the items rated by the most similar users
def recommend(user, train, w, k):
    """Score unseen items for ``user`` with user-based collaborative filtering.

    The ``k`` users most similar to ``user`` according to ``w`` are selected.
    Every item they rated that ``user`` has not rated accumulates
    ``similarity * rating``.

    Args:
        user: Key of the target user in ``train`` and ``w``.
        train: Nested mapping ``{user: {item: rating}}``.
        w: Nested mapping ``{user: {other_user: similarity}}``, such as the
            result of ``user_similarity``.
        k: Number of most-similar users to draw recommendations from.

    Returns:
        dict mapping each candidate item to its accumulated score.

    Raises:
        KeyError: If ``user`` is missing from ``train`` or ``w``, or a
            neighbour listed in ``w[user]`` is missing from ``train``.
    """
    rank = dict()
    interacted_items = train[user].keys()
    neighbours = sorted(w[user].items(),
                        key=operator.itemgetter(1),
                        reverse=True)[0:k]
    for v, wuv in neighbours:
        for i, rvi in train[v].items():
            if i in interacted_items:  # skip items the user has already rated
                continue
            # Accumulate from a 0 default. Using -1 as a "missing" sentinel
            # would reset a legitimate running score of exactly -1.
            rank[i] = rank.get(i, 0) + wuv * rvi
    return rank

 

posted @ 2020-08-02 20:24  Simon92  阅读(309)  评论(0)  编辑  收藏  举报