协同过滤 (Collaborative Filtering): user-based and item-based CF on the MovieLens 100K dataset

import numpy as np
import pandas as pd
from sklearn import model_selection as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

# Directory containing the extracted MovieLens 100K dataset.
u_data_path = "ml-100k/"
# u.data columns are tab-separated with no header row.
header = ['user_id', 'item_id', 'rating', 'timestamp']
# FIX: the original concatenated u_data_path + 'ml-100k/u.data', producing
# 'ml-100k/ml-100k/u.data'. In the standard ml-100k archive, u.data sits
# directly inside the ml-100k/ directory.
df = pd.read_csv(u_data_path + 'u.data', sep='\t', names=header)
print(df.head(5))
print(len(df))
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
100000
# Dimensions of the rating matrix. MovieLens ids are 1-based; using the
# maximum id (rather than the count of distinct ids) keeps the "id - 1"
# indexing below valid even if some ids in 1..max never occur in df.
# int(...) keeps the printed repr a plain integer across numpy versions.
n_users = int(df.user_id.max())
n_items = int(df.item_id.max())
print(f"{n_users=}, {n_items=}")

# Hold out 30% of the ratings for validation.
train_data, val_data = cv.train_test_split(df, test_size=0.3)

# Dense user x item rating matrices; 0 means "unrated".
train_data_matrix = np.zeros((n_users, n_items))
for row in train_data.itertuples():
    # Named attributes instead of fragile positional indexing (row[1], ...).
    train_data_matrix[row.user_id - 1, row.item_id - 1] = row.rating

val_data_matrix = np.zeros((n_users, n_items))
for row in val_data.itertuples():
    val_data_matrix[row.user_id - 1, row.item_id - 1] = row.rating

print(f"{train_data_matrix=}\n{val_data_matrix=}")
n_users=943, n_items=1682
train_data_matrix=array([[5., 3., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])
val_data_matrix=array([[0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
# Pairwise cosine distance: user x user (rows) and item x item (columns of
# the rating matrix, hence the transpose).
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
print(f"{user_similarity=}")
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
print(f"{item_similarity=}")

# Fraction of exact-zero entries in each distance matrix (essentially the
# diagonal plus any identical rating vectors).
n_u = user_similarity.shape[0]
n_i = item_similarity.shape[0]
user_zero_frac = np.sum(np.where(user_similarity, 0, 1)) / n_u / n_u
item_zero_frac = np.sum(np.where(item_similarity, 0, 1)) / n_i / n_i
print(f"zeros' size of user_similarity: {user_zero_frac}, zeros' size of item_similarity: {item_zero_frac}")
user_similarity=array([[0.        , 0.85946611, 0.97623132, ..., 0.912421  , 0.83468728,
        0.75714089],
       [0.85946611, 0.        , 0.87176212, ..., 0.95389244, 0.81411415,
        0.93503191],
       [0.97623132, 0.87176212, 0.        , ..., 0.89444313, 0.86830174,
        0.96384902],
       ...,
       [0.912421  , 0.95389244, 0.89444313, ..., 0.        , 0.89559801,
        0.90513797],
       [0.83468728, 0.81411415, 0.86830174, ..., 0.89559801, 0.        ,
        0.81381622],
       [0.75714089, 0.93503191, 0.96384902, ..., 0.90513797, 0.81381622,
        0.        ]])
item_similarity=array([[0.        , 0.67188539, 0.75969353, ..., 1.        , 0.94344842,
        0.94344842],
       [0.67188539, 0.        , 0.84295933, ..., 1.        , 1.        ,
        0.91027771],
       [0.75969353, 0.84295933, 0.        , ..., 1.        , 1.        ,
        0.88331352],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.94344842, 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.94344842, 0.91027771, 0.88331352, ..., 1.        , 1.        ,
        0.        ]])
zeros' size of user_similarity: 0.0010604453870625664, zeros' size of item_similarity: 0.000935271836794711
def predict(ratings, similarity, type='user'):
    """Predict scores via a similarity-weighted average of known ratings.

    Parameters
    ----------
    ratings : (n_users, n_items) ndarray of observed ratings, 0 = unrated.
    similarity : ndarray — user x user matrix for type='user', item x item
        for type='item' (as produced by pairwise_distances above).
    type : {'user', 'item'} — selects user-based or item-based CF.

    Returns
    -------
    (n_users, n_items) ndarray of predicted scores.

    Raises
    ------
    ValueError
        If `type` is neither 'user' nor 'item'. (The original fell
        through both branches and died with UnboundLocalError.)
    """
    # NOTE: `type` shadows the builtin, but callers pass it by keyword,
    # so the name is kept for interface compatibility.
    if type == 'user':
        # Mean-center each user's ratings so users with different rating
        # scales are comparable, then add the per-user mean back.
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        # Weighted average over item neighbours, normalised per item.
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError(f"type must be 'user' or 'item', got {type!r}")
    return pred


# Produce full score matrices under both CF schemes (independent calls).
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')

print(f"{item_prediction=}\n{user_prediction=}")
item_prediction=array([[0.36091292, 0.3742617 , 0.38494287, ..., 0.43158962, 0.42346516,
        0.42039778],
       [0.08167712, 0.09075145, 0.08600903, ..., 0.08923257, 0.09113098,
        0.09096535],
       [0.06346002, 0.06674331, 0.06492096, ..., 0.06346676, 0.0663372 ,
        0.06718155],
       ...,
       [0.03362802, 0.03958481, 0.03741362, ..., 0.04340328, 0.04272092,
        0.04321106],
       [0.12511568, 0.13119797, 0.13956135, ..., 0.14389357, 0.14363073,
        0.14507761],
       [0.20003409, 0.19662323, 0.21587546, ..., 0.24689373, 0.23801457,
        0.24087827]])
user_prediction=array([[ 1.49118886,  0.56424723,  0.45892075, ...,  0.29009607,
         0.29016588,  0.29002192],
       [ 1.24527824,  0.27602048,  0.12842234, ..., -0.06542708,
        -0.06395199, -0.06396041],
       [ 1.23296543,  0.24559601,  0.10770214, ..., -0.09013771,
        -0.08848755, -0.08837765],
       ...,
       [ 1.14901453,  0.21487944,  0.07405416, ..., -0.11016096,
        -0.10926592, -0.10912868],
       [ 1.27302822,  0.30513927,  0.18909724, ..., -0.00373816,
        -0.00286227, -0.00259623],
       [ 1.3163228 ,  0.37116774,  0.27425577, ...,  0.10388962,
         0.10381974,  0.10414054]])
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the rated (non-zero) entries only.

    Unrated cells in `ground_truth` are stored as 0 and must be excluded,
    otherwise the metric would be dominated by the sparse zeros.

    Parameters
    ----------
    prediction, ground_truth : ndarrays of the same shape.

    Returns
    -------
    float — RMSE over entries where ground_truth is non-zero.
    """
    # Hoist the mask: the original called ground_truth.nonzero() twice.
    mask = ground_truth.nonzero()
    pred = prediction[mask]
    truth = ground_truth[mask]
    # Equivalent to sqrt(mean_squared_error(truth, pred)) without sklearn.
    return sqrt(np.mean((pred - truth) ** 2))

# Report validation RMSE for both CF variants (f-strings give the same
# output as str() concatenation).
print(f"User-based CF RMSE: {rmse(user_prediction, val_data_matrix)}")
print(f"Item-based CF RMSE: {rmse(item_prediction, val_data_matrix)}")
User-based CF RMSE: 3.1664352347602613
Item-based CF RMSE: 3.4720042185951625
posted @ 2023-03-17 09:47  孑然520  阅读(47)  评论(0)    收藏  举报