# Collaborative filtering (协同过滤): user- and item-based CF on MovieLens 100K
import numpy as np
import pandas as pd
from sklearn import model_selection as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
# MovieLens 100K ratings file: tab-separated (user_id, item_id, rating, unix timestamp).
u_data_path = "ml-100k/"
header = ['user_id', 'item_id', 'rating', 'timestamp']
# BUG FIX: the original concatenated u_data_path + 'ml-100k/u.data', which
# produces the doubled path 'ml-100k/ml-100k/u.data'. If your copy of the
# dataset really is nested one level deeper, adjust u_data_path instead.
df = pd.read_csv(u_data_path + 'u.data', sep='\t', names=header)
print(df.head(5))
print(len(df))
user_id item_id rating timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596
100000
# Dataset dimensions. NOTE(review): the dense-matrix construction below
# assumes user/item ids are contiguous 1..n (true for MovieLens 100K) —
# confirm before reusing on other data.
n_users = len(df.user_id.unique())
n_items = len(df.item_id.unique())
print (f"{n_users=}, {n_items=}")

# Hold out 30% of the ratings for validation.
train_data, val_data = cv.train_test_split(df, test_size=0.3)

def _to_matrix(frame):
    # Dense (user x item) rating matrix; 0 means "not rated".
    matrix = np.zeros((n_users, n_items))
    for row in frame.itertuples():
        # row = (Index, user_id, item_id, rating, timestamp); ids are 1-based.
        matrix[row[1] - 1, row[2] - 1] = row[3]
    return matrix

train_data_matrix = _to_matrix(train_data)
val_data_matrix = _to_matrix(val_data)
print(f"{train_data_matrix=}\n{val_data_matrix=}")
n_users=943, n_items=1682
train_data_matrix=array([[5., 3., 4., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 5., 0., ..., 0., 0., 0.]])
val_data_matrix=array([[0., 0., 0., ..., 0., 0., 0.],
[4., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[5., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
# Pairwise cosine *distances* (1 - cosine similarity): 0 on the diagonal,
# larger means more dissimilar. Despite the variable names, these are
# distance matrices, which is what predict() below consumes.
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
print(f"{user_similarity=}")
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
print(f"{item_similarity=}")
# Fraction of exact-zero entries in each distance matrix (essentially just
# the diagonal, hence roughly 1/n).
_user_zero_frac = np.sum(np.where(user_similarity, 0, 1)) / user_similarity.shape[0] / user_similarity.shape[0]
_item_zero_frac = np.sum(np.where(item_similarity, 0, 1)) / item_similarity.shape[0] / item_similarity.shape[0]
print(f"zeros' size of user_similarity: {_user_zero_frac}, zeros' size of item_similarity: {_item_zero_frac}")
user_similarity=array([[0. , 0.85946611, 0.97623132, ..., 0.912421 , 0.83468728,
0.75714089],
[0.85946611, 0. , 0.87176212, ..., 0.95389244, 0.81411415,
0.93503191],
[0.97623132, 0.87176212, 0. , ..., 0.89444313, 0.86830174,
0.96384902],
...,
[0.912421 , 0.95389244, 0.89444313, ..., 0. , 0.89559801,
0.90513797],
[0.83468728, 0.81411415, 0.86830174, ..., 0.89559801, 0. ,
0.81381622],
[0.75714089, 0.93503191, 0.96384902, ..., 0.90513797, 0.81381622,
0. ]])
item_similarity=array([[0. , 0.67188539, 0.75969353, ..., 1. , 0.94344842,
0.94344842],
[0.67188539, 0. , 0.84295933, ..., 1. , 1. ,
0.91027771],
[0.75969353, 0.84295933, 0. , ..., 1. , 1. ,
0.88331352],
...,
[1. , 1. , 1. , ..., 0. , 1. ,
1. ],
[0.94344842, 1. , 1. , ..., 1. , 0. ,
1. ],
[0.94344842, 0.91027771, 0.88331352, ..., 1. , 1. ,
0. ]])
zeros' size of user_similarity: 0.0010604453870625664, zeros' size of item_similarity: 0.000935271836794711
def predict(ratings, similarity, type='user'):
    """Predict scores for every (user, item) cell via neighborhood CF.

    Parameters
    ----------
    ratings : ndarray, shape (n_users, n_items)
        Observed ratings; 0 marks "not rated".
    similarity : ndarray
        Pairwise matrix (here: cosine distances from pairwise_distances):
        (n_users, n_users) for type='user', (n_items, n_items) for type='item'.
    type : {'user', 'item'}
        Which CF flavour to use. (Parameter name kept for caller
        compatibility even though it shadows the builtin.)

    Returns
    -------
    ndarray, shape (n_users, n_items)
        Predicted scores (relative ranking signal, not calibrated ratings).

    Raises
    ------
    ValueError
        If `type` is neither 'user' nor 'item'. (Previously an unknown value
        fell through to `return pred` and raised an opaque NameError.)
    """
    if type == 'user':
        # Mean-center each user's row so per-user rating bias cancels out,
        # then add the mean back after the similarity-weighted aggregation.
        # NOTE: the mean is taken over ALL items, unrated zeros included —
        # this follows the original code / the classic tutorial convention.
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        # Weighted average over item neighbors, normalized by total weight.
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError(f"type must be 'user' or 'item', got {type!r}")
    return pred
# Score every (user, item) pair with both CF flavours (order-independent).
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
print(f"{item_prediction=}\n{user_prediction=}")
item_prediction=array([[0.36091292, 0.3742617 , 0.38494287, ..., 0.43158962, 0.42346516,
0.42039778],
[0.08167712, 0.09075145, 0.08600903, ..., 0.08923257, 0.09113098,
0.09096535],
[0.06346002, 0.06674331, 0.06492096, ..., 0.06346676, 0.0663372 ,
0.06718155],
...,
[0.03362802, 0.03958481, 0.03741362, ..., 0.04340328, 0.04272092,
0.04321106],
[0.12511568, 0.13119797, 0.13956135, ..., 0.14389357, 0.14363073,
0.14507761],
[0.20003409, 0.19662323, 0.21587546, ..., 0.24689373, 0.23801457,
0.24087827]])
user_prediction=array([[ 1.49118886, 0.56424723, 0.45892075, ..., 0.29009607,
0.29016588, 0.29002192],
[ 1.24527824, 0.27602048, 0.12842234, ..., -0.06542708,
-0.06395199, -0.06396041],
[ 1.23296543, 0.24559601, 0.10770214, ..., -0.09013771,
-0.08848755, -0.08837765],
...,
[ 1.14901453, 0.21487944, 0.07405416, ..., -0.11016096,
-0.10926592, -0.10912868],
[ 1.27302822, 0.30513927, 0.18909724, ..., -0.00373816,
-0.00286227, -0.00259623],
[ 1.3163228 , 0.37116774, 0.27425577, ..., 0.10388962,
0.10381974, 0.10414054]])
def rmse(prediction, ground_truth):
    """Root-mean-squared error restricted to rated entries.

    Only positions where `ground_truth` is non-zero (i.e. actually rated)
    contribute; predictions for unrated cells are ignored.

    Parameters
    ----------
    prediction, ground_truth : ndarray
        Same-shape arrays of predicted and true ratings.

    Returns
    -------
    float
        sqrt(mean((prediction - ground_truth)^2)) over rated cells.
    """
    # Fancy-indexing with nonzero() already yields 1-D arrays, so the
    # original .flatten() calls were redundant; the MSE itself is a
    # one-liner in NumPy (same value as sklearn's mean_squared_error).
    rated = ground_truth.nonzero()
    errors = prediction[rated] - ground_truth[rated]
    return sqrt(np.mean(errors ** 2))
# Evaluate both recommenders on the held-out ratings only.
print(f"User-based CF RMSE: {rmse(user_prediction, val_data_matrix)}")
print(f"Item-based CF RMSE: {rmse(item_prediction, val_data_matrix)}")
User-based CF RMSE: 3.1664352347602613
Item-based CF RMSE: 3.4720042185951625