Recommendation Algorithms: Collaborative Filtering
I recently read a collaborative filtering paper, "A Guide to Singular Value Decomposition for Collaborative Filtering", which designs an effective SVD algorithm for collaborative filtering.
Let V be the preference (rating) score matrix, with I{i,j} = 1 if user i has rated item j and I{i,j} = 0 otherwise; V is usually sparse. The goal of collaborative filtering is to predict the ratings that are missing from this sparse matrix. A common way to evaluate a collaborative filtering algorithm is the Root Mean Square Error (RMSE): with prediction matrix P, ground-truth matrix A, and J the indicator matrix for the entries of A to be predicted (defined like I), the RMSE is

$$\mathrm{RMSE} = \sqrt{\frac{\sum_{i,j} J_{i,j}\,(A_{i,j} - P_{i,j})^2}{\sum_{i,j} J_{i,j}}}$$
The goal of the SVD algorithm is to find two feature matrices, U (user × feature) and M (item × feature), whose product approximates V, so that a missing rating can be predicted from the corresponding user and item feature vectors.
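Concretely, the code below follows the regularized SVD model with user and item biases: a rating is modelled as the global mean plus a user bias, an item bias, and the inner product of the user and item feature vectors; all parameters are learned by stochastic gradient descent, with predictions clipped to the 1..5 rating scale. As a sketch of what the train() function below does for each observed rating r_{ij} (μ, bu, bi, U, M correspond to mean, BIASU, BIASI, UserF, ItemF in the code; γ is the learning rate lrate and λ the regularization weight lambda):

$$\hat{r}_{ij} = \mu + bu_i + bi_j + \sum_{f=1}^{F} U_{if} M_{jf}, \qquad e_{ij} = r_{ij} - \hat{r}_{ij}$$

$$bu_i \leftarrow bu_i + \gamma\,(e_{ij} - \lambda\,bu_i), \qquad bi_j \leftarrow bi_j + \gamma\,(e_{ij} - \lambda\,bi_j)$$

$$U_{if} \leftarrow U_{if} + \gamma\,(e_{ij}\,M_{jf} - \lambda\,U_{if}), \qquad M_{jf} \leftarrow M_{jf} + \gamma\,(e_{ij}\,U_{if} - \lambda\,M_{jf})$$

(One small quirk: the code updates UserF[i][f] first and then reuses the new value when updating ItemF[j][f]; the textbook rule would use the old value, but the difference is usually negligible.)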
The code is as follows:
#include <iostream>
#include <string>
#include <fstream>
#include <cmath>
#include <cstring>   // memset
#include <cstdlib>   // rand, exit
using namespace std;
const int USERMAX = 1000;          // upper bound on user ids (ids are 1-based)
const int ITEMMAX = 2000;          // upper bound on item ids
const int FEATURE = 50;            // number of latent features
const int ITER_MAX = 30;           // number of training epochs
double rating[USERMAX][ITEMMAX];   // observed ratings
int I[USERMAX][ITEMMAX];           // indicator: 1 if user i has rated item j
double UserF[USERMAX][FEATURE];    // user feature matrix U
double ItemF[ITEMMAX][FEATURE];    // item feature matrix M
double BIASU[USERMAX];             // per-user bias
double BIASI[ITEMMAX];             // per-item bias
double lambda = 0.15;              // regularization weight
double lrate = 0.05;               // learning rate (renamed from gamma to avoid clashing with the C library gamma())
double mean;                       // global mean of the observed ratings
// Predict the rating of user i for item j: global mean + biases + U_i . M_j,
// clamped to the 1..5 rating scale.
double predict(int i, int j) {
    double rate = mean + BIASU[i] + BIASI[j];
    for(int f = 0; f < FEATURE; f++) {
        rate += UserF[i][f] * ItemF[j][f];
    }
    if(rate < 1) {
        rate = 1;
    } else if(rate > 5) {
        rate = 5;
    }
    return rate;
}
// RMSE over all observed (training) ratings.
double calRMSE()
{
    double total = 0;
    int cnt = 0;
    for(int i = 0; i < USERMAX; i++) {
        for(int j = 0; j < ITEMMAX; j++) {
            double rate = predict(i, j);
            total += I[i][j] * (rating[i][j] - rate) * (rating[i][j] - rate);
            cnt += I[i][j];
        }
    }
    double rmse = sqrt(total / cnt);
    return rmse;
}
double calMean()
{
    double total = 0;
    int cnt = 0;
    for(int i = 0; i < USERMAX; ++i) {
        for(int j = 0; j < ITEMMAX; ++j) {
            total += I[i][j] * rating[i][j];
            cnt += I[i][j];
        }
    }
    return total / cnt;
}
// Initialize user/item biases to their average deviation from the global mean.
void initBias()
{
    memset(BIASU, 0, sizeof(BIASU));
    memset(BIASI, 0, sizeof(BIASI));
    mean = calMean();
    for(int i = 0; i < USERMAX; i++) {
        double total = 0;
        int cnt = 0;
        for(int j = 0; j < ITEMMAX; j++) {
            if(I[i][j]) {
                total += rating[i][j] - mean;
                cnt++;
            }
        }
        if(cnt > 0) {
            BIASU[i] = total / cnt;
        } else {
            BIASU[i] = 0;
        }
    }
    for(int j = 0; j < ITEMMAX; j++) {
        double total = 0;
        int cnt = 0;
        for(int i = 0; i < USERMAX; i++) {
            if(I[i][j]) {
                total += rating[i][j] - mean;
                cnt++;
            }
        }
        if(cnt > 0) {
            BIASI[j] = total / cnt;
        } else {
            BIASI[j] = 0;
        }
    }
}
// Load the training data, then learn the biases and feature matrices by SGD.
void train()
{
    memset(rating, 0, sizeof(rating));
    memset(I, 0, sizeof(I));
    ifstream in("D:\\dataset\\ml-100k\\ub.base", ios::in);
    if(!in) {
        cout << "file not exist" << endl;
        exit(1);
    }
    int userId, itemId, rate;
    string timeStamp;
    while(in >> userId >> itemId >> rate >> timeStamp) {
        rating[userId][itemId] = rate;
        I[userId][itemId] = 1;
    }
    initBias();
    // Initialize the feature matrices with small random values.
    for(int f = 0; f < FEATURE; f++) {
        for(int i = 0; i < USERMAX; i++) {
            UserF[i][f] = (rand() % 100) / 100.0 + 0.001;
        }
        for(int j = 0; j < ITEMMAX; j++) {
            ItemF[j][f] = (rand() % 100) / 100.0 + 0.001;
        }
    }
    // Train the matrix decomposition by stochastic gradient descent.
    int iterCnt = 0;
    while(iterCnt < ITER_MAX) {
        for(int i = 0; i < USERMAX; i++) {
            for(int j = 0; j < ITEMMAX; j++) {
                if(I[i][j]) {
                    double predictRate = predict(i, j);
                    double eui = rating[i][j] - predictRate;
                    BIASU[i] += lrate * (eui - lambda * BIASU[i]);
                    BIASI[j] += lrate * (eui - lambda * BIASI[j]);
                    for(int f = 0; f < FEATURE; f++) {
                        UserF[i][f] += lrate * (eui * ItemF[j][f] - lambda * UserF[i][f]);
                        ItemF[j][f] += lrate * (eui * UserF[i][f] - lambda * ItemF[j][f]);
                    }
                }
            }
        }
        double rmse = calRMSE();
        cout << "LOOP " << iterCnt << ": rmse is " << rmse << endl;
        iterCnt++;
    }
}
// Evaluate RMSE on the held-out test set.
void test()
{
    ifstream in("D:\\dataset\\ml-100k\\ub.test");
    if(!in) {
        cout << "file not exist" << endl;
        exit(1);
    }
    int userId, itemId, rate;
    string timeStamp;
    double total = 0;
    int cnt = 0;
    while(in >> userId >> itemId >> rate >> timeStamp) {
        double predictRate = predict(userId, itemId);
        total += (rate - predictRate) * (rate - predictRate);
        cnt++;
    }
    double rmse = sqrt(total / cnt);
    cout << "test: rmse is " << rmse << endl;
}
int main()
{
    train();
    test();
    return 0;
}
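A note on running this: the hard-coded paths point at the MovieLens 100k split files ub.base and ub.test, whose lines are whitespace-separated (user id, item id, rating, timestamp) tuples, which is exactly what the read loops expect. ML-100k has 943 users and 1682 items, so USERMAX = 1000 and ITEMMAX = 2000 leave enough room for the 1-based ids; adjust the paths, and the array bounds if you use a different dataset, before compiling.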
