链路预测算法MATLAB实现
链路预测是复杂网络分析中的重要任务,旨在预测网络中尚未连接的两个节点之间未来产生连接的可能性。
程序概述
MATLAB程序实现了以下链路预测算法:
- 基于局部信息的相似性指标(Common Neighbors, Jaccard, Adamic-Adar等)
- 基于路径的相似性指标(Katz指数)
- 基于随机游走的相似性指标(Rooted PageRank, SimRank)
- 矩阵分解方法
代码
classdef LinkPrediction
%LINKPREDICTION 链路预测算法实现
% 包含多种链路预测算法
properties
A; % 邻接矩阵
train_mask; % 训练掩码矩阵
test_mask; % 测试掩码矩阵
methods; % 可用的预测方法
end
methods
function obj = LinkPrediction(adj_matrix, train_ratio)
%LINKPREDICTION 构造函数
% adj_matrix - 网络邻接矩阵
% train_ratio - 训练集比例(0-1)
obj.A = adj_matrix;
obj.methods = {'CommonNeighbors', 'Jaccard', 'AdamicAdar', ...
'PreferentialAttachment', 'Katz', 'RootedPageRank', ...
'SimRank', 'MatrixFactorization'};
% 划分训练集和测试集
if nargin > 1
obj = obj.split_dataset(train_ratio);
else
obj.train_mask = ones(size(obj.A));
obj.test_mask = zeros(size(obj.A));
end
end
function obj = split_dataset(obj, train_ratio)
%SPLIT_DATASET 划分训练集和测试集
% 随机隐藏一部分边作为测试集
[n, ~] = size(obj.A);
obj.train_mask = ones(n);
obj.test_mask = zeros(n);
% 获取所有边的索引
[rows, cols] = find(triu(obj.A, 1)); % 只取上三角避免重复
num_edges = length(rows);
num_train = round(num_edges * train_ratio);
% 随机选择训练边
idx = randperm(num_edges);
train_idx = idx(1:num_train);
test_idx = idx(num_train+1:end);
% 创建掩码矩阵
for i = 1:length(test_idx)
r = rows(test_idx(i));
c = cols(test_idx(i));
obj.train_mask(r, c) = 0;
obj.train_mask(c, r) = 0;
obj.test_mask(r, c) = 1;
obj.test_mask(c, r) = 1;
end
% 确保对角线为0
obj.train_mask(1:n+1:end) = 0;
obj.test_mask(1:n+1:end) = 0;
end
function scores = common_neighbors(obj)
%COMMON_NEIGHBORS 共同邻居算法
scores = (obj.A * obj.A) .* obj.train_mask;
end
function scores = jaccard(obj)
%JACCARD Jaccard相似系数
[n, ~] = size(obj.A);
scores = zeros(n);
for i = 1:n
for j = i+1:n
if obj.train_mask(i, j) == 0
continue;
end
neighbors_i = find(obj.A(i, :));
neighbors_j = find(obj.A(j, :));
intersection = length(intersect(neighbors_i, neighbors_j));
union = length(union(neighbors_i, neighbors_j));
if union > 0
scores(i, j) = intersection / union;
else
scores(i, j) = 0;
end
scores(j, i) = scores(i, j);
end
end
end
function scores = adamic_adar(obj)
%ADAMIC_ADAR Adamic-Adar指数
[n, ~] = size(obj.A);
scores = zeros(n);
% 计算每个节点的度
degrees = sum(obj.A, 2);
for i = 1:n
for j = i+1:n
if obj.train_mask(i, j) == 0
continue;
end
common_neighbors = find(obj.A(i, :) & obj.A(j, :));
score = 0;
for k = common_neighbors
if degrees(k) > 1 % 避免除以0
score = score + 1 / log(degrees(k));
end
end
scores(i, j) = score;
scores(j, i) = score;
end
end
end
function scores = preferential_attachment(obj)
%PREFERENTIAL_ATTACHMENT 优先连接算法
degrees = sum(obj.A, 2);
scores = (degrees * degrees') .* obj.train_mask;
end
function scores = katz(obj, beta)
%KATZ Katz指数
% beta - 衰减因子,默认0.01
if nargin < 2
beta = 0.01;
end
[n, ~] = size(obj.A);
I = eye(n);
scores = beta * obj.A; % 长度为1的路径
% 计算Katz指数:S = βA + β²A² + β³A³ + ...
% 使用矩阵求逆近似:S = (I - βA)^-1 - I
scores = inv(I - beta * obj.A) - I;
scores = scores .* obj.train_mask;
end
function scores = rooted_pagerank(obj, alpha, max_iter, tol)
%ROOTED_PAGERANK Rooted PageRank算法
% alpha - 随机游走概率,默认0.85
% max_iter - 最大迭代次数,默认100
% tol - 收敛容差,默认1e-6
if nargin < 2
alpha = 0.85;
end
if nargin < 3
max_iter = 100;
end
if nargin < 4
tol = 1e-6;
end
[n, ~] = size(obj.A);
scores = zeros(n);
% 创建列随机矩阵(转移概率矩阵)
P = obj.A ./ sum(obj.A, 1);
P(isnan(P)) = 0; % 处理度为0的节点
% 对每个节点计算Rooted PageRank
for i = 1:n
r = zeros(n, 1);
r(i) = 1;
for iter = 1:max_iter
r_new = alpha * P * r + (1 - alpha) * r;
if norm(r_new - r, 1) < tol
break;
end
r = r_new;
end
scores(:, i) = r;
end
scores = scores .* obj.train_mask;
end
function scores = simrank(obj, C, max_iter, tol)
%SIMRANK SimRank算法
% C - 衰减因子,默认0.8
% max_iter - 最大迭代次数,默认10
% tol - 收敛容差,默认1e-4
if nargin < 2
C = 0.8;
end
if nargin < 3
max_iter = 10;
end
if nargin < 4
tol = 1e-4;
end
[n, ~] = size(obj.A);
S = eye(n); % 初始化SimRank矩阵
% 计算入邻居
in_neighbors = cell(n, 1);
for i = 1:n
in_neighbors{i} = find(obj.A(:, i));
end
% 迭代计算SimRank
for iter = 1:max_iter
S_old = S;
for i = 1:n
for j = 1:n
if i == j
S(i, j) = 1;
continue;
end
in_i = in_neighbors{i};
in_j = in_neighbors{j};
if isempty(in_i) || isempty(in_j)
S(i, j) = 0;
continue;
end
sum_sim = 0;
for a = 1:length(in_i)
for b = 1:length(in_j)
sum_sim = sum_sim + S_old(in_i(a), in_j(b));
end
end
S(i, j) = C * sum_sim / (length(in_i) * length(in_j));
end
end
if norm(S - S_old, 'fro') < tol
break;
end
end
scores = S .* obj.train_mask;
end
function scores = matrix_factorization(obj, dim, lambda, max_iter, learning_rate)
%MATRIX_FACTORIZATION 矩阵分解方法
% dim - 潜在特征维度,默认10
% lambda - 正则化参数,默认0.01
% max_iter - 最大迭代次数,默认100
% learning_rate - 学习率,默认0.01
if nargin < 2
dim = 10;
end
if nargin < 3
lambda = 0.01;
end
if nargin < 4
max_iter = 100;
end
if nargin < 5
learning_rate = 0.01;
end
[n, ~] = size(obj.A);
% 初始化用户和物品特征矩阵
U = randn(n, dim) * 0.01;
V = randn(n, dim) * 0.01;
% 获取训练集中的非零元素(即存在的边)
[rows, cols] = find(obj.train_mask);
values = ones(length(rows), 1);
% 随机梯度下降
for iter = 1:max_iter
total_error = 0;
for idx = 1:length(rows)
i = rows(idx);
j = cols(idx);
% 计算预测值和误差
prediction = U(i, :) * V(j, :)';
error = values(idx) - prediction;
total_error = total_error + error^2;
% 更新特征向量
U_i_old = U(i, :);
U(i, :) = U(i, :) + learning_rate * (error * V(j, :) - lambda * U(i, :));
V(j, :) = V(j, :) + learning_rate * (error * U_i_old - lambda * V(j, :));
end
% 添加正则化项
total_error = total_error + lambda * (norm(U, 'fro')^2 + norm(V, 'fro')^2);
if mod(iter, 10) == 0
fprintf('迭代 %d, 误差: %.4f\n', iter, total_error);
end
end
% 计算得分矩阵
scores = U * V';
scores = scores .* obj.train_mask;
end
function [precision, recall, auc] = evaluate(obj, scores, top_k)
%EVALUATE 评估预测结果
% scores - 预测得分矩阵
% top_k - 计算precision@k和recall@k的k值
if nargin < 3
top_k = 100;
end
% 获取测试集中的正样本
[test_rows, test_cols] = find(obj.test_mask);
positive_pairs = [test_rows, test_cols];
num_positives = size(positive_pairs, 1);
% 获取所有未连接的节点对(负样本+测试正样本)
negative_mask = (obj.train_mask == 0) & (obj.A == 0) & (eye(size(obj.A)) == 0);
[negative_rows, negative_cols] = find(negative_mask);
negative_pairs = [negative_rows, negative_cols];
% 随机选择与正样本数量相同的负样本
idx = randperm(size(negative_pairs, 1), num_positives);
negative_pairs = negative_pairs(idx, :);
% 合并正负样本
all_pairs = [positive_pairs; negative_pairs];
labels = [ones(num_positives, 1); zeros(num_positives, 1)];
% 获取预测得分
pred_scores = zeros(size(all_pairs, 1), 1);
for i = 1:size(all_pairs, 1)
pred_scores(i) = scores(all_pairs(i, 1), all_pairs(i, 2));
end
% 计算AUC
[~, ~, ~, auc] = perfcurve(labels, pred_scores, 1);
% 计算Precision@K和Recall@K
% 获取得分最高的top_k个节点对
[~, sorted_idx] = sort(pred_scores(1:num_positives), 'descend');
top_predictions = sorted_idx(1:min(top_k, length(sorted_idx)));
true_positives = sum(ismember(top_predictions, 1:num_positives));
precision = true_positives / top_k;
recall = true_positives / num_positives;
end
function results = compare_methods(obj, methods, top_k)
%COMPARE_METHODS 比较不同算法的性能
% methods - 要比较的方法列表
% top_k - 计算precision@k和recall@k的k值
if nargin < 2
methods = obj.methods;
end
if nargin < 3
top_k = 100;
end
results = struct();
for i = 1:length(methods)
method = methods{i};
fprintf('正在计算 %s...\n', method);
try
% 调用相应的方法
tic;
scores = obj.(lower(method))();
time = toc;
% 评估性能
[precision, recall, auc] = obj.evaluate(scores, top_k);
% 保存结果
results.(method).scores = scores;
results.(method).precision = precision;
results.(method).recall = recall;
results.(method).auc = auc;
results.(method).time = time;
fprintf('%s: Precision@%d=%.4f, Recall@%d=%.4f, AUC=%.4f, 时间=%.2fs\n', ...
method, top_k, precision, top_k, recall, auc, time);
catch ME
fprintf('计算 %s 时出错: %s\n', method, ME.message);
results.(method).error = ME.message;
end
end
end
function plot_results(obj, results)
%PLOT_RESULTS 可视化比较结果
methods = fieldnames(results);
num_methods = length(methods);
precisions = zeros(num_methods, 1);
recalls = zeros(num_methods, 1);
aucs = zeros(num_methods, 1);
times = zeros(num_methods, 1);
for i = 1:num_methods
if isfield(results.(methods{i}), 'error')
continue;
end
precisions(i) = results.(methods{i}).precision;
recalls(i) = results.(methods{i}).recall;
aucs(i) = results.(methods{i}).auc;
times(i) = results.(methods{i}).time;
end
% 创建图形
figure('Position', [100, 100, 1200, 800]);
% 绘制精确度
subplot(2, 2, 1);
bar(precisions);
set(gca, 'XTickLabel', methods, 'XTickLabelRotation', 45);
title('Precision@K');
ylabel('Precision');
grid on;
% 绘制召回率
subplot(2, 2, 2);
bar(recalls);
set(gca, 'XTickLabel', methods, 'XTickLabelRotation', 45);
title('Recall@K');
ylabel('Recall');
grid on;
% 绘制AUC
subplot(2, 2, 3);
bar(aucs);
set(gca, 'XTickLabel', methods, 'XTickLabelRotation', 45);
title('AUC');
ylabel('AUC');
grid on;
% 绘制运行时间
subplot(2, 2, 4);
bar(times);
set(gca, 'XTickLabel', methods, 'XTickLabelRotation', 45);
title('运行时间');
ylabel('时间 (秒)');
grid on;
% 调整布局
set(gcf, 'Color', 'w');
end
end
end
% 示例使用代码
function example_usage()
% 生成示例网络(无标度网络)
n = 100; % 节点数量
A = create_scale_free_network(n);
% 创建链路预测对象,使用80%的边作为训练集
lp = LinkPrediction(A, 0.8);
% 比较所有方法的性能
results = lp.compare_methods();
% 可视化结果
lp.plot_results(results);
% 单独使用某个方法
scores = lp.common_neighbors();
[precision, recall, auc] = lp.evaluate(scores);
fprintf('\nCommon Neighbors: Precision=%.4f, Recall=%.4f, AUC=%.4f\n', precision, recall, auc);
end
function A = create_scale_free_network(n)
%CREATE_SCALE_FREE_NETWORK 生成无标度网络(Barabási-Albert模型)
% n - 网络节点数
% 初始完全图
m0 = 5; % 初始节点数
A = zeros(n);
A(1:m0, 1:m0) = ones(m0) - eye(m0);
% 添加新节点
for i = m0+1:n
% 计算现有节点的度
degrees = sum(A(1:i-1, 1:i-1), 2);
total_degree = sum(degrees);
% 根据度分布选择连接节点
if total_degree > 0
prob = degrees / total_degree;
targets = randsample(1:i-1, m0, true, prob);
else
targets = randperm(i-1, min(m0, i-1));
end
% 添加连接
for j = targets
A(i, j) = 1;
A(j, i) = 1;
end
end
end
% 运行示例
example_usage();
说明
这个MATLAB链路预测程序提供了以下功能:
1. 核心类 LinkPrediction
包含多种链路预测算法的实现,以及评估和比较功能。
2. 实现的算法
- Common Neighbors (共同邻居):基于两个节点共同邻居的数量
- Jaccard Coefficient:共同邻居数除以总邻居数
- Adamic-Adar:考虑共同邻居的度,度越小权重越大
- Preferential Attachment:基于两个节点的度乘积
- Katz Index:考虑所有路径,路径越短权重越大
- Rooted PageRank:基于随机游走的相似性度量
- SimRank:基于结构上下文的相似性度量
- Matrix Factorization:基于矩阵分解的潜在特征方法
3. 评估指标
- Precision@K:前K个预测中正确预测的比例
- Recall@K:正确预测的正样本占所有正样本的比例
- AUC:ROC曲线下面积,衡量分类器整体性能
4. 可视化功能
提供四种评估指标的可视化比较,便于分析不同算法的性能。
推荐代码 链路预测程序,主程序,包含31个链路预测的函数代码 www.youwenfan.com/contentcno/52463.html
使用
程序最后提供了一个示例使用代码:
- 生成一个无标度网络(Barabási-Albert模型)
- 创建链路预测对象,划分训练集和测试集
- 比较所有算法的性能
- 可视化比较结果
- 单独使用Common Neighbors算法并进行评估
浙公网安备 33010602011771号