"The PageRank Citation Ranking: Bringing Order to the Web"读后感+单线程pagerank算法实现


1. 论文原文


The PageRank Citation Ranking: Bringing Order to the Web.


Page, Lawrence and Brin, Sergey and Motwani, Rajeev and Winograd, Terry (1999) The PageRank Citation Ranking: Bringing Order to the Web. Technical Report. Stanford InfoLab.




2. PageRank原理介绍


1) pagerank要解决的问题




2) pagerank算法基本思想






3) pagerank具体形式








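简化的PageRank定义为 $R(u) = c \sum_{v \in B_u} \frac{R(v)}{N_v}$,其中 $B_u$ 是指向网页 $u$ 的网页集合,$N_v$ 是网页 $v$ 的出链数,$c$ 是归一化因子。从这个定义可以看出 2) 中所叙述的递归概念。幸运的是,我们可以证明在给定一个不退化的初始值时,这个迭代过程是收敛的。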




a) 网页没有出链的情况


论文中把这种链接称为“dangling links”,这种链接还存在两种可能:一是这个网页确实没有外链(出链),比如pdf文档;二是下载的网页数目有限,导致一些外链没有被下载。处理这种链接的方式是在计算pagerank时去掉这样的链接,注意是迭代地去掉,因为可能有的网页链接去掉之后,指向这个页面的网页也将成为dangling link。当计算完pagerank后,再根据指向其的页面的pagerank恢复计算即可,当然,同样也是迭代计算。论文的作者表示这样的操作不会对结果有很大的影响。


b) 网页存在循环指向的子集合






图1 soso和google的rank sink




图2 baidu的rank sink


在图1中,soso和google形成了循环指向,并且只有入链,没有出链;在图2中,baidu自己形成了这样的循环指向。这样的现象在论文中称为rank sink。Rank sink将影响模型迭代,只积累权值而不释放。解决这个问题的方法是引入一个排名源 $E$:$R'(u) = c \sum_{v \in B_u} \frac{R'(v)}{N_v} + cE(u)$。






4)  随机游走模型


将互联网网页看成是一个大的图结构,用户沿着外链以完全随机的方式访问网页,但是以一定的概率,用户不沿着链接访问,而是按照 $E$ 的概率分布随机选取一个页面(用户gets bored),这样当遇到rank sink的时候就不会永远停留在循环里面。这样的方式从用户访问的角度解释了 $E$ 的作用,更加符合用户的客观行为。后面可以看到,$E$ 除了有防止rank sink的功能,还有别的作用,即个性化的pagerank。


5) 收敛性质


Pagerank迭代收敛在3.22亿的link集合上用了52次,而在这一半的数据上是45次,作者认为互联网的图结构的收敛速度大约线性于logn。作者用expander-like graph解释了收敛速度的原因。


6) pagerank的应用


a) 评估web流量


b) 反向链接的预测


c) 用户导航


d) 其他:比如反向链排序


3. 读后感


1)  pagerank算法的形式是简单的,经典就是经典,这就是吴军博士说的数学之美吧。


2)  在pagerank出现之前,其实业界的学者早就注意到了网页链接对于网页排名的重要性,并试图通过各种方式应用其中,但总体不成系统,没有找到真正的规律。


3)  论文中作者说提出了pagerank理论,然后实现了一个基于pagerank算法的名叫google的搜索引擎。当时读到这里,简直难以抑制激动:读过的很多论文在介绍完理论之后都会实现自己的一个小系统,没想到真的有系统能够在日后如此闪耀。我想在当时他们二位也不会预料到google能有以后这样的成功吧。但是不得不佩服二位作者的理论功夫,同时倚仗良好的商业天赋,并抓住了互联网革命的大好机会,最终成就了今天的google。





/*Copyright (c) 2013.
* All Rights Reserved.
* Author: laohaizi
*/
#include <stdio.h>
#include <stdlib.h>

#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using namespace std;
 18 #define DAMP_FACTOR 0.85 //dangming factor 
 19 #define PAGERANK_THRESHOLD 1e-5
 21 //page info structure
 22 struct PAGE_INFO
 23 {
 24     double pagerank;
 25     string doc_id;
 26     string url;
 27     int f_num;//forward num
 28     int b_num;//backward num
 29     vector<int> b_f_no;//the pages no point to this
 30     vector<string> f_url;//forward url
 31 };
 33 //page_info vector
 34 vector<PAGE_INFO> page_info;
 35 int page_info_num = 0;
 37 //map for url to page_info no for computing backward information
 38 map<string, int> url_page;
 40 //vector to record dangling links
 41 vector<int> dangling_link_flag;
/*void MakeUrlPages(string url, int num)
{
    if(map.count(url)==0)
    {
        vector<int> temp_page_no;
        temp_page_no.insert(num);
        url_pages.insert(url,temp_page_no);
    }
    else
    {
        url_pages[url].insert(num);
    }
}*/
 57 //scan page file one pass to record url_page_no
 58 void ScanPageFile(const char *page_file)
 59 {
 60     cout<<"Starting scaning page file..."<<endl;
 61     ifstream filein(page_file);
 62     if(!filein)
 63     {
 64         cout<<"Open page file error."<<endl;
 65         exit(-1);
 66     }
 67     string line;
 68     string html;
 69     int pos_beg=0,pos_end=0;
 70     int num = 0;
 71     while(getline(filein,line))
 72     {
 73         pos_beg = 0;
 75         //split \t three parts of one line:doc_id, url, html
 76         pos_end = line.find("\t");
 77         if(pos_end==string::npos)
 78         {
 79             cout<<num<<":line format is wrong."<<endl;
 80             continue;
 81         }
 82         pos_beg = pos_end+1;
 83         pos_end = line.find("\t",pos_beg);
 84         if(pos_end==string::npos)
 85         {
 86             url_page.insert(map<string,int>::value_type (line.substr(pos_beg), num));
 87         }
 88         else url_page.insert(pair<string,int>(line.substr(pos_beg, pos_end-pos_beg), num));
 89         num ++;
 90     }
 91     cout<<"Finished scaning page file."<<endl;
 92 }
 94 //scan original page to obtain web_info
 95 void ScanPage(const char *page_file)
 96 {
 97     cout<<"Starting scaning page..."<<endl;
 98     ifstream filein(page_file);
 99     if(!filein)
100     {
101         cout<<"Open page file error."<<endl;
102         exit(-1);
103     }
104         string line;
105     string html;
106     int pos_beg=0,pos_end=0;
107     int num = 0;
108     while(getline(filein,line))
109     {
110        // cout<<num<<":begin scaning line..."<<endl;
111         PAGE_INFO temp_page_info;
112         temp_page_info.pagerank = -1;
113         temp_page_info.f_num = 0;
114         temp_page_info.b_num = 0;
115         pos_beg = 0;
117         //split \t three parts of one line:doc_id, url, html
118         pos_end = line.find("\t");
119         if(pos_end==string::npos)
120         {
121             cout<<num<<":line format is wrong."<<endl;
122             continue;
123         }
124        // cout<<num<<":begin to doc_id..."<<endl;
125         temp_page_info.doc_id = line.substr(pos_beg, pos_end-pos_beg);
126         //MakeUrlPages(page_web_info.doc_id,num);
127         //num ++;
128         dangling_link_flag.push_back(1);
129         pos_beg = pos_end+1;
130         pos_end = line.find("\t",pos_beg);
131         if(pos_end==string::npos)
132         {
133             temp_page_info.f_num = 0;
134             temp_page_info.url = line.substr(pos_beg);
135            // url_page.insert(map<string,int>::value_type (temp_page_info.url,num));
136             page_info.push_back(temp_page_info);
137             num ++;
138             continue;
139         }
140        // cout<<num<<":begin assign url..."<<endl;
141        // temp_page_info.url = line.substr(pos_beg, pos_end-pos_beg);
142         url_page.insert(pair<string,int>(temp_page_info.url, num));
143         num ++;
144         pos_beg = pos_end+1;
145         pos_end = line.find("\t",pos_beg);
146         set<string> temp_url;//uniq the url in one page
147        // cout<<num-1<<":begin to create temp_url..."<<endl;
148         while(pos_end!=string::npos)
149         {
150             //temp_web_info.f_num ++;
151             if(url_page.count(line.substr(pos_beg,pos_end-pos_beg))!=0)
152                 temp_url.insert(line.substr(pos_beg, pos_end-pos_beg));
153             //temp_web_info.f_url.insert(line.substr(pos_beg, pos_end));
154             pos_beg = pos_end+1;
155             pos_end = line.find("\t",pos_beg);
156         }
157         //temp_web_info.f_num ++;
158         //temp_web_info.f_url.insert(line.substr(pos_beg));
159         if(url_page.count(line.substr(pos_beg))!=0)
160             temp_url.insert(line.substr(pos_beg));
161        // cout<<num-1<<":begin temp_url to f_url..."<<endl;
162         for(set<string>::iterator it = temp_url.begin();it!=temp_url.end();it++)
163         {
164             temp_page_info.f_num ++;
165             temp_page_info.f_url.push_back(*it);
166         }
167        // cout<<num-1<<":temp_url to f_url success."<<endl;
168         page_info.push_back(temp_page_info);
169        // cout<<"doc_id: "<<page_info[num-1].doc_id<<"\t";
170        // cout<<"url: "<<page_info[num-1].url<<endl;
171        // cout<<"forward url:\t";
172        // for(int j=0;j<page_info[num-1].f_num;j++)
173        // {
174        //     cout<<page_info[num-1].f_url[j]<<"\t";
175        // }
176        // cout<<endl;
177        // cout<<"backward url(doc_id):\t";
178        // for(int j=0;j<page_info[num-1].b_num;j++)
179        // {
180        //     cout<<page_info[num-1].b_f_no[j]<<"\t";
181        // }
182        // cout<<endl;
184     }
185     filein.close();
186     //cout<<"begin scan page_info and map:url_page"<<endl;
187     //scan the page_iofo and map:url_page to complete backward info
188     for(int i=0;i<page_info.size();i++)
189     {
190         for(int j=0;j<page_info[i].f_url.size();j++)
191         {
192             if(url_page.count(page_info[i].f_url[j])!=0)
193             {
194                 // for(int k=0;k<url_page[page_info[i].f_url[j]].size();k++)
195                 {
196                     //cout<<page_info[url_page[page_info[i].f_url[j]]].b_num<<endl;
197                     //cout<<j<<":"<<url_page[page_info[i].f_url[j]]<<endl;
198                     page_info[url_page[page_info[i].f_url[j]]].b_f_no.push_back(i);
199                     page_info[url_page[page_info[i].f_url[j]]].b_num ++;
200                 }
201              }
202         }
203     }
204     //cout<<num-1<<":finish scan page_info adn map:url_page"<<endl;
206     page_info_num = page_info.size();
207     cout<<"Page Number: "<<page_info.size()<<endl;
208     cout<<"Scan page finished."<<endl;
209     /*for(int i=0;i<page_info.size();i++)
210     {
211         cout<<"doc_id: "<<page_info[i].doc_id<<"\t";
212         cout<<"url: "<<page_info[i].url<<endl;
213         cout<<"forward url:\t";
214         for(int j=0;j<page_info[i].f_num;j++)
215         {
216             cout<<page_info[i].f_url[j]<<"\t";
217         }
218         cout<<endl;
219         cout<<"backward url(doc_id):\t";
220         for(int j=0;j<page_info[i].b_num;j++)
221         {
222             cout<<page_info[i].b_f_no[j]<<"\t";
223         }
224         cout<<endl;
225     }*/
226 }
228 //remove the dangling links
229 void RemoveDanglingLinks()
230 {
231     cout<<"Start removing dangling links..."<<endl;
232     int dangling_link_num = 0;
233     int f = 1;
234     while(f==1)//iterate remove dangling links
235     {
236         f = 0;
237         for(int i=0;i<page_info.size();i++)
238         {
239             if(page_info[i].f_num==0&&dangling_link_flag[i]==1)//dangling links
240             {
241                 dangling_link_num ++;
242                 f = 1;
243                 dangling_link_flag[i] = 0;
244                 for(int j=0;j<page_info[i].b_num;j++)
245                 {
246                     page_info[page_info[i].b_f_no[j]].f_num --;
247                 }
248             }
249         }
250     }
251     cout<<"Dangling Links Number: "<<dangling_link_num<<endl;
252     cout<<"Remove dangling links finished."<<endl;
253 }
255 //compute the pagerank
256 void ComputePageRank()
257 {
258     cout<<"Start computing pagerank..."<<endl;
260     double d = DAMP_FACTOR;//damping factor
261     double epslon = PAGERANK_THRESHOLD;
262     double delta = 1.0;
264     //assign initial pagerank value
265     for(int i=0;i<page_info.size();i++)
266     {
267         if(dangling_link_flag[i]==1)
268         {
269             page_info[i].pagerank = 1.0;
270         }
271         else
272         {
273             page_info[i].pagerank = -1.0;
274         }
275     }
277     //cout<<"Assign initial pagerank value success."<<endl;
279     //temp pagerank value to iterator
280     //vector 2 pagerank for not assign back every iteration
281     vector<double> pagerank[2];
282     for(int i=0;i<page_info.size();i++)
283     {
284         pagerank[0].push_back(page_info[i].pagerank);
285         pagerank[1].push_back(page_info[i].pagerank);
286     }
287     int x = 0,y = 1;
289     //cout<<"Assign 2 temp vector for iteration success."<<endl; 
290     //start iteration
291     int iter_num = 0;
292     while(delta>epslon)
293     {
294         delta = 0.0;
295         for(int i=0;i<page_info.size();i++)
296         {
297             pagerank[x][i] = 0.0;
298             if(dangling_link_flag[i]==1)//not compute danglilng links
299             {
300                 for(int j=0;j<page_info[i].b_num;j++)
301                 {
302                     pagerank[x][i] += (1-d)/page_info_num + d*pagerank[y][page_info[i].b_f_no[j]]/page_info[page_info[i].b_f_no[j]].f_num;
303                 }
304                 if(pagerank[x][i]-pagerank[y][i]<0)
305                 {
306                     delta += pagerank[y][i]-pagerank[x][i];
307                 }
308                 else
309                 {
310                     delta += pagerank[x][i]-pagerank[y][i];
311                 }
312             }
313         }
314         //exchang x and y
315         int temp = x;
316         x = y;
317         y = temp;
318         if(delta<0)delta = -delta;
319         iter_num ++;
320         cout<<"iter "<<iter_num<<" diff: "<<delta<<endl;
321     }
322     //final to assign to page_info.pagerank
323     for(int i=0;i<page_info.size();i++)
324     {
325         if(dangling_link_flag[i]==1)
326         {
327             page_info[i].pagerank = pagerank[y][i];
328         }
329     }
330     cout<<"Compute pagerank finished."<<endl;
331 }
332 //Recover dangling links, namely compute the dangling links's pagerank
333 void RecoverDanglingLinks()
334 {
335     cout<<"Start recovering dangling links..."<<endl;
336     int flag = 1;
337     while(flag==1)//iterate recover dangling links
338     {
339         flag = 0;
340         for(int i=0;i<page_info.size();i++)
341         {
342             if(dangling_link_flag[i]==0)//dangling links
343             {
344                 flag = 1;
345                 int dangling_link_backward_flag = 1;
346                 for(int j=0;j<page_info[i].b_f_no.size();j++)
347                 {
348                     if(page_info[page_info[i].b_f_no[j]].pagerank<0)
349                     {
350                         dangling_link_backward_flag = 0;
351                         break;
352                     }
353                 }
354                 if(dangling_link_backward_flag==1)//all backward page of this dangling link is already having pagerank 
355                 {
356                     double i_pagerank = 0.0;
357                     for(int j =0;j<page_info[i].b_f_no.size();j++)
358                     {
359                         i_pagerank += page_info[page_info[i].b_f_no[j]].pagerank;
360                     }
361                     page_info[i].pagerank = i_pagerank;
362                     dangling_link_flag[i] = 1;//use this to flag recover dangling link
363                 }
364             }
365         }
366     }
367     cout<<"Recover dangling links finished."<<endl;
368 }
369 //output the pagerank info to file
370 void OutputPageRank(const char *pagerank_file)
371 {
372     cout<<"Start outputing pagerank result to file."<<endl;
373     ofstream fileout(pagerank_file);
374     if(!fileout)
375     {
376         cout<<"open output file: pagerank_file error."<<endl;
377         exit(-1);
378     }
379     for(int i=0;i<page_info.size();i++)
380     {
381         fileout<<page_info[i].doc_id<<"\t"<<setprecision(5)<<page_info[i].pagerank<<endl;
382     }
383     fileout.close();
384     cout<<"Output pagerank result to file finished."<<endl;
385 }
387 int main(int args, char *argv[])
388 {
389     if(args!=3)
390     {
391         cout<<"program para wrong. The first para is input page_file, and the second para is output_pagerank file."<<endl;
392         exit(-1);
393     }
395     const char *input_page_file = argv[1];
396     const char *output_pagerank_file = argv[2];
398     ScanPageFile(input_page_file);
400     ScanPage(input_page_file);
402     RemoveDanglingLinks();
404     ComputePageRank();
406     RecoverDanglingLinks();
408     OutputPageRank(output_pagerank_file);
410     return 0;
411 }
