# 深入浅出PageRank算法

### PageRank算法

PageRank算法是谷歌曾经独步天下的“倚天剑”。该算法由Larry Page和Sergey Brin在斯坦福大学读研时发明，论文点击下载: The PageRank Citation Ranking: Bringing Order to the Web

PageRank的核心思想有2点：

1.如果一个网页被很多其他网页链接到的话说明这个网页比较重要，也就是pagerank值会相对较高；

2.如果一个pagerank值很高的网页链接到一个其他的网页，那么被链接到的网页的pagerank值会相应地因此而提高。

参考资料：

1.《数值分析与算法》Page 161 应用实例：Google的PageRank算法

2.《使用 MapReduce 思想计算 PageRank》Page 62 PageRank和马尔可夫链

### 3.求解PageRank

(1) 幂法

wiki上有一个PageRank的简便算法，它不考虑转移概率，而是采用的是迭代的方式，每次都更新所有网页的pagerank值，更新的方式就是将每个网页的pagerank值平摊分给它指向的所有网页，每个网页累计所有指向它的网页平摊给它的值作为它该回合的pagerank值，直到全部网页的pagerank值收敛了或者满足一定的阈值条件就停止。

% Build the 6-node example link graph: G(i,j)=1 means page j links to page i.
n=6;
i=[2 3 4 4 5 6 1 6 1];
j=[1 2 2 3 3 3 4 5 6];
G=sparse(i,j,1,n,n);

% Power method
% L{j} = pages that page j links to; c(j) = out-degree of page j.
for j = 1:n
    L{j} = find(G(:,j));
    c(j) = length(L{j});
end

p = .85;          % damping factor
delta = (1-p)/n;  % teleportation share that every page receives
x = ones(n,1)/n;  % start from the uniform distribution
z = zeros(n,1);   % previous iterate (forces at least one loop pass)
cnt = 0;          % iteration counter
% Iterate until successive pagerank vectors agree to within 1e-4.
while max(abs(x-z)) > .0001
    z = x;
    x = zeros(n,1);
    for j = 1:n
        if c(j) == 0
            x = x + z(j)/n; % dangling page: its rank is shared with every page
        else
            x(L{j}) = x(L{j}) + z(j)/c(j); % spread last rank evenly over outlinks
        end
    end
    x = p*x + delta;
    cnt = cnt+1;
end


x =
0.2675
0.2524
0.1323
0.1698
0.0625
0.1156


easy_install python-graph-core
easy_install python-graph-dot


Python版本的算法实现：

# coding=utf-8

# Import graphviz
import graphviz as gv

# Import pygraph
from pygraph.classes.digraph import digraph
from pygraph.readwrite.dot import write

# Define pagerank function
def pagerank(graph, damping_factor=0.85, max_iterations=100,
             min_delta=0.00001):
    """
    Compute and return the PageRank in a directed graph.

    Iterates Gauss-Seidel style: each node's rank is updated in place
    from the current ranks of the pages linking to it.

    @type  graph: digraph
    @param graph: Digraph (needs nodes(), incidents(), neighbors()).

    @type  damping_factor: number
    @param damping_factor: PageRank damping factor.

    @type  max_iterations: number
    @param max_iterations: Maximum number of iterations.

    @type  min_delta: number
    @param min_delta: Smallest variation required for a new iteration.

    @rtype:  Dict
    @return: Dict containing all the nodes PageRank.
    """

    nodes = graph.nodes()
    graph_size = len(nodes)
    if graph_size == 0:
        return {}
    # Teleportation value; also the rank floor for nodes with no inbound links.
    min_value = (1.0 - damping_factor) / graph_size

    # Initialize the page rank dict with 1/N for all nodes
    # (fix: the original initialized with 1.0; the fixed point is the
    # same but 1/N matches the stated intent and converges faster).
    pagerank = dict.fromkeys(nodes, 1.0 / graph_size)

    for i in range(max_iterations):
        diff = 0  # total difference compared to last iteration
        # Compute each node's PageRank based on its inbound links.
        for node in nodes:
            rank = min_value
            for referring_page in graph.incidents(node):
                rank += damping_factor * pagerank[referring_page] / \
                        len(graph.neighbors(referring_page))

            diff += abs(pagerank[node] - rank)
            pagerank[node] = rank

        print('This is NO.%s iteration' % (i + 1))
        print(pagerank)
        print('')

        # Stop if PageRank has converged.
        if diff < min_delta:
            break

    return pagerank

# Graph creation
# Graph creation
gr = digraph()

# Add nodes and edges
# NOTE(review): no nodes or edges are added in the visible source, so the
# pagerank(gr) call below runs on an empty graph — presumably the original
# article inserted the 6-node example here; confirm against the full post.

# Draw as PNG
# dot = write(gr)
# gvv = gv.readstring(dot)
# gv.layout(gvv,'dot')
# gv.render(gvv,'png','Model.png')

pagerank(gr)


This is NO.32 iteration
{'1': 0.2675338708706491, '3': 0.13227261904986046, '2': 0.2524037902400518, '5': 0.062477242064127136, '4': 0.1697488529161491, '6': 0.1155828978186352}


(2) 利用马尔可夫矩阵的特殊结构

function x = pagerank1(G)
% PAGERANK1  Google's PageRank modified version 1 - hujiawei
% Solves the sparse linear system (I - p*G*D)*x = e directly instead of
% iterating, then normalizes x into a probability vector.
% G is the adjacency matrix: G(i,j)=1 means page j links to page i.

%if nargin < 3, p = .85; end
p=0.85;  % damping factor

% Eliminate any self-referential links

G = G - diag(diag(G));

% c = out-degree, r = in-degree

[n,n] = size(G);
c = sum(G,1);%column sums = out-degree of each page
r = sum(G,2);%row sums = in-degree of each page (not used below)

% Scale column sums to be 1 (or 0 where there are no out links).

k = find(c~=0);
D = sparse(k,k,1./c(k),n,n);

% Solve (I - p*G*D)*x = e

e = ones(n,1);
I = speye(n,n);
x = (I - p*G*D)\e;

% Normalize so that sum(x) == 1.

x = x/sum(x);


(3) 巧妙解法：逆迭代算法

function x = pagerank2(G)
% PAGERANK1  Google's PageRank modified version 2 - hujiawei
% using inverse iteration method
% Builds the full Google matrix A = p*G*D + delta and solves (I-A)*x = e,
% which acts as one inverse-iteration step toward A's dominant eigenvector.
% G is the adjacency matrix: G(i,j)=1 means page j links to page i.

%if nargin < 3, p = .85; end
p=0.85;  % damping factor

% Eliminate any self-referential links

G = G - diag(diag(G));

% c = out-degree, r = in-degree

[n,n] = size(G);
c = sum(G,1);%column sums = out-degree of each page
r = sum(G,2);%row sums = in-degree of each page (not used below)

% Scale column sums to be 1 (or 0 where there are no out links).

k = find(c~=0);
D = sparse(k,k,1./c(k),n,n);

% Solve (I - p*G*D)*x = e

e = ones(n,1);
I = speye(n,n);
% x = (I - p*G*D)\e;
delta=(1-p)/n;  % teleportation value added to every matrix entry
% NOTE(review): adding the scalar delta to a sparse matrix makes A dense.
A=p*G*D+delta;
x=(I-A)\e;

% Normalize so that sum(x) == 1.

x = x/sum(x);


function [U,G] = surfer(root,n)
% SURFER  Create the adjacency graph of a portion of the Web.
%    [U,G] = surfer(root,n) starts at the URL root and follows
%    Web links until it forms an adjacency graph with n nodes.
%    U = a cell array of n strings, the URLs of the nodes.
%    G = an n-by-n sparse matrix with G(i,j)=1 if node j is linked to node i.
%
%    Example:  [U,G] = surfer('http://www.harvard.edu',500);
%
%    This function currently has two defects.  (1) The algorithm for
%    finding links is naive.  We just look for the string 'http:'.
%    (2) An attempt to read from a URL that is accessible, but very slow,
%    might take an unacceptably long time to complete.  In some cases,
%    it may be necessary to have the operating system terminate MATLAB.
%    Key words from such URLs can be added to the skip list in surfer.m.

% Initialize the figure and the progress/control widgets.

clf
shg
set(gcf,'doublebuffer','on')
axis([0 n 0 n])
axis square
axis ij
box on
set(gca,'position',[.12 .20 .78 .78])
uicontrol('style','frame','units','normal','position',[.01 .09 .98 .07]);
uicontrol('style','frame','units','normal','position',[.01 .01 .98 .07]);
t1 = uicontrol('style','text','units','normal','position',[.02 .10 .94 .04], ...
   'horiz','left');
t2 = uicontrol('style','text','units','normal','position',[.02 .02 .94 .04], ...
   'horiz','left');
slow = uicontrol('style','toggle','units','normal', ...
   'position',[.01 .24 .07 .05],'string','slow','value',0);
quit = uicontrol('style','toggle','units','normal', ...
   'position',[.01 .17 .07 .05],'string','quit','value',0);

U = cell(n,1);            % URLs of the discovered nodes
hash = zeros(n,1);        % numeric hash of each URL for fast duplicate checks
G = logical(sparse(n,n)); % adjacency: G(i,j)=1 iff node j links to node i
m = 1;                    % number of URLs discovered so far
U{m} = root;
hash(m) = hashfun(root);

j = 1;
while j < n & get(quit,'value') == 0

   % Try to open a page.

   try
      set(t1,'string',sprintf('%5d %s',j,U{j}))
      set(t2,'string','');
      drawnow
      % FIX: this fetch was missing, leaving `page` undefined below.
      page = urlread(U{j});
   catch
      set(t1,'string',sprintf('fail: %5d %s',j,U{j}))
      drawnow
      continue
   end
   if get(slow,'value')
      pause(.25)
   end

   % Follow the links from the open page.

   for f = findstr('http:',page);

      % A link starts with 'http:' and ends with the next quote.

      e = min([findstr('"',page(f:end)) findstr('''',page(f:end))]);
      if isempty(e), continue, end
      url = deblank(page(f:f+e-2));
      url(url<' ') = '!';   % Nonprintable characters
      if url(end) == '/', url(end) = []; end

      % Look for links that should be skipped.

      skips = {'.gif','.jpg','.pdf','.css','lmscadsi','cybernet', ...
         'search.cgi','.ram','www.w3.org', ...
         'scripts','netscape','shockwave','webex','fansonly'};
      skip = any(url=='!') | any(url=='?');
      k = 0;
      while ~skip & (k < length(skips))
         k = k+1;
         skip = ~isempty(findstr(url,skips{k}));
      end
      if skip
         if isempty(findstr(url,'.gif')) & isempty(findstr(url,'.jpg'))
            set(t2,'string',sprintf('skip: %s',url))
            drawnow
            if get(slow,'value')
               pause(.25)
            end
         end
         continue
      end

      % Check if page is already in url list.

      i = 0;
      for k = find(hash(1:m) == hashfun(url))';
         if isequal(U{k},url)
            i = k;
            break
         end
      end

      % Add a new url to the graph if there are fewer than n.

      if (i == 0) & (m < n)
         m = m+1;
         U{m} = url;
         hash(m) = hashfun(url);
         i = m;
      end

      if i > 0
         G(i,j) = 1;
         set(t2,'string',sprintf('%5d %s',i,url))
         line(j,i,'marker','.','markersize',6)
         drawnow
         if get(slow,'value')
            pause(.25)
         end
      end
   end

   j = j+1;
end
delete(t1)
delete(t2)
delete(slow)
set(quit,'string','close','callback','close(gcf)','value',0)

%------------------------

function h = hashfun(url)
% Almost unique numeric hash code for pages already visited.
% Combines the string length with the sum of its character codes.
h = length(url) + 1024*sum(url);


### 4.MapReduce框架下PageRank算法的实现

#### 1.输入输出格式

map函数的输入是<节点，从该节点引出的边列表>，其中节点是一个类，包含了其当前的pagerank值，输出是<节点，反向节点pagerank值/反向节点引出边的总数>；

reduce函数的输入是<节点，反向节点pagerank值/反向节点引出边的总数>，输出是<节点，从该节点引出的边列表>，其中节点包含了其更新后的pagerank值。

process the data to the form of {node i:[its adjacent node list],...}
while the sum of differences between the last two pagerank values > threshold
map({node i:[its adjacent node list],...}):
map_output={}
for every node j in adjacent node list:
put or sum up {j:(i, PageRank(i)/length(adjacent node list))} into map_output
return map_output

reduce(map_output):
reduce_output={}
for every entry {j:(i, PageRank(i)/length(adjacent node list))} in map_output:
put or sum up all values pagerank values for node j with its adjacent node list into reduce_output
return reduce_output



#### 2.示例演示

map的输入输出结果如下：

reduce的输入输出结果如下，输入是2个mapper的输出，输出的结果中更新了节点的pagerank值

reducer处理完了之后又将它的结果输入给mapper处理，直到迭代的次数超过了设定值或者两次迭代之后得到的所有节点的pagerank值之差的总和(也可以是取二范数)小于设定的阈值。

#### 3.示例的实验结果

(1)首先是使用Matlab采用幂法的方式计算出在p=1.0的情况下示例得到的结果 [它的主要作用是验证后面python版本的正确性]

matlab源码如下：

% Build the 4-node example graph used to validate the MapReduce version:
% G(i,j)=1 means page j links to page i.
n=4;
i=[2 3 4 3 4 4 1 2];
j=[1 1 1 2 2 3 3 4];
G=sparse(i,j,1,n,n);

[n,n] = size(G);
% L{j} = pages that page j links to; c(j) = out-degree of page j.
for j = 1:n
    L{j} = find(G(:,j));
    c(j) = length(L{j});
end

% Power method
p=1.0;            % no damping, to match the plain MapReduce iteration
delta = (1-p)/n;  % zero here because p = 1.0
x = ones(n,1)/n;  % start from the uniform distribution
z = zeros(n,1);
cnt = 0;
while max(abs(x-z)) > .0001
    z = x;
    x = zeros(n,1);
    for j = 1:n
        if c(j) == 0
            x = x + z(j)/n;  % dangling page: share rank with every page
        else
            x(L{j}) = x(L{j}) + z(j)/c(j);  % spread rank evenly over outlinks
        end
    end
    x = p*x + delta;
    cnt = cnt+1;
end
sprintf('pagerank result:')
x


0.1072
0.3571
0.2143
0.3214


(2)matlab版本的page rank没有采用mapreduce的思想进行迭代，所以我另外写了一个python版本的利用mapreduce思想实现的pagerank算法(注：我并没有使用python的map和reduce函数去实现，而是使用更加容易明白的实现)，使用的阈值为0.0001，最多迭代的次数为100次。

# coding=utf-8

__author__ = 'hujiawei'
__doc__ = 'pagerank mapreduce'

class Node:
    """A graph vertex: an identifier plus its current pagerank value."""

    def __init__(self,id,pk):
        # NOTE(review): `id` shadows the builtin, but renaming it would
        # change the constructor's keyword interface, so it is kept.
        self.id=id  # node identifier
        self.pk=pk  # current pagerank value

def pk_map(map_input):
    """Mapper step: each node splits its pagerank evenly among its outlinks.

    Reconstructed: the published body lost its interior lines (empty
    if/else).  Each node contributes pk/len(outlinks) to every node it
    links to; contributions to the same target are summed.

    map_input  -- dict {node: [nodes it links to, ...]}
    returns    -- dict {target node: summed contributions}
    """
    map_output = {}
    for node, outlinks in map_input.items():
        # Every outlink receives an equal share of this node's pagerank.
        share = node.pk / len(outlinks)
        for link in outlinks:
            if link in map_output:
                map_output[link] += share
            else:
                map_output[link] = share
    return map_output

def pk_reduce(reduce_input):
    """Reducer step: fold every mapper's contributions into the nodes.

    reduce_input -- iterable of dicts {node: contribution}; each node's
    pk attribute is incremented in place by every contribution it received.
    """
    for partial_result in reduce_input:
        for target_node, contribution in partial_result.items():
            target_node.pk += contribution

def pk_clear(nodes):
    """Reset every node's pagerank to zero before a reduce pass."""
    for current in nodes:
        current.pk = 0

def pk_last(nodes):
    """Return fresh Node copies snapshotting each node's current pagerank.

    Used so convergence can be measured against the previous iteration.
    """
    return [Node(item.id, item.pk) for item in nodes]

def pk_diff(nodes,lastnodes):
    """Return the summed absolute change between two pagerank snapshots.

    Prints each (current, previous) pair for tracing, then accumulates
    the L1 distance used as the convergence measure.
    """
    total_change = 0
    for current, previous in zip(nodes, lastnodes):
        print('node pk %f, last node pk %f ' % (current.pk, previous.pk))
        total_change += abs(current.pk - previous.pk)
    return total_change

def pk_test1():
    # Driver for the 4-node example: alternates map and reduce rounds
    # until the total pagerank change drops below the threshold or the
    # iteration cap is reached.  (Python 2: note the `print x` statements.)
    node1 = Node(1, 0.25)
    node2 = Node(2, 0.25)
    node3 = Node(3, 0.25)
    node4 = Node(4, 0.25)
    nodes = [node1, node2, node3, node4]
    threshold = 0.0001  # convergence threshold on the summed L1 change
    max_iters = 100     # iteration cap

    for iter_count in range(max_iters):
        iter_count += 1  # 1-based count for the trace output
        # Snapshot the current pageranks so convergence can be measured.
        lastnodes=pk_last(nodes)
        print('============ map count %d =================' % (iter_count))
        # The adjacency data is split across two mapper inputs, mimicking
        # how MapReduce would partition the graph.
        in1 = {node1: [node2, node3, node4], node2: [node3, node4]}
        in2 = {node3: [node1, node4], node4: [node2]}

        mapout1 = pk_map(in1)
        mapout2 = pk_map(in2)

        for node, value in mapout1.items():
            print str(node.id) + ' ' + str(value)

        for node, value in mapout2.items():
            print str(node.id) + ' ' + str(value)

        print('============ reduce count %d =================' % (iter_count))

        reducein = [mapout1, mapout2]
        pk_clear(nodes)  # zero the ranks; the reducer re-accumulates them
        pk_reduce(reducein)

        for node in nodes:
            print str(node.id) + ' ' + str(node.pk)

        diff=pk_diff(nodes,lastnodes)
        if diff < threshold:
            break

if __name__ == '__main__':
    pk_test1()


1 0.107138774577
2 0.35712924859
3 0.214296601128
4 0.321435375705


OK，差不多了，希望对需要理解PageRank算法的人有帮助！ :-)

[链接]

356 声望
42 粉丝
0 条评论