# NOTE: removed Git web-UI page chrome ("topics" hints, line/size counters)
# that was accidentally captured when this file was scraped.
#!/usr/bin/env python3
|
|
|
|
# Importing the libraries
|
|
import os
|
|
import wget
|
|
import gzip
|
|
import time
|
|
import scipy as sp
|
|
import numpy as np
|
|
import pandas as pd
|
|
import networkx as nx
|
|
from os.path import exists
|
|
from scipy.sparse import *
|
|
import plotly.graph_objs as go
|
|
from typing import Literal
|
|
|
|
def load_data(dataset: Literal["Stanford", "BerkStan"]) -> nx.Graph:
    """Download (if needed), decompress and load a SNAP web graph.

    The edge list is fetched from http://snap.stanford.edu/data/ into the
    local ``data/`` directory, unzipped once, and parsed into a directed
    graph with integer node ids.

    Parameters
    ----------
    dataset : Literal["Stanford", "BerkStan"]
        The dataset to load ("Stanford" -> web-Stanford, "BerkStan" ->
        web-BerkStan).

    Returns
    -------
    nx.Graph
        The graph of the dataset (concretely an nx.DiGraph, built from
        data/Web-<dataset>.txt).
    """
    # Make sure the local data directory exists.
    data_dir = os.path.join(os.getcwd(), "data")
    if not exists(data_dir):
        os.mkdir(data_dir)

    # Download the compressed edge list once; skip if already present.
    if not exists(f"data/Web-{dataset}.txt.gz"):
        print(f"\nDownloading the dataset {dataset}...")
        wget.download(f"http://snap.stanford.edu/data/web-{dataset}.txt.gz", out=f"data/Web-{dataset}.txt.gz")
    else:
        print(f"\nThe dataset {dataset} is already downloaded.")

    # Decompress the archive once; skip if the plain-text file already exists.
    if not exists(f"data/Web-{dataset}.txt"):
        print(f"\nUnzipping the dataset {dataset}...")
        with gzip.open(f"data/Web-{dataset}.txt.gz", "rb") as f_in:
            with open(f"data/Web-{dataset}.txt", "wb") as f_out:
                f_out.write(f_in.read())

    # Parse the edge list into a directed graph with integer node ids.
    print(f"\nCreating the graph of the dataset {dataset}...\n")
    G_dataset = nx.read_edgelist(f"data/Web-{dataset}.txt", create_using=nx.DiGraph(), nodetype=int)

    return G_dataset
|
|
|
|
def google_matrix(G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None) -> np.matrix:
    """Build the (dense) Google matrix of the graph.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs are treated as directed graphs
        with two opposite edges per undirected edge.
    alpha : float
        The damping factor.
    personalization : dict, optional
        Personalization vector: maps a subset of nodes to non-negative
        values, at least one of them non-zero. Nodes absent from the dict
        get value 0. If omitted, a uniform distribution is used.
    nodelist : list, optional
        Row/column ordering of the matrix; defaults to G.nodes().
    weight : key, optional
        Edge-data key used as weight; if None all weights are 1.
    dangling : dict, optional
        Out-edge distribution assigned to dangling nodes (nodes with no
        out-edges): key = target node, value = edge weight. Defaults to the
        personalization vector. Must yield an irreducible transition matrix;
        it is common to reuse the personalization dict here.

    Returns
    -------
    A : NumPy matrix
        Google matrix of the graph.

    Notes
    -----
    The result is the transition matrix of the Markov chain underlying
    PageRank. For PageRank to have a unique stationary distribution this
    matrix must be irreducible (a path must exist between every pair of
    nodes), otherwise "rank sinks" are possible.
    """
    nodes = nodelist if nodelist is not None else list(G)

    A = nx.to_numpy_array(G, nodelist=nodes, weight=weight)
    N = len(G)
    if N == 0:
        # Empty graph: nothing to normalize.
        # TODO: Remove np.asmatrix wrapper in version 3.0
        return np.asmatrix(A)

    # Personalization vector (uniform unless supplied).
    if personalization is None:
        p = np.full(N, 1.0 / N)
    else:
        p = np.asarray([personalization.get(n, 0) for n in nodes], dtype=float)
        if p.sum() == 0:
            raise ZeroDivisionError
        p = p / p.sum()

    # Distribution assigned to rows with no out-links.
    if dangling is None:
        dangling_weights = p
    else:
        # Materialize the dangling dict in nodelist order and normalize.
        dangling_weights = np.asarray([dangling.get(n, 0) for n in nodes], dtype=float)
        dangling_weights = dangling_weights / dangling_weights.sum()

    # Rows whose out-weight sums to zero are the dangling nodes; give them
    # the dangling distribution so every row can be normalized.
    rows_without_outlinks = np.where(A.sum(axis=1) == 0)[0]
    A[rows_without_outlinks] = dangling_weights

    # Row-stochastic normalization.
    A = A / A.sum(axis=1)[:, np.newaxis]

    return np.asmatrix(alpha * A + (1 - alpha) * p)
|
|
|
|
def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", dangling=None):
    """Compute PageRank with a dense eigenvalue solver.

    PageRank ranks the nodes of G according to the structure of incoming
    links; it was originally designed to rank web pages.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs are treated as directed graphs
        with two opposite edges per undirected edge.
    alpha : float, optional
        Damping parameter, default=0.85.
    personalization : dict, optional
        Personalization vector: maps a subset of nodes to non-negative
        values, at least one non-zero; missing nodes get 0. Defaults to a
        uniform distribution.
    weight : key, optional
        Edge-data key used as weight; if None all weights are 1.
    dangling : dict, optional
        Out-edge distribution for dangling nodes (see google_matrix).
        Defaults to the personalization vector.

    Returns
    -------
    pagerank : dictionary
        Dictionary of nodes with PageRank as value.

    Notes
    -----
    Uses NumPy's interface to the LAPACK eigenvalue solvers — fastest and
    most accurate for small graphs.
    """
    if len(G) == 0:
        return {}

    M = google_matrix(
        G, alpha, personalization=personalization, weight=weight, dangling=dangling
    )

    # Dense LAPACK eigendecomposition of the transposed Google matrix.
    eigenvalues, eigenvectors = np.linalg.eig(M.T)
    dominant = np.argmax(eigenvalues)

    # The eigenvector of the largest eigenvalue, made real and normalized
    # so its entries sum to 1.
    principal = np.array(eigenvectors[:, dominant]).flatten().real
    principal = principal / principal.sum()

    return dict(zip(G, map(float, principal)))
|
|
|
|
def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-9, nstart=None, weight="weight", dangling=None,):
    """Compute PageRank by sparse power iteration.

    PageRank ranks the nodes of G according to the structure of incoming
    links; it was originally designed to rank web pages.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs are treated as directed graphs
        with two opposite edges per undirected edge.
    alpha : float, optional
        Damping parameter, default=0.85.
    personalization : dict, optional
        Personalization vector: maps a subset of nodes to non-negative
        values, at least one non-zero; missing nodes get 0. Defaults to a
        uniform distribution.
    max_iter : integer, optional
        Maximum number of power iterations.
    tol : float, optional
        Error tolerance for the convergence check.
    nstart : dictionary, optional
        Starting PageRank value for each node.
    weight : key, optional
        Edge-data key used as weight; if None all weights are 1.
    dangling : dict, optional
        Out-edge distribution for dangling nodes (see google_matrix).
        Defaults to the personalization vector.

    Returns
    -------
    tuple
        ``(pagerank, n_iter, alpha, tol)`` where ``pagerank`` is a dict of
        nodes with PageRank as value and ``n_iter`` is the number of power
        iterations performed. (The docstring previously claimed a bare
        dictionary; the tuple is what callers actually receive.)
        An empty graph returns ``{}``.

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within max_iter iterations.
    ZeroDivisionError
        If the personalization vector sums to zero.
    """
    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G)
    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, weight=weight, dtype=float)

    # Row-normalize A into the stochastic matrix: S holds each node's total
    # out-weight, replaced in place by its reciprocal (0 for dangling nodes),
    # then applied as a diagonal scaling Q.
    S = A.sum(axis=1)
    S[S != 0] = 1.0 / S[S != 0]
    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape))
    A = Q @ A

    # Initial vector: uniform, or the normalized nstart values.
    if nstart is None:
        x = np.repeat(1.0 / N, N)
    else:
        x = np.array([nstart.get(n, 0) for n in nodelist], dtype=float)
        x /= x.sum()

    # Personalization vector: uniform, or the normalized user-supplied dict.
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        if p.sum() == 0:
            # An all-zero personalization vector cannot be normalized.
            raise ZeroDivisionError("personalization vector sums to zero")
        p /= p.sum()

    # Dangling-node distribution: defaults to the personalization vector.
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order.
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = np.where(S == 0)[0]

    # Power iteration: x <- alpha * (x A + dangling mass) + (1 - alpha) * p.
    # `n_iter` renamed from `iter` (which shadowed the builtin) and now counts
    # the iterations actually performed (previously it started at 1 and was
    # incremented before the first iteration, over-reporting by one).
    n_iter = 0
    for _ in range(max_iter):
        n_iter += 1
        xlast = x
        x = alpha * (x @ A + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p
        # Convergence check: l1 distance between successive iterates,
        # scaled by the number of nodes.
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x))), n_iter, alpha, tol
    raise nx.PowerIterationFailedConvergence(max_iter)
|
|
|
|
def shifted_pow_pagerank(G, alphas=(0.85, 0.9, 0.95, 0.99), max_iter=100, tol=1.0e-9):
    """Shifted power iteration: PageRank for several damping factors at once.

    The shifted method shares one matrix-vector product per iteration across
    all damping factors in ``alphas``.

    Parameters
    ----------
    G : graph
        A NetworkX graph.
    alphas : sequence of float, optional
        Damping factors to solve for. (Changed from a mutable list default
        to a tuple; it is only read, never mutated, so behavior is the same.)
    max_iter : integer, optional
        Maximum number of power iterations.
    tol : float, optional
        Convergence tolerance.

    Returns
    -------
    tuple
        ``(pagerank, n_iter, alphas, tol)`` on convergence, where
        ``pagerank`` is a dict of nodes with PageRank as value. An empty
        graph returns ``{}``.

    Raises
    ------
    PowerIterationFailedConvergence
        If the method does not converge within max_iter iterations.
    """
    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G)
    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, dtype=float)

    # Row-normalize A exactly as in pagerank(): S is each node's out-weight,
    # replaced by its reciprocal (0 for dangling nodes), applied as a
    # diagonal scaling Q.
    S = A.sum(axis=1)
    S[S != 0] = 1.0 / S[S != 0]
    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape))
    # BUG FIX: previously A was overwritten with just Q (the diagonal matrix
    # of reciprocal out-weights), so the stochastic matrix was never formed.
    # The correct matrix is Q @ A, mirroring pagerank() above.
    A = Q @ A

    x = np.repeat(1.0 / N, N)  # PageRank iterate
    v = np.repeat(1.0 / N, N)  # uniform personalization vector

    # NOTE(review): pagerank() multiplies on the left (x @ A); here the
    # product is A @ v / A @ x. Confirm which convention the shifted method
    # intends — kept as written to preserve the original behavior.
    mu = A @ v - v

    # First shifted step: apply each alpha's residual.
    # NOTE(review): x is overwritten on every pass of this loop, so only the
    # last alpha whose residual exceeds tol effectively contributes; kept
    # as-is to preserve behavior.
    for i in range(len(alphas)):
        r = alphas[i] * mu
        Res = np.linalg.norm(r, 2)
        if Res >= tol:
            x = r + v

    # `n_iter` renamed from `iter`, which shadowed the builtin; the counting
    # scheme (start at 1, increment at the top of the loop) is preserved
    # because the exponent below depends on it.
    n_iter = 1
    for _ in range(max_iter):
        xlast = x
        n_iter += 1
        mu = A @ x - x
        for i in range(len(alphas)):
            r = alphas[i] ** (n_iter + 1) * mu
            Res = np.linalg.norm(r, 2)
            if Res >= tol:
                x = r + x

        # l1 distance between successive iterates.
        err = np.absolute(x - xlast).sum()

        if err < tol:
            return dict(zip(nodelist, map(float, x))), n_iter, alphas, tol

    raise nx.PowerIterationFailedConvergence(max_iter)
|