You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
523 lines
20 KiB
Python
523 lines
20 KiB
Python
import os
|
|
import wget
|
|
import gzip
|
|
import time
|
|
import scipy as sp
|
|
import numpy as np
|
|
import pandas as pd
|
|
import networkx as nx
|
|
from os.path import exists
|
|
from typing import Literal
|
|
|
|
def load_data(dataset: Literal["Stanford", "NotreDame", "BerkStan"]) -> nx.Graph:
    """Download (if needed), decompress and load one of the SNAP web graphs.

    Parameters
    ----------
    dataset : Literal["Stanford", "BerkStan", "NotreDame"]
        The dataset to load.

    Returns
    -------
    nx.Graph
        The graph of the dataset loaded. It is built as a directed graph
        (``nx.DiGraph``) with integer node labels.

    Raises
    ------
    ValueError
        If the dataset is not valid. The name is checked before anything is
        created on disk or downloaded.

    Notes
    -----
    The datasets are downloaded from the following links:

    http://snap.stanford.edu/data/web-NotreDame.html
    http://snap.stanford.edu/data/web-Stanford.html
    http://snap.stanford.edu/data/web-BerkStan.html

    Both the archive and the decompressed edge list are cached in the
    ``data`` folder (relative to the current working directory); a file that
    is already present is neither downloaded nor decompressed again.
    """
    # Function-scope import so this block does not depend on the module header.
    import shutil

    # Validate first: an invalid name must not leave a stray "data" folder.
    if dataset not in ("Stanford", "NotreDame", "BerkStan"):
        raise ValueError("Invalid dataset. Please choose a valid dataset.")

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("data", exist_ok=True)

    archive_path = f"data/Web-{dataset}.txt.gz"
    edgelist_path = f"data/Web-{dataset}.txt"

    # Download the dataset
    if not exists(archive_path):
        print(f"\nDownloading the dataset {dataset}...")
        wget.download(f"http://snap.stanford.edu/data/web-{dataset}.txt.gz", out=archive_path)
    else:
        print(f"\nThe dataset {dataset} is already downloaded.")

    # Unzip the dataset, streaming in chunks instead of reading the whole
    # decompressed file into memory at once.
    if not exists(edgelist_path):
        print(f"\nUnzipping the dataset {dataset}...")
        with gzip.open(archive_path, "rb") as f_in, open(edgelist_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Create the graph
    print(f"\nCreating the graph of the dataset {dataset}...\n")
    G_dataset = nx.read_edgelist(edgelist_path, create_using=nx.DiGraph(), nodetype=int)
    print(f"\tNumber of nodes: {G_dataset.number_of_nodes()}")
    print(f"\tNumber of edges: {G_dataset.number_of_edges()}")

    return G_dataset
|
|
|
|
|
|
def google_matrix(G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None) -> np.matrix:
    """Build the dense Google matrix of a graph (NetworkX implementation).

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs will be converted to a directed
        graph with two directed edges for each undirected edge.

    alpha : float
        The damping factor.

    personalization : dict, optional
        The "personalization vector": a dictionary mapping some subset of the
        graph nodes to a personalization value. Nodes missing from the
        dictionary get the value zero, and at least one value must be
        non-zero. When omitted, a uniform distribution is used.

    nodelist : list, optional
        The rows and columns are ordered according to the nodes in nodelist.
        If nodelist is None, the ordering is the iteration order of G.

    weight : key, optional
        Edge data key to use as weight. If None weights are set to 1.

    dangling : dict, optional
        The outedges to be assigned to any "dangling" nodes, i.e., nodes
        without any outedges. The dict key is the node the outedge points to
        and the dict value is the weight of that outedge. By default,
        dangling nodes are given outedges according to the personalization
        vector (uniform if not specified). This must be selected to result in
        an irreducible transition matrix (see notes below). It may be common
        to have the dangling dict be the same as the personalization dict.

    Returns
    -------
    A : NumPy matrix
        Google matrix of the graph.

    Raises
    ------
    ZeroDivisionError
        If a personalization dict is given and its values sum to zero.

    Notes
    -----
    DO NOT USE THIS FUNCTION FOR LARGE GRAPHS. It's memory intensive.

    The matrix returned represents the transition matrix that describes the
    Markov chain used in PageRank. For PageRank to converge to a unique
    solution (i.e., a unique stationary distribution in a Markov chain), the
    transition matrix must be irreducible. In other words, it must be that
    there exists a path between every pair of nodes in the graph, or else
    there is the potential of "rank sinks."
    """
    order = list(G) if nodelist is None else nodelist

    adjacency = nx.to_numpy_array(G, nodelist=order, weight=weight)
    n_nodes = len(G)
    if n_nodes == 0:
        # TODO: Remove np.asmatrix wrapper in version 3.0
        return np.asmatrix(adjacency)

    # Teleportation (personalization) distribution.
    if personalization is not None:
        teleport = np.array([personalization.get(node, 0) for node in order], dtype=float)
        if teleport.sum() == 0:
            raise ZeroDivisionError
        teleport /= teleport.sum()
    else:
        teleport = np.repeat(1.0 / n_nodes, n_nodes)

    # Distribution that replaces the (empty) row of every dangling node.
    if dangling is not None:
        sink_row = np.array([dangling.get(node, 0) for node in order], dtype=float)
        sink_row /= sink_row.sum()
    else:
        sink_row = teleport

    # Rows whose out-weight is zero belong to dangling nodes.
    sink_rows = np.where(adjacency.sum(axis=1) == 0)[0]
    adjacency[sink_rows] = sink_row

    # Scale every row so that it sums to 1.
    adjacency /= adjacency.sum(axis=1)[:, np.newaxis]

    blended = alpha * adjacency + (1 - alpha) * teleport
    return np.asmatrix(blended)
|
|
|
|
|
|
def google_matrix_sparse(G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None):
    """Revised NetworkX implementation for sparse matrices.

    Returns the Ptilde matrix of the graph instead of the Google matrix,
    together with the personalization vector.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs will be converted to a directed
        graph with two directed edges for each undirected edge.

    alpha : float
        The damping factor. Accepted for signature parity with
        ``google_matrix``; it is not used by this function.

    personalization : dict, optional
        The "personalization vector": a dictionary mapping some subset of the
        graph nodes to a personalization value. Nodes missing from the
        dictionary get the value zero, and at least one value must be
        non-zero. When omitted, a uniform distribution is used.

    nodelist : list, optional
        The rows and columns are ordered according to the nodes in nodelist.
        If nodelist is None, the ordering is the iteration order of G.

    weight : key, optional
        Edge data key to use as weight. If None weights are set to 1.

    dangling : dict, optional
        The outedges to be assigned to any "dangling" nodes, i.e., nodes
        without any outedges. The dict key is the node the outedge points to
        and the dict value is the weight of that outedge. When omitted,
        every dangling node links uniformly to all nodes (independently of
        ``personalization``).

    Returns
    -------
    A : SciPy sparse matrix
        Row-normalised adjacency matrix in which the rows of dangling nodes
        have been replaced by the dangling distribution (Ptilde).
    p : sparse array of shape (1, N) or 1-D NumPy array
        The normalised personalization vector. It is a ``lil_array`` when
        ``personalization`` is None and a dense array otherwise.

    For an empty graph a single value, ``np.asmatrix(A)``, is returned
    instead of the pair (kept for backward compatibility).

    Raises
    ------
    ZeroDivisionError
        If a personalization dict is given and its values sum to zero.

    Notes
    -----
    This matrix is, strictly speaking, not the Google matrix but the Ptilde
    matrix described in the paper [1].

    Every dangling row is stored explicitly, so it holds one entry per
    non-zero dangling weight (N entries with the default).
    NOTE(review): for graphs with many dangling nodes this may need far more
    memory than the original edge list - confirm before using on such data.

    References
    ----------
    [1] Zhao-Li Shen, Meng Su, Bruno Carpentieri, and Chun Wen. Shifted
    power-gmres method accelerated by extrapolation for solving pagerank with
    multiple damping factors. Applied Mathematics and Computation,
    420:126799, 2022
    """
    if nodelist is None:
        nodelist = list(G)

    # A float matrix is required: with an integer dtype, fractional edge
    # weights and the normalised dangling weights assigned below would be
    # truncated (mostly to zero).
    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, weight=weight, format="lil", dtype=float)

    N = len(G)
    if N == 0:
        return np.asmatrix(A)

    # Personalization vector
    if personalization is None:
        # Build from an explicit (1, N) array; the sparse constructor is not
        # asked to interpret 1-D input.
        p = sp.sparse.lil_array(np.repeat(1.0 / N, N).reshape(1, N))
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        if p.sum() == 0:
            raise ZeroDivisionError
        p /= p.sum()

    # Dangling nodes
    if dangling is None:
        # A row of ones becomes the uniform distribution after normalisation.
        dangling_weights = np.ones(N, dtype=float)
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
        dangling_weights /= dangling_weights.sum()

    # Assign dangling_weights to any dangling nodes (nodes with no out links).
    # Explicit integer row indices are used instead of a nested boolean mask.
    out_weight = np.asarray(A.sum(axis=1), dtype=float).ravel()
    dangling_rows = np.flatnonzero(out_weight == 0)
    if dangling_rows.size:
        A[dangling_rows, :] = dangling_weights

    # Normalize rows: left-multiply by diag(1 / row sum). Rows that still sum
    # to zero keep a zero scale factor instead of producing inf.
    row_sums = np.asarray(A.sum(axis=1), dtype=float).ravel()
    r_inv = np.zeros_like(row_sums)
    has_weight = row_sums != 0
    r_inv[has_weight] = 1.0 / row_sums[has_weight]
    R = sp.sparse.diags(r_inv)
    A = R.dot(A)

    return A, p
|
|
|
|
|
|
def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", dangling=None):
    """Compute PageRank with a dense eigen-decomposition (NetworkX implementation).

    PageRank computes a ranking of the nodes in the graph G based on the
    structure of the incoming links. It was originally designed as an
    algorithm to rank web pages.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs will be converted to a directed
        graph with two directed edges for each undirected edge.

    alpha : float, optional
        Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
        The "personalization vector": a dictionary mapping some subset of the
        graph nodes to a personalization value. Nodes missing from the
        dictionary get the value zero, and at least one value must be
        non-zero. When omitted, a uniform distribution is used.

    weight : key, optional
        Edge data key to use as weight. If None weights are set to 1.

    dangling : dict, optional
        The outedges to be assigned to any "dangling" nodes, i.e., nodes
        without any outedges. The dict key is the node the outedge points to
        and the dict value is the weight of that outedge. By default,
        dangling nodes are given outedges according to the personalization
        vector (uniform if not specified). This must be selected to result in
        an irreducible transition matrix (see notes under google_matrix). It
        may be common to have the dangling dict be the same as the
        personalization dict.

    Returns
    -------
    pagerank : dictionary
        Dictionary of nodes with PageRank as value. Empty for an empty graph.

    Notes
    -----
    The eigenvector calculation uses NumPy's interface to the LAPACK
    eigenvalue solvers. This will be the fastest and most accurate for small
    graphs.
    """
    if len(G) == 0:
        return {}

    transition = google_matrix(
        G, alpha, personalization=personalization, weight=weight, dangling=dangling
    )

    # Left eigenvectors of the Google matrix are the eigenvectors of its
    # transpose (NumPy's LAPACK-backed solver).
    spectrum, basis = np.linalg.eig(transition.T)
    dominant = np.argmax(spectrum)

    # Take the eigenvector of the largest eigenvalue and scale it to sum to 1.
    principal = np.array(basis[:, dominant]).flatten().real
    scores = principal / principal.sum()
    return {node: float(score) for node, score in zip(G, scores)}
|
|
|
|
|
|
def pagerank(
    G,
    alpha=0.85,
    personalization=None,
    max_iter=10000,
    tol=1.0e-9,
    nstart=None,
    weight="weight",
    dangling=None,
):
    """Compute PageRank by power iteration (slightly modified NetworkX implementation).

    PageRank computes a ranking of the nodes in the graph G based on the
    structure of the incoming links. It was originally designed as an
    algorithm to rank web pages.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs will be converted to a directed
        graph with two directed edges for each undirected edge.

    alpha : float, optional
        Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
        The "personalization vector": a dictionary mapping some subset of the
        graph nodes to a personalization value. Nodes missing from the
        dictionary get the value zero, and at least one value must be
        non-zero. When omitted, every node gets 1/N where N is the number of
        nodes in G.

    max_iter : integer, optional
        Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
        Error tolerance used to check convergence in power method solver.
        Iteration stops when the l1 change between two iterates is below
        ``N * tol``.

    nstart : dictionary, optional
        Starting value of PageRank iteration for each node.

    weight : key, optional
        Edge data key to use as weight. If None weights are set to 1.

    dangling : dict, optional
        The outedges to be assigned to any "dangling" nodes, i.e., nodes
        without any outedges. The dict key is the node the outedge points to
        and the dict value is the weight of that outedge. By default,
        dangling nodes are given outedges according to the personalization
        vector (uniform if not specified). This must be selected to result in
        an irreducible transition matrix (see notes under google_matrix). It
        may be common to have the dangling dict be the same as the
        personalization dict.

    Returns
    -------
    pagerank : dictionary
        Dictionary of nodes with PageRank as value.
    iterations : integer
        Iteration counter. It starts at 1 and is incremented before every
        update, so it is one more than the number of updates performed.
    tol : float
        The tolerance that was passed in.

    For an empty graph only an empty dictionary is returned (no tuple).

    Notes
    -----
    The eigenvector calculation uses power iteration with a SciPy sparse
    matrix representation.

    Raises
    ------
    ZeroDivisionError
        If a personalization dict is given and its values sum to zero.
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance within
        the specified number of iterations of the power iteration method.
    """
    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G)
    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, weight=weight, dtype=float)
    S = A.sum(axis=1)  # S[i] is the total weight of the edges going out of node i
    S[S != 0] = 1.0 / S[S != 0]  # S[i] is now the reciprocal of that total (0 stays 0)
    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape))  # diagonal scaling matrix
    A = Q @ A  # rows with outgoing edges now sum to 1; dangling rows stay all-zero

    # Initial vector: uniform unless a starting dictionary is given.
    if nstart is None:
        x = np.repeat(1.0 / N, N)
    else:
        x = np.array([nstart.get(n, 0) for n in nodelist], dtype=float)
        x /= x.sum()

    # Personalization vector: uniform unless a dictionary is given.
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        if p.sum() == 0:  # an all-zero personalization vector cannot be normalised
            raise ZeroDivisionError
        p /= p.sum()

    # Dangling nodes: their rank is redistributed according to
    # dangling_weights, which defaults to the personalization vector.
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = np.where(S == 0)[0]  # indices of the nodes without out-edges

    # Power iteration: make up to max_iter iterations. The counter mirrors the
    # historical numbering (first update is reported as 2) so returned values
    # stay the same; it no longer shadows the builtin iter().
    for iterations in range(2, max_iter + 2):
        xlast = x
        # ndarray.sum() keeps the reduction inside NumPy instead of looping
        # over the elements in Python on every iteration.
        x = alpha * (x @ A + x[is_dangling].sum() * dangling_weights) + (1 - alpha) * p

        # Check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x))), iterations, tol

    # No iterate met the tolerance within max_iter updates.
    raise nx.PowerIterationFailedConvergence(max_iter)
|
|
|
|
|
|
def shifted_pow_pagerank(G, alphas=None, max_iter=10000, tol=1.0e-9):
    """Compute PageRank for several damping factors with the shifted power method.

    Implements Algorithm 1 of the paper [1]: one sequence of matrix-vector
    products is shared by all damping factors.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs will be converted to a directed
        graph.

    alphas : list, optional
        A list of alpha values to use in the shifted power method. The
        default (when None) is [0.85, 0.9, 0.95, 0.99].

    max_iter : integer, optional
        Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
        Error tolerance used to check convergence in power method solver.
        A damping factor is considered converged once the 2-norm of its
        residual drops below this value.

    Returns
    -------
    pagerank : sparse matrix
        ``lil_matrix`` of shape (N, len(alphas)). Each column is the PageRank
        vector for the corresponding alpha value.
    mv : integer
        The number of matrix-vector multiplications used in the power method.
    alphas : list
        The damping factors that were used.
    tol : float
        The tolerance that was passed in.

    Notes
    -----
    With P the column-stochastic transition matrix and v the personalization
    vector, the power iterates satisfy ``x_{k+1} = x_k + r_k`` with
    ``r_k = alpha**(k+1) * P**k (P v - v)``. Only ``mu_k = P**k (P v - v)``
    needs a matrix-vector product, and it does not depend on alpha.

    ``google_matrix_sparse`` returns a row-normalised matrix, so its
    transpose is used as P here. Applying the row-normalised matrix directly
    to a uniform v reproduces v, which makes the first residual zero.

    Raises
    ------
    NetworkXException
        If the graph is empty.
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance within
        ``max_iter`` iterations.

    References
    ----------
    [1] Zhao-Li Shen, Meng Su, Bruno Carpentieri, and Chun Wen. Shifted
    power-gmres method accelerated by extrapolation for solving pagerank with
    multiple damping factors. Applied Mathematics and Computation,
    420:126799, 2022
    """
    # A fresh list per call: a list literal as default would be shared between
    # calls and is handed back to the caller in the return value.
    if alphas is None:
        alphas = [0.85, 0.9, 0.95, 0.99]

    N = len(G)
    if N == 0:
        raise nx.NetworkXException("Empty graph.")

    A, v = google_matrix_sparse(G)

    # Column-stochastic operator, in CSR form for the repeated products.
    P = A.T.tocsr()

    # Work with a dense 1-D personalization vector; the iterates are dense.
    if sp.sparse.issparse(v):
        v = v.toarray()
    v = np.asarray(v, dtype=float).ravel()

    n_alphas = len(alphas)

    # One column per damping factor, each starting from v.
    x = np.repeat(v[:, np.newaxis], n_alphas, axis=1)
    res = np.ones(n_alphas)  # residual norm per damping factor

    mu = P @ v - v
    mv = 1  # number of matrix-vector multiplications
    mu_norm = np.linalg.norm(mu)

    for i, alpha_i in enumerate(alphas):
        # r_0 = alpha * mu, hence ||r_0|| = alpha * ||mu||
        res[i] = alpha_i * mu_norm
        if res[i] >= tol:
            x[:, i] += alpha_i * mu

    for _ in range(max_iter):  # the while loop of the paper
        err = np.max(res)
        if err < tol:
            print("Convergence reached with, ", mv, " matrix-vector multiplications")
            return sp.sparse.lil_matrix(x), mv, alphas, tol

        mu = P @ mu
        mv += 1
        mu_norm = np.linalg.norm(mu)

        for i, alpha_i in enumerate(alphas):
            if res[i] < tol:
                continue  # this damping factor has already converged
            # mu now equals P**(mv-1) (P v - v), so the residual carries the
            # factor alpha**mv.
            scale = np.power(alpha_i, mv)
            res[i] = scale * mu_norm
            if res[i] >= tol:
                # Accumulate onto the previous iterate (x_{k+1} = x_k + r_k).
                x[:, i] += scale * mu

    raise nx.PowerIterationFailedConvergence(max_iter)  # not converged within max_iter
|