new main and algo file, working numerical test on algo1

2 years ago · e2826e2a33
parent 097fa7cf9f
commit e2826e2a33
2 changed files with 399 additions and 0 deletions
--- a/algo.py
+++ b/algo.py
@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+
+# Importing the libraries
+import os
+import wget
+import gzip
+import time
+import scipy as sp
+import numpy as np
+import pandas as pd
+import networkx as nx
+from os.path import exists
+from scipy.sparse import *
+import plotly.graph_objs as go
+from typing import Literal
+
+def load_data(dataset: Literal["Stanford", "BerkStan"]) -> nx.Graph:
+
+    """Load the dataset and return a graph.
+
+    Parameters
+    ----------
+    dataset : Literal["Stanford", "BerkStan"]
+        The dataset to load.
+
+    Returns
+    -------
+    nx.Graph
+        The graph of the dataset.
+        data/web-Stanford.txt
+
+    """
+
+    # check if there is a data folder
+    if not exists(os.path.join(os.getcwd(), "data")):
+        os.mkdir(os.path.join(os.getcwd(), "data"))
+
+    # Download the dataset
+    if not exists(f"data/Web-{dataset}.txt.gz"):
+        print(f"\nDownloading the dataset {dataset}...")
+        wget.download(f"http://snap.stanford.edu/data/web-{dataset}.txt.gz", out=f"data/Web-{dataset}.txt.gz")
+    else:
+        print(f"\nThe dataset {dataset} is already downloaded.")
+
+    # unzip the dataset
+    if not exists(f"data/Web-{dataset}.txt"):
+        print(f"\nUnzipping the dataset {dataset}...")
+        with gzip.open(f"data/Web-{dataset}.txt.gz", "rb") as f_in:
+            with open(f"data/Web-{dataset}.txt", "wb") as f_out:
+                f_out.write(f_in.read())
+
+    # create the graph
+    print(f"\nCreating the graph of the dataset {dataset}...\n")
+    G_dataset = nx.read_edgelist(f"data/Web-{dataset}.txt", create_using=nx.DiGraph(), nodetype=int)
+
+    return G_dataset
+
+def google_matrix(G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None) -> np.matrix:
+
+
+    """Returns the Google matrix of the graph.
+
+    Parameters
+    ----------
+    G : graph
+      A NetworkX graph.  Undirected graphs will be converted to a directed
+      graph with two directed edges for each undirected edge.
+
+    alpha : float
+      The damping factor.
+
+    personalization: dict, optional
+      The "personalization vector" consisting of a dictionary with a
+      key some subset of graph nodes and personalization value each of those.
+      At least one personalization value must be non-zero.
+      If not specfiied, a nodes personalization value will be zero.
+      By default, a uniform distribution is used.
+
+    nodelist : list, optional
+      The rows and columns are ordered according to the nodes in nodelist.
+      If nodelist is None, then the ordering is produced by G.nodes().
+
+    weight : key, optional
+      Edge data key to use as weight.  If None weights are set to 1.
+
+    dangling: dict, optional
+      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
+      any outedges. The dict key is the node the outedge points to and the dict
+      value is the weight of that outedge. By default, dangling nodes are given
+      outedges according to the personalization vector (uniform if not
+      specified) This must be selected to result in an irreducible transition
+      matrix (see notes below). It may be common to have the dangling dict to
+      be the same as the personalization dict.
+
+    Returns
+    -------
+    A : NumPy matrix
+       Google matrix of the graph
+
+    Notes
+    -----
+    The matrix returned represents the transition matrix that describes the
+    Markov chain used in PageRank. For PageRank to converge to a unique
+    solution (i.e., a unique stationary distribution in a Markov chain), the
+    transition matrix must be irreducible. In other words, it must be that
+    there exists a path between every pair of nodes in the graph, or else there
+    is the potential of "rank sinks."
+
+    """
+    if nodelist is None:
+        nodelist = list(G)
+
+    A = nx.to_numpy_array(G, nodelist=nodelist, weight=weight)
+    N = len(G)
+    if N == 0:
+        # TODO: Remove np.asmatrix wrapper in version 3.0
+        return np.asmatrix(A)
+
+    # Personalization vector
+    if personalization is None:
+        p = np.repeat(1.0 / N, N)
+    else:
+        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
+        if p.sum() == 0:
+            raise ZeroDivisionError
+        p /= p.sum()
+
+    # Dangling nodes
+    if dangling is None:
+        dangling_weights = p
+    else:
+        # Convert the dangling dictionary into an array in nodelist order
+        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
+        dangling_weights /= dangling_weights.sum()
+    dangling_nodes = np.where(A.sum(axis=1) == 0)[0]
+
+    # Assign dangling_weights to any dangling nodes (nodes with no out links)
+    A[dangling_nodes] = dangling_weights
+
+    A /= A.sum(axis=1)[:, np.newaxis]  # Normalize rows to sum to 1
+
+    return np.asmatrix(alpha * A + (1 - alpha) * p)
+
+def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", dangling=None):
+    """Returns the PageRank of the nodes in the graph.
+
+    PageRank computes a ranking of the nodes in the graph G based on
+    the structure of the incoming links. It was originally designed as
+    an algorithm to rank web pages.
+
+    Parameters
+    ----------
+    G : graph
+      A NetworkX graph.  Undirected graphs will be converted to a directed
+      graph with two directed edges for each undirected edge.
+
+    alpha : float, optional
+      Damping parameter for PageRank, default=0.85.
+
+    personalization: dict, optional
+      The "personalization vector" consisting of a dictionary with a
+      key some subset of graph nodes and personalization value each of those.
+      At least one personalization value must be non-zero.
+      If not specfiied, a nodes personalization value will be zero.
+      By default, a uniform distribution is used.
+
+    weight : key, optional
+      Edge data key to use as weight.  If None weights are set to 1.
+
+    dangling: dict, optional
+      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
+      any outedges. The dict key is the node the outedge points to and the dict
+      value is the weight of that outedge. By default, dangling nodes are given
+      outedges according to the personalization vector (uniform if not
+      specified) This must be selected to result in an irreducible transition
+      matrix (see notes under google_matrix). It may be common to have the
+      dangling dict to be the same as the personalization dict.
+
+    Returns
+    -------
+    pagerank : dictionary
+       Dictionary of nodes with PageRank as value.
+
+
+    Notes
+    -----
+    The eigenvector calculation uses NumPy's interface to the LAPACK
+    eigenvalue solvers.  This will be the fastest and most accurate
+    for small graphs.
+
+    """
+    if len(G) == 0:
+        return {}
+    M = google_matrix(
+        G, alpha, personalization=personalization, weight=weight, dangling=dangling
+    )
+    # use numpy LAPACK solver
+    eigenvalues, eigenvectors = np.linalg.eig(M.T)
+    ind = np.argmax(eigenvalues)
+    # eigenvector of largest eigenvalue is at ind, normalized
+    largest = np.array(eigenvectors[:, ind]).flatten().real
+    norm = largest.sum()
+    return dict(zip(G, map(float, largest / norm)))
+
+def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-9, nstart=None, weight="weight", dangling=None,):
+
+    """
+    Returns the PageRank of the nodes in the graph.
+
+        PageRank computes a ranking of the nodes in the graph G based on
+        the structure of the incoming links. It was originally designed as
+        an algorithm to rank web pages.
+
+        Parameters
+        ----------
+        G : graph
+        A NetworkX graph.  Undirected graphs will be converted to a directed
+        graph with two directed edges for each undirected edge.
+
+        alpha : float, optional
+        Damping parameter for PageRank, default=0.85.
+
+        personalization: dict, optional
+        The "personalization vector" consisting of a dictionary with a
+        key some subset of graph nodes and personalization value each of those.
+        At least one personalization value must be non-zero.
+        If not specfiied, a nodes personalization value will be zero.
+        By default, a uniform distribution is used.
+
+        max_iter : integer, optional
+        Maximum number of iterations in power method eigenvalue solver.
+
+        tol : float, optional
+        Error tolerance used to check convergence in power method solver.
+
+        nstart : dictionary, optional
+        Starting value of PageRank iteration for each node.
+
+        weight : key, optional
+        Edge data key to use as weight.  If None weights are set to 1.
+
+        dangling: dict, optional
+        The outedges to be assigned to any "dangling" nodes, i.e., nodes without
+        any outedges. The dict key is the node the outedge points to and the dict
+        value is the weight of that outedge. By default, dangling nodes are given
+        outedges according to the personalization vector (uniform if not
+        specified) This must be selected to result in an irreducible transition
+        matrix (see notes under google_matrix). It may be common to have the
+        dangling dict to be the same as the personalization dict.
+
+        Returns
+        -------
+        pagerank : dictionary
+        Dictionary of nodes with PageRank as value
+
+        Notes
+        -----
+        The eigenvector calculation uses power iteration with a SciPy
+        sparse matrix representation.
+
+        Raises
+        ------
+        PowerIterationFailedConvergence
+            If the algorithm fails to converge to the specified tolerance
+            within the specified number of iterations of the power iteration
+            method.
+    """
+
+    N = len(G)
+    if N == 0:
+        return {}
+
+    nodelist = list(G)
+    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, weight=weight, dtype=float)
+    S = A.sum(axis=1) # S[i] is the sum of the weights of edges going out of node i
+    S[S != 0] = 1.0 / S[S != 0] # S[i] is now the sum of the weights of edges going into node i
+    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape)) # Q is the matrix of edge weights going into each node
+    A = Q @ A # A is now the "stochastic matrix"
+
+    # initial vector
+    if nstart is None: # if no initial vector is specified, start with a uniform vector
+        x = np.repeat(1.0 / N, N) # x is the vector of PageRank values
+    else: # if an initial vector is specified, normalize it
+        x = np.array([nstart.get(n, 0) for n in nodelist], dtype=float) # x is the vector of PageRank values
+        x /= x.sum() # normalize x
+
+    # Personalization vector
+    if personalization is None: # if no personalization vector is specified, use a uniform vector
+        p = np.repeat(1.0 / N, N) # p is the personalization vector
+    else: # if a personalization vector is specified, normalize it
+        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float) # p is the personalization vector
+        if p.sum() == 0: # if the personalization vector is all zeros, use a uniform vector
+            raise ZeroDivisionError
+        p /= p.sum() # normalize p
+    # Dangling nodes
+    if dangling is None: # if no dangling nodes are specified, use a uniform vector
+        dangling_weights = p # dangling_weights is the vector of dangling node weights
+    else:
+        # Convert the dangling dictionary into an array in nodelist order
+        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float) # dangling_weights is the vector of dangling node weights
+        dangling_weights /= dangling_weights.sum() # normalize dangling_weights
+    is_dangling = np.where(S == 0)[0] # is_dangling is the list of dangling nodes
+
+    # power iteration: make up to max_iter iterations
+    iter = 1
+    for _ in range(max_iter):
+        iter += 1
+        xlast = x # xlast is the previous vector of PageRank values
+        x = alpha * (x @ A + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p # x is the current vector of PageRank values
+        # check convergence, l1 norm
+        err = np.absolute(x - xlast).sum() # err is the error between the current and previous vectors of PageRank values
+        if err < N * tol: # if the error is small enough, stop iterating
+            return dict(zip(nodelist, map(float, x))), iter, alpha, tol # return the current vector of PageRank values
+    raise nx.PowerIterationFailedConvergence(max_iter) # if the error is not small enough, raise an error
+
+def shifted_pow_pagerank(G, alphas=[0.85, 0.9, 0.95, 0.99], max_iter=100, tol=1.0e-9):
+
+    N = len(G)
+    if N == 0:
+        return {}
+
+
+    nodelist = list(G)
+    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, dtype=float)
+    S = A.sum(axis=1) # S[i] is the sum of the weights of edges going out of node i
+    S[S != 0] = 1.0 / S[S != 0] # S[i] is now the sum of the weights of edges going into node i
+    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape)) # Q is the matrix of edge weights going into each node
+    A = Q
+
+
+    x = np.repeat(1.0 / N, N) # x is the vector of PageRank values
+    v = np.repeat(1.0 / N, N) # p is the personalization vector
+
+    mu = A @ v - v # mu is the vector of PageRank values for the random walk with restart
+
+    for i in range(len(alphas)):
+        # create a vector r of len(alphas) where r[i] = alpha[i] * mu
+        r = alphas[i] * mu
+        Res = np.linalg.norm(r, 2)
+
+        if Res >= tol:
+            x = r + v # update x
+
+    iter = 1
+    for _ in range(max_iter):
+        xlast = x
+        iter += 1
+        mu = A @ x - x
+        for i in range(len(alphas)):
+            r = alphas[i]**(iter+1) * mu
+            Res = np.linalg.norm(r, 2)
+            if Res >= tol:
+                x = r + x
+
+        err = np.absolute(x - xlast).sum() # err is the error between the current and previous vectors of PageRank values
+
+        if err < tol:
+            return dict(zip(nodelist, map(float, x))), iter, alphas, tol
+
+    raise nx.PowerIterationFailedConvergence(max_iter) # if the error is not small enough, raise an error
--- a/main.py
+++ b/main.py
@ -0,0 +1,39 @@
+from algo import *
+
+def choice(dataset_number):
+
+    if dataset_number == 1:
+        # run the algorithm on Web-Stanford dataset
+        G = load_data("Stanford")
+        return G
+    elif dataset_number == 2:
+        # run the algorithm on Web-BerkStan dataset
+        G = load_data("BerkStan")
+        return G
+    else:
+        raise ValueError("Invalid choice. Please choose a valid option.")
+
+# main function
+if __name__ == "__main__":
+
+    dataset_number = int(input("Choose the dataset to work with. The options are:\n\t [1] Web-Stanford\n\t [2] Web-BerkStan\nType your number of choice: "))
+
+    G = choice(dataset_number)
+
+    start1 = time.time()
+    prank, iterations, alpha, tol = pagerank(G)
+    end1 = time.time()
+    print("STANDARD PAGERANK ALGORITHM\n")
+    print("\tCPU time (s):", round(end1 - start1,1))
+    print("\tIterations:", iterations)
+    print("\tAlpha:", alpha)
+    print("\tTolerance:", tol)
+
+    start2 = time.time()
+    shifted_pagerank, iterations, alphas, tol = shifted_pow_pagerank(G)
+    end2 = time.time()
+    print("\nSHIFTED PAGERANK ALGORITHM\n")
+    print("\tCPU time (s):", round(end2 - start2,1))
+    print("\tIterations:", iterations)
+    print("\tAlphas:", alphas)
+    print("\tTolerance:", tol)