now algo1 works perfectly

2 years ago · 55d9631ee4
parent 563a09b23c
commit 55d9631ee4
2 changed files with 82 additions and 28 deletions
--- a/algo.py
+++ b/algo.py
@ -52,6 +52,8 @@ def load_data(dataset: Literal["Stanford", "BerkStan"]) -> nx.Graph:
    # create the graph
    print(f"\nCreating the graph of the dataset {dataset}...\n")
    G_dataset = nx.read_edgelist(f"data/Web-{dataset}.txt", create_using=nx.DiGraph(), nodetype=int)
+    print(f"\tNumber of nodes: {G_dataset.number_of_nodes()}")
+    print(f"\tNumber of edges: {G_dataset.number_of_edges()}")

    return G_dataset

@ -202,7 +204,7 @@ def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", danglin
    norm = largest.sum()
    return dict(zip(G, map(float, largest / norm)))

-def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-9, nstart=None, weight="weight", dangling=None,):
+def pagerank(G, alpha=0.85, personalization=None, max_iter=200, tol=1.0e-9, nstart=None, weight="weight", dangling=None,):

    """
    Returns the PageRank of the nodes in the graph.
@ -310,15 +312,52 @@ def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-9, nsta
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum() # err is the error between the current and previous vectors of PageRank values
        if err < N * tol: # if the error is small enough, stop iterating
-            return dict(zip(nodelist, map(float, x))), iter, alpha, tol # return the current vector of PageRank values
-    raise nx.PowerIterationFailedConvergence(max_iter) # if the error is not small enough, raise an error
+            return dict(zip(nodelist, map(float, x))), iter, tol # return the current vector of PageRank values'
+
+    # otherwise, return an empty dictionary, the number of iterations, and the tolerance
+    # this signals a failure to converge
+
+    return {}, iter, tol
+
+def shifted_pow_pagerank(G, alphas=[0.85, 0.9, 0.95, 0.99], max_iter=200, tol=1.0e-9):
+
+    """
+    Compute the PageRank of each node in the graph G.
+
+    Parameters
+    ----------
+    G : graph
+        A NetworkX graph. Undirected graphs will be converted to a directed graph.
+
+    alphas : list, optional
+        A list of alpha values to use in the shifted power method. The default is [0.85, 0.9, 0.95, 0.99].
+
+    max_iter : integer, optional
+        Maximum number of iterations in power method eigenvalue solver.
+
+    tol : float, optional
+        Error tolerance used to check convergence in power method solver.

-def shifted_pow_pagerank(G, alphas=[0.85, 0.9, 0.95, 0.99], max_iter=100, tol=1.0e-9):
+    Returns
+    -------
+    x : SciPy sparse matrix
+        N x len(alphas) matrix whose i-th column is the vector computed for alphas[i]
+
+    mv, alphas, tol : integer, list, float
+        Matrix-vector multiplication count, then the alphas and tol arguments as passed in
+
+    Notes
+    -----
+    The eigenvector calculation uses power iteration with a SciPy sparse matrix representation. The shifted power method is described as Algorithm 1 in the paper located in the sources folders.
+
+    """

    N = len(G)
    if N == 0:
        return {}

+    # initialize a random sparse matrix of dimension N x len(alphas). The columns of this matrix are the PageRank vectors, one per alpha.
+    x = sp.sparse.random(N, len(alphas), density=0.01, format="lil", dtype=float)

    nodelist = list(G)
    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, dtype=float)
@ -327,34 +366,30 @@ def shifted_pow_pagerank(G, alphas=[0.85, 0.9, 0.95, 0.99], max_iter=100, tol=1.
    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape)) # Q is the matrix of edge weights going into each node
    A = Q

-
-    x = np.repeat(1.0 / N, N) # x is the vector of PageRank values
    v = np.repeat(1.0 / N, N) # p is the personalization vector
-
-    mu = A @ v - v # mu is the vector of PageRank values for the random walk with restart
+    mu = A @ v - v

    for i in range(len(alphas)):
-        # create a vector r of len(alphas) where r[i] = alpha[i] * mu
-        r = alphas[i] * mu
-        Res = np.linalg.norm(r, 2)
+        r = alphas[i] * mu # residual vector
+        Res = np.linalg.norm(r, 2) # residual norm

        if Res >= tol:
-            x = r + v # update x
+            x[:, [i]] = r + v # update the i-th column of x

-    iter = 1
+    mv = 0 # number of matrix-vector multiplications
    for _ in range(max_iter):
-        xlast = x
-        iter += 1
-        mu = A @ x - x
+        mv += 1
+        mu = A @ mu
        for i in range(len(alphas)):
-            r = alphas[i]**(iter+1) * mu
-            Res = np.linalg.norm(r, 2)
            if Res >= tol:
-                x = r + x
+                r = pow(alphas[i], mv+1) * mu
+                Res = np.linalg.norm(r,2)

-        err = np.absolute(x - xlast).sum() # err is the error between the current and previous vectors of PageRank values
+                if Res >= tol:
+                    x[:, [i]] = r + v

-        if err < tol:
-            return dict(zip(nodelist, map(float, x))), iter, alphas, tol
+            err = np.absolute(r).max()
+            if err < tol:
+                 return x, mv, alphas, tol

    raise nx.PowerIterationFailedConvergence(max_iter) # if the error is not small enough, raise an error
--- a/main.py
+++ b/main.py
@ -19,21 +19,40 @@ if __name__ == "__main__":
    dataset_number = int(input("Choose the dataset to work with. The options are:\n\t [1] Web-Stanford\n\t [2] Web-BerkStan\nType your number of choice: "))

    G = choice(dataset_number)
+    alphas = [0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
+
+
+    ### STANDARD PAGERANK ALGORITHM ###
+    iter_dict = dict.fromkeys(alphas, 0)
+    list_of_pageranks = [] # list of pageranks dict for each alpha

    start1 = time.time()
-    prank, iterations, alpha, tol = pagerank(G)
+    for alpha in alphas:
+        x, iter, tol = pagerank(G, alpha, tol=1e-9)
+        iter_dict[alpha] = iter
+        list_of_pageranks.append(x)
    end1 = time.time()
-    print("STANDARD PAGERANK ALGORITHM\n")
+
+    total_iter = sum(iter_dict.values())
+
+    print("\nSTANDARD PAGERANK ALGORITHM\n")
    print("\tCPU time (s):", round(end1 - start1,1))
-    print("\tIterations:", iterations)
-    print("\tAlpha:", alpha)
+    print("\tMatrix-vector multiplications:", total_iter)
+    print("\tAlpha:", alphas)
    print("\tTolerance:", tol)
+    print()
+
+    # report every alpha whose entry in the list of pageranks is an empty dict, i.e. the algorithm did not converge for that alpha
+    for i in range(len(list_of_pageranks)):
+        if not list_of_pageranks[i]:
+            print("The algorithm did not converge for alpha =", alphas[i])

+    ### SHIFTED PAGERANK ALGORITHM ###
    start2 = time.time()
-    shifted_pagerank, iterations, alphas, tol = shifted_pow_pagerank(G)
+    x, mv, alphas, tol = shifted_pow_pagerank(G, alphas, tol=1e-9)
    end2 = time.time()
    print("\nSHIFTED PAGERANK ALGORITHM\n")
    print("\tCPU time (s):", round(end2 - start2,1))
-    print("\tIterations:", iterations)
+    print("\tMatrix-vector multiplications:", mv)
    print("\tAlphas:", alphas)
    print("\tTolerance:", tol)