big restyle, last fixes before final release

main
Luca Lombardo 2 years ago
parent 248045eb54
commit 5808251a09

algo4_testing.ipynb vendored (333 lines removed)

@ -1,333 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import networkx as nx\n",
"import time\n",
"import math\n",
"import pandas as pd\n",
"import scipy as sp\n",
"import plotly.express as px\n",
"import plotly.graph_objs as go\n",
"from scipy.sparse import *\n",
"from scipy import linalg\n",
"from scipy.sparse.linalg import norm\n",
"from scipy.optimize import least_squares"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Arnoldi \n",
"\n",
"This is a copy of the algorithm defined and tested in the notebook `algo2_testing`. It's an implementation of the Algorithm 2 from the paper. It's needed in this notebook since this function is called by the `algo4` function. It's implemented to return exactly what's needed in the `algo4` function.\n",
"\n",
"Everything will be reorganized in the main.py file once everything is working."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def Arnoldi(A,v0,m):\n",
" v = v0\n",
" beta = np.linalg.norm(v)\n",
" v = v/beta\n",
" H = sp.sparse.lil_matrix((m+1,m)) \n",
" V = sp.sparse.lil_matrix((A.shape[0],m+1))\n",
" V[:,0] = v # each column of V is a vector v\n",
"\n",
" for j in range(m):\n",
" w = A @ v \n",
" for i in range(j):\n",
" H[i,j] = v.T @ w # v.T @ w is a 1x1 matrix, so it's O(1) in memory\n",
" w = w - H[i,j]*v \n",
" \n",
" H[j+1,j] = np.linalg.norm(w)\n",
"\n",
" if H[j+1,j] == 0:\n",
" # print(\"Arnoldi breakdown\")\n",
" m = j\n",
" v = 0\n",
" break\n",
" else:\n",
" if j < m-1:\n",
" v = w/H[j+1,j]\n",
" V[:,j+1] = v\n",
"\n",
" return V, H, beta, j "
]
},
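{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged reference sketch, not the paper's code: a textbook modified\n",
"# Gram-Schmidt Arnoldi for comparison. Note that it orthogonalizes w\n",
"# against every stored basis vector V[:,i], while the cell above reuses\n",
"# only the latest vector v. The name arnoldi_reference is ours.\n",
"def arnoldi_reference(A, v0, m):\n",
"    n = A.shape[0]\n",
"    V = np.zeros((n, m+1))\n",
"    H = np.zeros((m+1, m))\n",
"    beta = np.linalg.norm(v0)\n",
"    V[:,0] = np.ravel(v0) / beta\n",
"    for j in range(m):\n",
"        w = A @ V[:,j]\n",
"        for i in range(j+1):\n",
"            H[i,j] = V[:,i] @ w      # coefficient on the i-th basis vector\n",
"            w = w - H[i,j] * V[:,i]  # remove that component from w\n",
"        H[j+1,j] = np.linalg.norm(w)\n",
"        if H[j+1,j] == 0:            # breakdown: invariant subspace found\n",
"            return V[:,:j+1], H[:j+2,:j+1], beta, j\n",
"        V[:,j+1] = w / H[j+1,j]\n",
"    return V, H, beta, m"
]
},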
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Algorithm 4 testing\n",
"\n",
"This algorithm is based on the \"Algorithm 4\" of the paper, the pseudocode provided by the authors is the following \n",
"\n",
"![](https://i.imgur.com/H92fru7.png)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"def compute_gamma(res, a, k): # function to compute gamma\n",
" gamma = np.ones(len(a))\n",
" for i in range(len(a)):\n",
" if i != k:\n",
" gamma[i] = (res[i]*a[k])/(res[k]*a[i])\n",
" else:\n",
" gamma[i] = 0\n",
" return gamma"
]
},
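{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sanity check on the function above (our reading of the paper, not a quote from it): for $i \\neq k$ it computes\n",
"\n",
"$$\\gamma_i = \\frac{res_i \\, \\alpha_k}{res_k \\, \\alpha_i},$$\n",
"\n",
"so $\\gamma_k = 0$ and each other entry is the $i$-th residual norm relative to the dominant one, rescaled by the ratio of damping factors."
]
},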
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Basic test case with random numbers to test the algorithm."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"n = 1000\n",
"m = 1100\n",
"tau = 1e-6\n",
"a = [0.85, 0.88, 0.9, 0.95]\n",
"\n",
"x = sp.sparse.lil_matrix((n,1))\n",
"x[0,0] = 1\n",
"\n",
"# generate a random graph\n",
"G = nx.gnp_random_graph(n, 0.1)\n",
"v = np.repeat(1.0 / 1000, 1000) # p is the personalization vector\n",
"v = v.reshape(v.shape[0],1)\n",
"\n",
"A = nx.to_scipy_sparse_array(G, dtype=float)\n",
"S = A.sum(axis=1) # S[i] is the sum of the weights of edges going out of node i\n",
"S[S != 0] = 1.0 / S[S != 0] # S[i] is now the sum of the weights of edges going into node i\n",
"Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape)) # Q is the matrix of edge weights"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def Algo4(Pt, v, m, a: list, tau, maxit: int, x):\n",
"\n",
" mv, iter = 0, 1 # mv is the number of matrix-vector products, iter is the number of iterations\n",
" \n",
" # initialize x as a sparse matrix of zeros. Each col is the pagerank vector for a different alpha\n",
" x = sp.sparse.lil_matrix((Pt.shape[0], len(a)))\n",
"\n",
"\n",
" # compute the residual vector, it is a matrix of size (n, len(a)). Each col is the residual vector for a different alpha. \n",
" I = sp.sparse.eye(Pt.shape[0], Pt.shape[1], format='lil')\n",
" r = sp.sparse.lil_matrix((Pt.shape[0], len(a)))\n",
" res = np.zeros(len(a))\n",
"\n",
" for i in range(len(a)):\n",
" r[:,[i]] = sp.sparse.linalg.spsolve(I - a[i]*Pt, v)\n",
" col = r[:,[i]].toarray()\n",
" res[i] = np.linalg.norm(col)\n",
"\n",
" for _ in range(maxit):\n",
" # check if we have converged\n",
" err = np.absolute(np.amax(res))\n",
" if err < tau:\n",
" print(\"Computation ended successfully in \", iter, \" iterations and \", mv, \" matrix-vector products.\")\n",
" return x, iter, mv\n",
"\n",
" print(\"\\niter = \", iter)\n",
" print(\"res: \", res)\n",
" print(\"err = \", err)\n",
"\n",
"\n",
" k = int(np.argmax(res))\n",
" print(\"k = \", k)\n",
" gamma = compute_gamma(res, a, k)\n",
" \n",
" # Run Arnoldi\n",
" r_k = r[:,[k]].toarray()\n",
" A_arnoldi = (1/a[k])*I - Pt\n",
" V, H, beta, j = Arnoldi((1/a[k])*I - Pt, r_k, m)\n",
" H = H[:-1,:]\n",
" V = V[:,:-1]\n",
" mv = mv + j\n",
"\n",
" H_e1 = np.zeros(H.shape[0])\n",
" H_e1[0] = 1\n",
"\n",
" # compute y as the minimizer of || beta*e1 - Hy ||_2 using the least squares method\n",
" y = sp.sparse.lil_matrix((H.shape[1],len(a)))\n",
" y[:,[k]] = sp.sparse.linalg.lsqr(H, beta*H_e1)[0]\n",
" y_k = y[:,[k]].toarray()\n",
"\n",
" # # Update x\n",
" x_new = x\n",
" x_new[:,[k]] = x[:,[k]] + V @ y_k\n",
"\n",
" # Update res[k]\n",
" V_e1 = np.zeros(V.shape[0])\n",
" V_e1[0] = 1\n",
"\n",
" norm_k = np.linalg.norm(beta*V_e1 - V @ y_k)\n",
" res[k] = a[k]*norm_k\n",
"\n",
" # multi shift\n",
" for i in range(len(a)):\n",
" if res[i] >= tau:\n",
" # print(\"res[\", i, \"] is larger than tau = \", tau)\n",
"\n",
" # # Compute H as described in the paper\n",
" # H_k = H[:,[k]].toarray()\n",
" # H_i = H_k + ((1-a[i])/a[i] - (1-a[k])/a[k])\n",
" # H[:,[i]] = H_i\n",
" H = H + ((1-a[i])/a[i] - (1-a[k])/a[k])*sp.sparse.eye(H.shape[0], H.shape[1], format='lil')\n",
"\n",
" # Compute z as described in the paper\n",
" z1 = H_e1*beta\n",
" z1 = z1.reshape(z1.shape[0],1)\n",
" z2 = H @ y[:,[k]]\n",
" z2 = z2.reshape(z2.shape[0],1)\n",
" z = z1 - z2\n",
"\n",
" # Solve the linear system \n",
" A = sp.sparse.hstack([H, z])\n",
" b = (beta*H_e1)\n",
" b = b.reshape(b.shape[0],1)\n",
" # use the least squares method to solve the linear system\n",
" to_split = sp.sparse.linalg.lsqr(A, b)[0]\n",
" \n",
" # the last element of to_split is gamma[i]; the other elements form y[:,[i]]\n",
" y[:,[i]] = to_split[:-1]\n",
" gamma[i] = to_split[-1]\n",
"\n",
" # update x\n",
" x_new[:,i] = x[:,i] + V @ y[:,[i]]\n",
"\n",
" # update the residual vector\n",
" res[i] = (a[i]/a[k])*gamma[i]*res[k]\n",
"\n",
" else:\n",
" if res[i] < tau:\n",
" print(\"res[\", i, \"] is smaller than tau = \", tau, \" at iteration \", iter)\n",
"\n",
" iter = iter + 1\n",
" x = x_new\n",
"\n",
" raise Exception('Maximum number of iterations reached')"
]
},
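{
"cell_type": "markdown",
"metadata": {},
"source": [
"The multi-shift update above relies on the shift-invariance of Krylov subspaces (our reading, worth double-checking against the paper): since $\\frac{1}{\\alpha_i}I - \\tilde P = \\left(\\frac{1}{\\alpha_k}I - \\tilde P\\right) + \\left(\\frac{1-\\alpha_i}{\\alpha_i} - \\frac{1-\\alpha_k}{\\alpha_k}\\right)I$, the basis $V$ built for $\\alpha_k$ also works for every $\\alpha_i$, and only the Hessenberg matrix needs to be shifted:\n",
"\n",
"$$\\bar H_m^{(i)} = \\bar H_m^{(k)} + \\left(\\frac{1-\\alpha_i}{\\alpha_i} - \\frac{1-\\alpha_k}{\\alpha_k}\\right) I.$$\n",
"\n",
"This is what the line `H = H + ((1-a[i])/a[i] - (1-a[k])/a[k])*sp.sparse.eye(...)` is aiming at. Note that, as written, the shift is applied to the already-shifted `H`, so the deltas accumulate across the loop over `i`; shifting a saved copy of $\\bar H_m^{(k)}$ each time, as in the commented-out lines, may be what was intended."
]
},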
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"iter = 1\n",
"res: [0.03189738 0.03190716 0.03191369 0.03193001]\n",
"err = 0.031930006625941795\n",
"k = 0\n",
"\n",
"iter = 2\n",
"res: [1.11728737e+00 8.26005227e-04 5.55288870e-10 4.81520495e-13]\n",
"err = 1.1172873666904701\n",
"k = 3\n",
"res[ 2 ] is smaller than tau = 1e-06 at iteration 2\n",
"\n",
"iter = 3\n",
"res: [1.17714008e+00 1.29941354e-03 5.55288870e-10 1.93969263e-18]\n",
"err = 1.1771400826095457\n",
"k = 3\n",
"res[ 2 ] is smaller than tau = 1e-06 at iteration 3\n",
"\n",
"iter = 4\n",
"res: [1.17714008e+00 1.29941354e-03 5.55288870e-10 1.93969263e-18]\n",
"err = 1.1771400826095457\n",
"k = 3\n",
"res[ 2 ] is smaller than tau = 1e-06 at iteration 4\n",
"\n",
"iter = 5\n",
"res: [1.17714008e+00 1.29941354e-03 5.55288870e-10 1.93969263e-18]\n",
"err = 1.1771400826095457\n",
"k = 3\n",
"res[ 2 ] is smaller than tau = 1e-06 at iteration 5\n",
"\n",
"iter = 6\n",
"res: [1.17714008e+00 1.29941354e-03 5.55288870e-10 1.93969263e-18]\n",
"err = 1.1771400826095457\n",
"k = 3\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_13660/3677688099.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAlgo4\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mQ\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtau\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/tmp/ipykernel_13660/2503933778.py\u001b[0m in \u001b[0;36mAlgo4\u001b[0;34m(Pt, v, m, a, tau, maxit, x)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mr_k\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mA_arnoldi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mI\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mPt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0mV\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mH\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbeta\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mArnoldi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mI\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mPt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr_k\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0mH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mH\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0mV\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mV\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/tmp/ipykernel_13660/113321894.py\u001b[0m in \u001b[0;36mArnoldi\u001b[0;34m(A, v0, m)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mA\u001b[0m \u001b[0;34m@\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mH\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m \u001b[0;34m@\u001b[0m \u001b[0mw\u001b[0m \u001b[0;31m# tmp is a 1x1 matrix, so it's O(1) in memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mw\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mH\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/scipy/sparse/_lil.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, x)\u001b[0m\n\u001b[1;32m 326\u001b[0m isinstance(key[1], INT_TYPES)):\n\u001b[1;32m 327\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Trying to assign a sequence to an item\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_intXint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"x, iter, mv = Algo4(Q, v, m, a, tau, 100, x)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -1,58 +0,0 @@
from algo import *

def choice(dataset_number):
    if dataset_number == 1:
        # run the algorithm on Web-Stanford dataset
        G = load_data("Stanford")
        return G
    elif dataset_number == 2:
        # run the algorithm on Web-BerkStan dataset
        G = load_data("BerkStan")
        return G
    else:
        raise ValueError("Invalid choice. Please choose a valid option.")

# main function
if __name__ == "__main__":
    dataset_number = int(input("Choose the dataset to work with. The options are:\n\t [1] Web-Stanford\n\t [2] Web-BerkStan\nType your number of choice: "))
    G = choice(dataset_number)
    alphas = [0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]

    ### STANDARD PAGERANK ALGORITHM ###
    iter_dict = dict.fromkeys(alphas, 0)
    list_of_pageranks = []  # list of pagerank dicts, one per alpha
    start1 = time.time()
    for alpha in alphas:
        x, iter, tol = pagerank(G, alpha, tol=1e-9)
        iter_dict[alpha] = iter
        list_of_pageranks.append(x)
    end1 = time.time()
    total_iter = sum(iter_dict.values())
    print("\nSTANDARD PAGERANK ALGORITHM\n")
    print("\tCPU time (s):", round(end1 - start1, 1))
    print("\tMatrix-vector multiplications:", total_iter)
    print("\tAlpha:", alphas)
    print("\tTolerance:", tol)
    print()
    # if any entry of list_of_pageranks is an empty dict, the algorithm did not converge for that alpha
    for i in range(len(list_of_pageranks)):
        if not list_of_pageranks[i]:
            print("The algorithm did not converge for alpha =", alphas[i])

    ### SHIFTED PAGERANK ALGORITHM ###
    start2 = time.time()
    x, mv, alphas, tol = shifted_pow_pagerank(G, alphas, tol=1e-9)
    end2 = time.time()
    print("\nSHIFTED PAGERANK ALGORITHM\n")
    print("\tCPU time (s):", round(end2 - start2, 1))
    print("\tMatrix-vector multiplications:", mv)
    print("\tAlphas:", alphas)
    print("\tTolerance:", tol)

@ -14,8 +14,7 @@ from scipy.sparse import *
 import plotly.graph_objs as go
 from typing import Literal
-def load_data(dataset: Literal["Stanford", "BerkStan"]) -> nx.Graph:
+def load_data(dataset: Literal["Stanford", "NotreDame", "BerkStan"]) -> nx.Graph:
     """Load the dataset and return a graph.
     Parameters
@ -35,23 +34,26 @@ def load_data(dataset: Literal["Stanford", "BerkStan"]) -> nx.Graph:
     if not exists(os.path.join(os.getcwd(), "data")):
         os.mkdir(os.path.join(os.getcwd(), "data"))
+    if dataset not in ["Stanford", "NotreDame", "BerkStan"]:
+        raise ValueError("Invalid dataset. Please choose a valid dataset.")
     # Download the dataset
-    if not exists(f"data/Web-{dataset}.txt.gz"):
+    if not exists(f"../data/Web-{dataset}.txt.gz"):
         print(f"\nDownloading the dataset {dataset}...")
         wget.download(f"http://snap.stanford.edu/data/web-{dataset}.txt.gz", out=f"data/Web-{dataset}.txt.gz")
     else:
         print(f"\nThe dataset {dataset} is already downloaded.")
     # unzip the dataset
-    if not exists(f"data/Web-{dataset}.txt"):
+    if not exists(f"../data/Web-{dataset}.txt"):
         print(f"\nUnzipping the dataset {dataset}...")
-        with gzip.open(f"data/Web-{dataset}.txt.gz", "rb") as f_in:
-            with open(f"data/Web-{dataset}.txt", "wb") as f_out:
+        with gzip.open(f"../data/Web-{dataset}.txt.gz", "rb") as f_in:
+            with open(f"../data/Web-{dataset}.txt", "wb") as f_out:
                 f_out.write(f_in.read())
     # create the graph
     print(f"\nCreating the graph of the dataset {dataset}...\n")
-    G_dataset = nx.read_edgelist(f"data/Web-{dataset}.txt", create_using=nx.DiGraph(), nodetype=int)
+    G_dataset = nx.read_edgelist(f"../data/Web-{dataset}.txt", create_using=nx.DiGraph(), nodetype=int)
     print(f"\tNumber of nodes: {G_dataset.number_of_nodes()}")
     print(f"\tNumber of edges: {G_dataset.number_of_edges()}")
@ -143,6 +145,103 @@ def google_matrix(G, alpha=0.85, personalization=None, nodelist=None, weight="we
     return np.asmatrix(alpha * A + (1 - alpha) * p)

+def google_matrix_sparse(G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None) -> np.matrix:
+    """Returns the Google matrix of the graph.
+
+    Parameters
+    ----------
+    G : graph
+        A NetworkX graph. Undirected graphs will be converted to a directed
+        graph with two directed edges for each undirected edge.
+    alpha : float
+        The damping factor.
+    personalization: dict, optional
+        The "personalization vector" consisting of a dictionary with a
+        key some subset of graph nodes and personalization value each of those.
+        At least one personalization value must be non-zero.
+        If not specified, a node's personalization value will be zero.
+        By default, a uniform distribution is used.
+    nodelist : list, optional
+        The rows and columns are ordered according to the nodes in nodelist.
+        If nodelist is None, then the ordering is produced by G.nodes().
+    weight : key, optional
+        Edge data key to use as weight. If None weights are set to 1.
+    dangling: dict, optional
+        The outedges to be assigned to any "dangling" nodes, i.e., nodes without
+        any outedges. The dict key is the node the outedge points to and the dict
+        value is the weight of that outedge. By default, dangling nodes are given
+        outedges according to the personalization vector (uniform if not
+        specified). This must be selected to result in an irreducible transition
+        matrix (see notes below). It may be common to have the dangling dict
+        be the same as the personalization dict.
+
+    Returns
+    -------
+    A : NumPy matrix
+        Google matrix of the graph
+
+    Notes
+    -----
+    The matrix returned represents the transition matrix that describes the
+    Markov chain used in PageRank. For PageRank to converge to a unique
+    solution (i.e., a unique stationary distribution in a Markov chain), the
+    transition matrix must be irreducible. In other words, it must be that
+    there exists a path between every pair of nodes in the graph, or else there
+    is the potential of "rank sinks."
+    """
+    if nodelist is None:
+        nodelist = list(G)
+
+    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, weight=weight, format="lil", dtype=int)
+    N = len(G)
+    if N == 0:
+        return np.asmatrix(A)
+
+    # Personalization vector
+    if personalization is None:
+        p = np.repeat(1.0 / N, N)
+        p = sp.sparse.lil_array(p)
+    else:
+        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
+        if p.sum() == 0:
+            raise ZeroDivisionError
+        p /= p.sum()
+
+    # Dangling nodes
+    if dangling is None:
+        dangling_weights = np.ones(N, dtype=int)
+        dangling_weights = sp.sparse.lil_array(dangling_weights, dtype=int)
+    else:
+        # Convert the dangling dictionary into an array in nodelist order
+        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
+        dangling_weights /= dangling_weights.sum()
+
+    # # Assign dangling_weights to any dangling nodes (nodes with no out links).
+    # for i in range(N):
+    #     if A[[i],:].sum(axis=1) == 0:
+    #         A[[i],:] = dangling_weights
+
+    # replace rows with all zeros with dangling_weights
+    A[[A.sum(axis=1)==0],:] = dangling_weights
+
+    # Normalize rows
+    row_sums = A.sum(axis=1)                                # row sums
+    r_inv = np.power(row_sums.astype(float), -1).flatten()  # inverse of row sums
+    r_inv[np.isinf(r_inv)] = 0.0                            # replace inf with 0
+    R = sp.sparse.diags(r_inv)                              # create diagonal matrix
+    A = R.dot(A)                                            # normalize rows
+
+    return A, p
+
 def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", dangling=None):
     """Returns the PageRank of the nodes in the graph.
@ -204,7 +303,7 @@ def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", danglin
     norm = largest.sum()
     return dict(zip(G, map(float, largest / norm)))
-def pagerank(G, alpha=0.85, personalization=None, max_iter=200, tol=1.0e-9, nstart=None, weight="weight", dangling=None,):
+def pagerank(G, alpha=0.85, personalization=None, max_iter=10000, tol=1.0e-9, nstart=None, weight="weight", dangling=None,):
     """
     Returns the PageRank of the nodes in the graph.
@ -226,8 +325,8 @@ def pagerank(G, alpha=0.85, personalization=None, max_iter=200, tol=1.0e-9, nsta
     The "personalization vector" consisting of a dictionary with a
     key some subset of graph nodes and personalization value each of those.
     At least one personalization value must be non-zero.
-    If not specified, a node's personalization value will be zero.
-    By default, a uniform distribution is used.
+    If not specified, a node's personalization value will be 1/N where N is the
+    number of nodes in G.
     max_iter : integer, optional
         Maximum number of iterations in power method eigenvalue solver.
@ -311,6 +410,8 @@ def pagerank(G, alpha=0.85, personalization=None, max_iter=200, tol=1.0e-9, nsta
     x = alpha * (x @ A + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p # x is the current vector of PageRank values
     # check convergence, l1 norm
     err = np.absolute(x - xlast).sum() # err is the error between the current and previous vectors of PageRank values
+    print("Iteration: ", _, "\r", end="")
     if err < N * tol: # if the error is small enough, stop iterating
         return dict(zip(nodelist, map(float, x))), iter, tol # return the current vector of PageRank values
@ -319,7 +420,7 @@ def pagerank(G, alpha=0.85, personalization=None, max_iter=200, tol=1.0e-9, nsta
     return {}, iter, tol
-def shifted_pow_pagerank(G, alphas=[0.85, 0.9, 0.95, 0.99], max_iter=200, tol=1.0e-9):
+def shifted_pow_pagerank(G, alphas=[0.85, 0.9, 0.95, 0.99], max_iter=10000, tol=1.0e-9):
     """
     Compute the PageRank of each node in the graph G.
@ -354,42 +455,53 @@ def shifted_pow_pagerank(G, alphas=[0.85, 0.9, 0.95, 0.99], max_iter=200, tol=1.
     N = len(G)
     if N == 0:
-        return {}
+        raise nx.NetworkXException("Empty graph.")
-    # initialize a random sparse matrix of dimension N x len(alphas). The cols of this matrix are the page rank vectors for each alpha.
-    x = sp.sparse.random(N, len(alphas), density=0.01, format="lil", dtype=float)
-    nodelist = list(G)
-    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, dtype=float)
-    S = A.sum(axis=1) # S[i] is the sum of the weights of edges going out of node i
-    S[S != 0] = 1.0 / S[S != 0] # S[i] is now the reciprocal of the out-degree of node i
-    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape)) # Q is the diagonal matrix that row-normalizes A
-    A = Q
-    v = np.repeat(1.0 / N, N) # v is the personalization vector
-    mu = A @ v - v
+    A, v = google_matrix_sparse(G)
+    # initialize a sparse matrix of dimension N x len(alphas). The cols of this matrix are the page rank vectors for each alpha. Each col is v
+    x = sp.sparse.lil_matrix((N, len(alphas)))
+    for i in range(len(alphas)):
+        # reshape v to a column vector
+        v = v.reshape((N,1))
+        x[:, [i]] = v
+    # make v sparse
+    v = sp.sparse.lil_array(v)
+    mu = A @ v - v # column vector
+    r = sp.sparse.lil_matrix((N, len(alphas))) # residual matrix
+    res = np.ones(len(alphas)) # residual norm vector
     for i in range(len(alphas)):
-        r = alphas[i] * mu # residual vector
-        Res = np.linalg.norm(r, 2) # residual norm
-        if Res >= tol:
-            x[:, [i]] = r + v # update the i-th column of x
-    mv = 0 # number of matrix-vector multiplications
-    for _ in range(max_iter):
-        mv += 1
+        r[:,[i]] = alphas[i] * mu # residual vector for the i-th alpha
+        # r_i = r[:,[i]].toarray()
+        res[i] = sp.sparse.linalg.norm(r[:,[i]]) # residual norm for the i-th alpha
+        if res[i] >= tol:
+            x[:, [i]] = r[:,[i]] + v # update the i-th column of x
+    mv = 1 # number of matrix-vector multiplications
+    for _ in range(max_iter): # starting of the while loop in the paper
+        # print("Iteration: ", _)
+        err = np.max(res)
+        # print("Error: ", err)
+        if err < tol:
+            print("Convergence reached with ", mv, " matrix-vector multiplications")
+            return x, mv, alphas, tol
+        print("Iteration: ", _, "\r", end="")
         mu = A @ mu
+        mv += 1
         for i in range(len(alphas)):
-            if Res >= tol:
-                r = pow(alphas[i], mv+1) * mu
-                Res = np.linalg.norm(r,2)
-                if Res >= tol:
-                    x[:, [i]] = r + v
-        err = np.absolute(r).max()
-        if err < tol:
-            return x, mv, alphas, tol
+            if res[i] >= tol:
+                r[:,[i]] = np.power(alphas[i], mv+1) * mu
+                # r_i = r[:,[i]].toarray()
+                res[i] = sp.sparse.linalg.norm(r[:,[i]])
+                if res[i] >= tol:
+                    x[:, [i]] = r[:,[i]] + v
     raise nx.PowerIterationFailedConvergence(max_iter) # if the error is not small enough, raise an error
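The rewrite above leans on one identity worth spelling out: starting from x_0 = v, the damped power iterate for factor alpha is x_K = v + sum_{k=1..K} alpha^k A^(k-1) mu with mu = A v - v, so a single sequence of products A @ mu serves every alpha at once. A small self-contained check of that identity (toy random matrix and names of our choosing, not the repo's code):

import numpy as np

n = 4
rng = np.random.default_rng(0)
A = rng.random((n, n))
A /= A.sum(axis=0)            # column-stochastic toy matrix
v = np.full(n, 1.0 / n)
alphas = [0.85, 0.95]

def plain_power(alpha, iters=50):
    # reference: run the damped power iteration separately for this alpha
    x = v.copy()
    for _ in range(iters):
        x = alpha * (A @ x) + (1 - alpha) * v
    return x

# shifted: one shared sequence mu; a single A @ mu per step covers all alphas
mu = A @ v - v
xs = {alpha: v + alpha * mu for alpha in alphas}
for k in range(1, 50):
    mu = A @ mu                             # the only mat-vec this step
    for alpha in alphas:
        xs[alpha] += alpha ** (k + 1) * mu  # the residual direction for this alpha

for alpha in alphas:
    assert np.allclose(plain_power(alpha), xs[alpha])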

@ -1,257 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import networkx as nx\n",
"import time\n",
"import math\n",
"import pandas as pd\n",
"import scipy as sp\n",
"import plotly.express as px\n",
"import plotly.graph_objs as go\n",
"from scipy.sparse import *\n",
"from scipy import linalg\n",
"from scipy.sparse.linalg import norm\n",
"from scipy.optimize import least_squares"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Arnoldi \n",
"\n",
"This is a copy of the algorithm defined and tested in the notebook `algo2_testing`. It's an implementation of the Algorithm 2 from the paper. It's needed in this notebook since this function is called by the `algo4` function. It's implemented to return exactly what's needed in the `algo4` function.\n",
"\n",
"Everything will be reorganized in the main.py file once everything is working."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def Arnoldi(A,v0,m):\n",
" v = v0\n",
" beta = norm(v)\n",
" v = v/beta\n",
" H = sp.sparse.lil_matrix((m+1,m)) \n",
" V = sp.sparse.lil_matrix((A.shape[0],m+1))\n",
" V[:,0] = v # each column of V is a vector v\n",
"\n",
" for j in range(m):\n",
" w = A @ v \n",
" for i in range(j):\n",
" tmp = v.T @ w # tmp is a 1x1 matrix, so it's O(1) in memory\n",
" H[i,j] = tmp[0,0] \n",
" w = w - H[i,j]*v \n",
" \n",
" H[j+1,j] = norm(w)\n",
"\n",
" if H[j+1,j] == 0:\n",
" # print(\"Arnoldi breakdown\")\n",
" m = j\n",
" v = 0\n",
" break\n",
" else:\n",
" if j < m-1:\n",
" v = w/H[j+1,j]\n",
" V[:,j+1] = v\n",
"\n",
" return V, H, v, beta, j "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Algorithm 4 testing\n",
"\n",
"This algorithm is based on the \"Algorithm 4\" of the paper, the pseudocode provided by the authors is the following \n",
"\n",
"![](https://i.imgur.com/H92fru7.png)\n",
"\n",
"Line 14 is particularly tricky to understand, not working for now. Need to figure out how to solve that linear system. My idea was to do something like that\n",
"\n",
"![](https://i.imgur.com/uBCDYUa.jpeg)\n",
"\n",
"And use the `sp.sparse.linalg.spsolve` function to solve the linear system as $Ax=0$ where $A$ is $[\\bar H_m^i ~ | ~ z]$ but it returns an array of zeros. So the idea it's wrong"
]
},
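{
"cell_type": "markdown",
"metadata": {},
"source": [
"One possible reading of line 14 (a guess to be verified, not the paper's words): treat it as the least-squares problem\n",
"\n",
"$$\\min_{y,\\gamma} \\left\\| \\beta e_1 - [\\bar H_m^i ~ | ~ z] \\begin{bmatrix} y \\\\ \\gamma \\end{bmatrix} \\right\\|_2,$$\n",
"\n",
"so instead of `spsolve` with a zero right-hand side (which the zero vector trivially satisfies), one would call `sp.sparse.linalg.lsqr` on the augmented matrix with right-hand side $\\beta e_1$ and split the solution into $y$ and $\\gamma$. This is the route tried in `algo4_testing.ipynb`."
]
},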
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def Algo4(Pt, v, m, a: list, tau, maxit: int, x):\n",
" \n",
" # I'm using an undeclared variable n here; it's defined in the next cell, where this function is called. This will be fixed later in main.py\n",
"\n",
" iter = 1\n",
" mv = 0\n",
" I = sp.sparse.eye(n, n, format='lil')\n",
" r = sp.sparse.lil_matrix((n,1))\n",
" res = np.zeros(len(a)) \n",
"\n",
" # I'm defining 3 canonical vectors of different sizes. It's probably stupid, will be fixed once the algorithm actually works\n",
"\n",
" H_e1 = np.zeros((m+1,1)) # canonical basis vector of size H.shape[0]\n",
" H_e1[0] = 1\n",
"\n",
" V_e1 = np.zeros((n,1)) # canonical basis vector of size V.shape[0]\n",
" V_e1[0] = 1\n",
"\n",
" s_e1 = np.zeros((len(a),1)) # canonical basis vector of size s.shape[0]\n",
" s_e1[0] = 1\n",
"\n",
" def find_k(res): # function to find the index of the largest element in res\n",
" k = 0\n",
" for i in range(len(a)):\n",
" if res[i] == max(res):\n",
" k = i\n",
" break\n",
" return k\n",
"\n",
" def compute_gamma(res, a, k): # function to compute gamma\n",
" gamma = np.zeros(len(a))\n",
" for i in range(len(a)):\n",
" if i != k:\n",
" gamma[i] = (res[i]*a[k])/(res[k]*a[i])\n",
" else:\n",
" gamma[i] = 0\n",
" return gamma\n",
"\n",
" # compute the residual vector\n",
" for i in range(len(a)):\n",
" r = ((1-a[i])/a[i])*v - ((1/a[i])*I - Pt) @ x\n",
" res[i] = a[i]*norm(r)\n",
"\n",
" while max(res) >= tau and iter <= maxit:\n",
" k = find_k(res)\n",
" gamma = compute_gamma(res, a, k)\n",
" V, H, v, beta, j = Arnoldi((1/a[k])*I - Pt, r, m)\n",
"\n",
" mv = mv + j\n",
"\n",
" # compute y as the minimizer of || beta*e1 - Hy ||_2 using the least squares method\n",
" y = sp.sparse.linalg.lsqr(H, beta*H_e1)[0]\n",
"\n",
" # reshape y to be a column vector\n",
" y = y.reshape(y.shape[0],1)\n",
"\n",
" # update x \n",
" x += V[:,0:y.shape[0]] @ y\n",
"\n",
" # compute the residual vector\n",
" res[k] = a[k]*np.linalg.norm(beta*V_e1 - V[:,0:y.shape[0]] @ y)\n",
" \n",
" # for i in range(len(a)) but not k\n",
" for i in range(len(a)):\n",
" if i != k and res[i] >= tau:\n",
" # Compute H as described in the paper\n",
" H = H + ((1-a[i])/a[i] - (1-a[k])/a[k])*sp.sparse.eye(H.shape[0], H.shape[1], format='lil') \n",
"\n",
" z = beta*H_e1 - H @ y # define z as in the paper (page 9)\n",
" A_tmp = sp.sparse.hstack([H, z]) # stack H and z, as in the paper, to solve the linear system (?)\n",
" A_tmp = A_tmp.tocsc() # Convert A to CSC format for sparse solver\n",
"\n",
" # What should I put here? What does line 14 of the pseudocode in the paper mean?\n",
" result = sp.sparse.linalg.spsolve(A_tmp, np.zeros(A_tmp.shape[0])) # if I solve this, I get a vector of zeros.\n",
" print(result)\n",
" \n",
" # I don't know if the code below is correct since I don't get how to solve the linear system above, so I'm unsure about what y and gamma should be. For now it's commented out.\n",
"\n",
" # # update x\n",
" # x += V[:,0:y.shape[0]] @ y\n",
" # # update the residual vector\n",
" # res[i] = (a[i]/a[k])*gamma[k]*res[k] \n",
"\n",
" iter = iter + 1\n",
"\n",
" return x, iter, mv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Basic test case with random numbers to test the algorithm."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n = 100\n",
"m = 110\n",
"maxit = 100\n",
"tau = 1e-6\n",
"a = [0.85, 0.9, 0.95, 0.99]\n",
"\n",
"x = sp.sparse.lil_matrix((n,1))\n",
"x[0,0] = 1\n",
"\n",
"# generate a random graph\n",
"G = nx.gnp_random_graph(n, 0.1, seed=1, directed=True)\n",
"\n",
"P = sp.sparse.lil_matrix((n,n))\n",
"for i in G.nodes():\n",
" for j in G[i]: # G[i] is the list of nodes connected to i, i.e. its neighbors\n",
" P[i,j] = 1/len(G[i])\n",
"\n",
"# generate a probability vector, with all the entries as 1/n\n",
"v = sp.sparse.lil_matrix((n,1))\n",
"for i in range(n):\n",
" v[i] = 1/n\n",
"\n",
"# dangling nodes vector\n",
"d = sp.sparse.lil_matrix((n,1))\n",
"for i in range(n):\n",
" if P[i].sum() == 0:\n",
" d[i] = 1\n",
"\n",
"# compute the transition matrix\n",
"Pt = P + v @ (d.T)\n",
"\n",
"# run the algorithm\n",
"Algo4(Pt, v, m, a, tau, maxit, x)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,82 @@
#! /usr/bin/python

from algo import *
import warnings
import argparse
warnings.filterwarnings("ignore")

# df = pd.DataFrame(columns=["method", "alpha", "cpu_time", "mv", "tol"])

def run_standard_pagerank(G, alphas):
    print("\nStarting the standard pagerank algorithm...\n")
    iter_dict = dict.fromkeys(alphas, 0)
    list_of_pageranks = []
    start1 = time.time()
    for alpha in alphas:
        x, iter, tol = pagerank(G, alpha, tol=1e-6)
        iter_dict[alpha] = iter
        list_of_pageranks.append(x)
    end1 = time.time()
    total_iter = sum(iter_dict.values())
    cpu_time = round(end1 - start1, 1)
    mv = total_iter
    print("\nSTANDARD PAGERANK ALGORITHM\n")
    print("\tCPU time (s):", cpu_time)
    print("\tMatrix-vector multiplications:", mv)
    print("\tAlpha:", alphas)
    print("\tTolerance:", tol)
    for i in range(len(list_of_pageranks)):
        if not list_of_pageranks[i]:
            print("The algorithm did not converge for alpha =", alphas[i])
    # df.loc[len(df)] = ["Power Method", alphas, cpu_time, mv, tol]
    # df.to_csv(args.dataset + "_results.tsv", sep="\t", index=False)
    # print("\nThe results are saved in the file:", args.dataset + "_results.tsv")

def run_shifted_power(G, alphas):
    print("\nStarting the shifted pagerank algorithm...\n")
    start2 = time.time()
    x, mv, alphas, tol = shifted_pow_pagerank(G, alphas, tol=1e-6)
    end2 = time.time()
    cpu_time = round(end2 - start2, 1)
    print("\nSHIFTED PAGERANK ALGORITHM\n")
    print("\tCPU time (s):", cpu_time)
    print("\tMatrix-vector multiplications:", mv)
    print("\tAlphas:", alphas)
    print("\tTolerance:", tol)
    # df.loc[len(df)] = ["Shifted Power Method", alphas, cpu_time, mv, tol]
    # df.to_csv(args.dataset + "_results.tsv", sep="\t", index=False)
    # print("\nThe results are saved in the file:", args.dataset + "_results.tsv")

# main function
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, default="Stanford", help="Choose the dataset to work with. The options are: Stanford, NotreDame, BerkStan")
    parser.add_argument("--algo", type=str, default="both", help="Choose the algorithm to use. The options are: pagerank, shifted_pagerank, both")
    args = parser.parse_args()

    G = load_data(args.dataset)
    alphas = [0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
    if args.algo == "pagerank":
        run_standard_pagerank(G, alphas)
    elif args.algo == "shifted_pagerank":
        run_shifted_power(G, alphas)
    elif args.algo == "both":
        run_standard_pagerank(G, alphas)
        run_shifted_power(G, alphas)
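For reference, the new entry point can be invoked like this (the dataset and algorithm names are the ones declared in the argparse options above):

python main.py --dataset Stanford --algo both
python main.py --dataset BerkStan --algo shifted_pagerank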

@ -86,21 +86,19 @@
 "source": [
 "def Arnoldi(A,v0,m):\n",
 "    v = v0\n",
-"    beta = norm(v)\n",
+"    beta = np.linalg.norm(v)\n",
 "    v = v/beta\n",
 "    H = sp.sparse.lil_matrix((m+1,m)) \n",
 "    V = sp.sparse.lil_matrix((A.shape[0],m+1))\n",
 "    V[:,0] = v # each column of V is a vector v\n",
 "\n",
 "    for j in range(m):\n",
-"        # print(\"j = \", j)\n",
 "        w = A @ v \n",
 "        for i in range(j):\n",
-"            tmp = v.T @ w # tmp is a 1x1 matrix, so it's O(1) in memory\n",
-"            H[i,j] = tmp[0,0] \n",
+"            H[i,j] = v.T @ w # a 1x1 matrix, so it's O(1) in memory\n",
 "            w = w - H[i,j]*v \n",
 "        \n",
-"        H[j+1,j] = norm(w)\n",
+"        H[j+1,j] = np.linalg.norm(w)\n",
 "\n",
 "        if H[j+1,j] == 0:\n",
 "            print(\"Arnoldi breakdown\")\n",
@ -118,8 +116,7 @@
 "    print(\"v = \", v.shape)\n",
 "    print(\"beta = \", beta)\n",
 "\n",
-"    return V, H, v, beta, j \n",
-"    "
+"    return V, H, beta, j "
 ]
 },
 {

@ -0,0 +1,868 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import networkx as nx\n",
"import time\n",
"import math\n",
"import pandas as pd\n",
"import scipy as sp\n",
"import plotly.express as px\n",
"import plotly.graph_objs as go\n",
"from scipy.sparse import *\n",
"from scipy import linalg\n",
"from scipy.sparse.linalg import norm\n",
"from scipy.optimize import least_squares"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Arnoldi \n",
"\n",
"This is a copy of the algorithm defined and tested in the notebook `algo2_testing`. It's an implementation of the Algorithm 2 from the paper. It's needed in this notebook since this function is called by the `algo4` function. It's implemented to return exactly what's needed in the `algo4` function.\n",
"\n",
"Everything will be reorganized in the main.py file once everything is working."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def Arnoldi(A,v0,m):\n",
" v = v0\n",
" beta = np.linalg.norm(v)\n",
" v = v/beta\n",
" H = sp.sparse.lil_matrix((m+1,m)) \n",
" V = sp.sparse.lil_matrix((A.shape[0],m+1))\n",
" V[:,0] = v # each column of V is a vector v\n",
"\n",
" for j in range(m):\n",
" w = A @ v \n",
" for i in range(j):\n",
" H[i,j] = v.T @ w # v.T @ w is a 1x1 matrix, so it's O(1) in memory\n",
" w = w - H[i,j]*v \n",
" \n",
" H[j+1,j] = np.linalg.norm(w)\n",
"\n",
" if H[j+1,j] == 0:\n",
" # print(\"Arnoldi breakdown\")\n",
" m = j\n",
" v = 0\n",
" break\n",
" else:\n",
" if j < m-1:\n",
" v = w/H[j+1,j]\n",
" V[:,j+1] = v\n",
"\n",
" return V, H, beta, j "
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Algorithm 4 testing\n",
"\n",
"This algorithm is based on the \"Algorithm 4\" of the paper, the pseudocode provided by the authors is the following \n",
"\n",
"![](https://i.imgur.com/H92fru7.png)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def compute_gamma(res, a, k): # function to compute gamma\n",
" gamma = np.ones(len(a))\n",
" for i in range(len(a)):\n",
" if i != k:\n",
" gamma[i] = (res[i]*a[k])/(res[k]*a[i])\n",
" else:\n",
" gamma[i] = 0\n",
" return gamma"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Basic test case with random numbers to test the algorithm."
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"def compute_ptilde(G: nx.Graph):\n",
" \"\"\"\n",
" Compute the ptilde matrix and the probability vector v\n",
" :param G: the graph\n",
" :return: the ptilde matrix and the probability vector v\n",
"\n",
" \"\"\"\n",
"\n",
" # given the graph G, return its sparse matrix representation\n",
" A = nx.to_scipy_sparse_array(G, format='lil')\n",
"\n",
" # create a sparse vector d, where d[i] = 1 if the i-th row of A is null (i.e. node i is dangling), else 0\n",
" d = sp.sparse.lil_matrix((1, A.shape[0]))\n",
" for i in range(A.shape[0]):\n",
" # s is the sum of the i-th row of A\n",
" s = sp.sparse.lil_matrix.sum(A[[i],:])\n",
" if s == 0:\n",
" d[0,[i]] = 1\n",
"\n",
" # probability vector v = 1/n\n",
" v = np.repeat(1/A.shape[0], A.shape[0])\n",
"\n",
" # initialize the ptilde matrix\n",
" P = sp.sparse.lil_matrix((A.shape[0], A.shape[1]))\n",
"\n",
" # P(i,j) = 1/(number of non null entries in column j) if A(i,j) != 0, else 0\n",
" for j in range(A.shape[1]):\n",
" for i in range(A.shape[0]):\n",
" if A[i,j] != 0:\n",
" P[i,j] = 1/sp.sparse.lil_matrix.sum(A[:,[j]] != 0)\n",
"\n",
" Pt = P + v @ d.T\n",
"\n",
" return Pt, v "
]
},
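{
"cell_type": "markdown",
"metadata": {},
"source": [
"What `compute_ptilde` builds, in the usual PageRank notation (our summary, worth checking against the paper): a stochastic matrix patched with a rank-one dangling-node correction,\n",
"\n",
"$$\\tilde P = P + v \\, d^T,$$\n",
"\n",
"where $d_i = 1$ exactly when node $i$ has no out-links, so the dangling entries are filled with the uniform vector $v$ and no probability mass is lost at dangling nodes."
]
},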
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"def Algo4(Pt, v, m, a: list, tau, maxit: int, x):\n",
"\n",
" mv, iter = 0, 1 # mv is the number of matrix-vector products, iter is the number of iterations\n",
" \n",
" # initialize x as a sparse matrix of zeros. Each col is the pagerank vector for a different alpha\n",
" x = sp.sparse.lil_matrix((Pt.shape[0], len(a)))\n",
"\n",
" # initialize the identity matrix of size Pt.shape\n",
" I = sp.sparse.eye(Pt.shape[0], Pt.shape[1], format='lil')\n",
"\n",
" # compute the residual vector, it is a matrix of size (n, len(a)). Each col is the residual vector for a different alpha. \n",
" r = sp.sparse.lil_matrix((Pt.shape[0], len(a)))\n",
" res = np.zeros(len(a))\n",
"\n",
" # compute the residual vector and the norm of each col in the vector res\n",
" for i in range(len(a)):\n",
" r[:,[i]] = sp.sparse.linalg.spsolve(I - a[i]*Pt, v)\n",
" col = r[:,[i]].toarray()\n",
" res[i] = np.linalg.norm(col)\n",
"\n",
" # this is a while loop in the paper\n",
" for _ in range(maxit):\n",
" # check if we have converged\n",
" err = np.absolute(np.max(res))\n",
" if err < tau:\n",
" print(\"Computation ended successfully in \", iter, \" iterations and \", mv, \" matrix-vector products.\")\n",
" return x, iter, mv\n",
"\n",
" print(\"\\niter = \", iter)\n",
" print(\"res: \", res)\n",
" print(\"err = \", err)\n",
"\n",
" # find k as the index of the largest residual\n",
" k = int(np.argmax(res))\n",
" print(\"k = \", k)\n",
"\n",
" # compute gamma as defined in the paper\n",
" gamma = compute_gamma(res, a, k)\n",
" \n",
" # Run Arnoldi\n",
" r_k = r[:,[k]].toarray() # r_k is the residual vector for alpha_k\n",
" A_arnoldi = (1/a[k])*I - Pt # A_arnoldi is the matrix used in Arnoldi\n",
" V, H, beta, j = Arnoldi((1/a[k])*I - Pt, r_k, m) # V is the matrix of vectors v, H is the Hessenberg matrix, beta is the norm of the last vector v, j is the number of iterations of Arnoldi\n",
" H = H[:-1,:] # remove the last row of H\n",
" V = V[:,:-1] # remove the last col of V\n",
" mv = mv + j # update the number of matrix-vector products\n",
"\n",
" H_e1 = np.zeros(H.shape[0]) \n",
" H_e1[0] = 1 # canonical vector e1 of size H.shape[0]\n",
"\n",
" # compute y as the minimizer of || beta*e1 - Hy ||_2 using the least squares method\n",
" y = sp.sparse.lil_matrix((H.shape[1],len(a))) # y is the matrix of vectors y, each col is the vector y for a different alpha\n",
"\n",
" # we only need the k-th col of y in this iteration\n",
" y[:,[k]] = sp.sparse.linalg.lsqr(H, beta*H_e1)[0]\n",
" y_k = y[:,[k]].toarray()\n",
"\n",
" # # Update x\n",
" x_new = x\n",
" x_new[:,[k]] = x[:,[k]] + V @ y_k\n",
"\n",
" V_e1 = np.zeros(V.shape[0])\n",
" V_e1[0] = 1 # canonical vector e1 of size V.shape[0]\n",
"\n",
" # Update res[k]\n",
" norm_k = np.linalg.norm(beta*V_e1 - V @ y_k) # this returns a scalar\n",
" res[k] = a[k]*norm_k\n",
"\n",
" # multi shift\n",
" for i in range(len(a)):\n",
" if i != k and res[i] >= tau:\n",
" if res[i] >= tau:\n",
" \n",
" H = H + ((1-a[i])/a[i] - (1-a[k])/a[k])*sp.sparse.eye(H.shape[0], H.shape[1], format='lil')\n",
"\n",
" # Compute z as described in the paper\n",
" z1 = H_e1*beta\n",
" z1 = z1.reshape(z1.shape[0],1)\n",
" z2 = H @ y[:,[k]]\n",
" z2 = z2.reshape(z2.shape[0],1)\n",
" z = z1 - z2\n",
"\n",
" # Solve the linear system for A and b\n",
" A = sp.sparse.hstack([H, z])\n",
" b = (beta*H_e1)\n",
"\n",
" # use the least squares method to solve the linear system\n",
" to_split = sp.sparse.linalg.lsqr(A, b.reshape(b.shape[0],1))[0]\n",
" \n",
" # the last element of to_split is gamma[i]; the other elements form y[:,[i]]\n",
" y[:,[i]] = to_split[:-1]\n",
" gamma[i] = to_split[-1]\n",
"\n",
" # update x\n",
" x_new[:,i] = x[:,i] + V @ y[:,[i]]\n",
"\n",
" # update the residual vector\n",
" # print(\"\\tupdating res[\", i, \"]\")\n",
" # print(\"\\tgamma[\", i, \"] = \", gamma[i])\n",
" # print(\"\\tres[\", k, \"] = \", res[k])\n",
" # print(\"\\ta[\", i, \"] = \", a[i])\n",
" # print(\"\\ta[\", k, \"] = \", a[k])\n",
" res[i] = (a[i]/a[k])*gamma[i]*res[k]\n",
" # print(\"\\tupdated res[\", i, \"] = \", res[i])\n",
" # print()\n",
"\n",
" else:\n",
" if res[i] < tau:\n",
" print(\"res[\", i, \"] is smaller than tau = \", tau, \" at iteration \", iter)\n",
"\n",
" iter = iter + 1\n",
" x = x_new\n",
"\n",
" raise Exception('Maximum number of iterations reached')"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"G = nx.watts_strogatz_graph(1000, 4, 0.1)\n",
"Pt, v = compute_ptilde(G)\n",
"# a = [0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]\n",
"a = [0.85, 0.86, 0.87, 0.88]\n",
"tau = 1e-6\n",
"maxit = 100\n",
"n = len(G.nodes)\n",
"x = sp.sparse.random(n, len(a), density=0.1, format='lil')"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python3.10/site-packages/scipy/sparse/linalg/_dsolve/linsolve.py:168: SparseEfficiencyWarning: spsolve requires A be CSC or CSR matrix format\n",
" warn('spsolve requires A be CSC or CSR matrix format',\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"iter = 1\n",
"res: [0.21235414 0.22756312 0.24511308 0.26558937]\n",
"err = 0.26558937251088227\n",
"k = 3\n",
"\n",
"\n",
"iter = 2\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 3\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 4\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 5\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 6\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 7\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 8\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 9\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 10\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 11\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 12\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 13\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 14\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 15\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 16\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 17\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 18\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 19\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 20\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 21\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 22\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 23\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 24\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 25\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 26\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 27\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 28\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 29\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 30\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 31\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 32\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 33\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 34\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 35\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 36\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 37\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 38\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 39\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 40\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 41\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 42\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 43\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 44\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 45\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 46\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 47\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 48\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 49\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 50\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 51\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 52\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 53\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 54\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 55\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 56\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 57\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 58\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 59\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 60\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 61\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 62\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 63\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 64\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 65\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 66\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 67\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 68\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 69\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 70\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 71\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 72\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 73\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 74\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 75\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 76\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 77\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 78\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 79\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 80\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 81\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 82\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 83\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 84\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 85\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n",
"\n",
"\n",
"iter = 86\n",
"res: [ 0.81703891 0.8829876 0.10190349 10.69433448]\n",
"err = 10.69433447757171\n",
"k = 3\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[87], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m x, \u001b[38;5;28miter\u001b[39m, mv \u001b[38;5;241m=\u001b[39m \u001b[43mAlgo4\u001b[49m\u001b[43m(\u001b[49m\u001b[43mPt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtau\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmaxit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[85], line 46\u001b[0m, in \u001b[0;36mAlgo4\u001b[0;34m(Pt, v, m, a, tau, maxit, x)\u001b[0m\n\u001b[1;32m 44\u001b[0m V, H, beta, j \u001b[38;5;241m=\u001b[39m Arnoldi((\u001b[38;5;241m1\u001b[39m\u001b[38;5;241m/\u001b[39ma[k])\u001b[38;5;241m*\u001b[39mI \u001b[38;5;241m-\u001b[39m Pt, r_k, m) \u001b[38;5;66;03m# V is the matrix of vectors v, H is the Hessenberg matrix, beta is the norm of the last vector v, j is the number of iterations of Arnoldi\u001b[39;00m\n\u001b[1;32m 45\u001b[0m H \u001b[38;5;241m=\u001b[39m H[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m,:] \u001b[38;5;66;03m# remove the last row of H\u001b[39;00m\n\u001b[0;32m---> 46\u001b[0m V \u001b[38;5;241m=\u001b[39m \u001b[43mV\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;66;03m# remove the last col of V\u001b[39;00m\n\u001b[1;32m 47\u001b[0m mv \u001b[38;5;241m=\u001b[39m mv \u001b[38;5;241m+\u001b[39m j \u001b[38;5;66;03m# update the number of matrix-vector products\u001b[39;00m\n\u001b[1;32m 49\u001b[0m H_e1 \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(H\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]) \n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/scipy/sparse/_lil.py:211\u001b[0m, in \u001b[0;36mlil_matrix.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_intXint(\u001b[39m*\u001b[39mkey)\n\u001b[1;32m 210\u001b[0m \u001b[39m# Everything else takes the normal path.\u001b[39;00m\n\u001b[0;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m IndexMixin\u001b[39m.\u001b[39;49m\u001b[39m__getitem__\u001b[39;49m(\u001b[39mself\u001b[39;49m, key)\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/scipy/sparse/_index.py:69\u001b[0m, in \u001b[0;36mIndexMixin.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[39mif\u001b[39;00m row \u001b[39m==\u001b[39m \u001b[39mslice\u001b[39m(\u001b[39mNone\u001b[39;00m) \u001b[39mand\u001b[39;00m row \u001b[39m==\u001b[39m col:\n\u001b[1;32m 68\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcopy()\n\u001b[0;32m---> 69\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_sliceXslice(row, col)\n\u001b[1;32m 70\u001b[0m \u001b[39melif\u001b[39;00m col\u001b[39m.\u001b[39mndim \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m 71\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_sliceXarray(row, col)\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/scipy/sparse/_lil.py:241\u001b[0m, in \u001b[0;36mlil_matrix._get_sliceXslice\u001b[0;34m(self, row, col)\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_get_sliceXslice\u001b[39m(\u001b[39mself\u001b[39m, row, col):\n\u001b[1;32m 240\u001b[0m row \u001b[39m=\u001b[39m \u001b[39mrange\u001b[39m(\u001b[39m*\u001b[39mrow\u001b[39m.\u001b[39mindices(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m]))\n\u001b[0;32m--> 241\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_row_ranges(row, col)\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/scipy/sparse/_lil.py:290\u001b[0m, in \u001b[0;36mlil_matrix._get_row_ranges\u001b[0;34m(self, rows, col_slice)\u001b[0m\n\u001b[1;32m 287\u001b[0m nj \u001b[39m=\u001b[39m \u001b[39mlen\u001b[39m(col_range)\n\u001b[1;32m 288\u001b[0m new \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_lil_container((\u001b[39mlen\u001b[39m(rows), nj), dtype\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdtype)\n\u001b[0;32m--> 290\u001b[0m _csparsetools\u001b[39m.\u001b[39;49mlil_get_row_ranges(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mshape[\u001b[39m0\u001b[39;49m], \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mshape[\u001b[39m1\u001b[39;49m],\n\u001b[1;32m 291\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrows, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdata,\n\u001b[1;32m 292\u001b[0m new\u001b[39m.\u001b[39;49mrows, new\u001b[39m.\u001b[39;49mdata,\n\u001b[1;32m 293\u001b[0m rows,\n\u001b[1;32m 294\u001b[0m j_start, j_stop, j_stride, nj)\n\u001b[1;32m 296\u001b[0m \u001b[39mreturn\u001b[39;00m new\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"x, iter, mv = Algo4(Pt, v, 100, a, tau, maxit, x)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.8 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

53
tex/intro.tex vendored

@ -0,0 +1,53 @@
\section{Introduction}
The PageRank model was proposed by Google in a series of papers to evaluate accurately the most important web-pages from the World Wide Web matching a set of keywords entered by a user. For search engine rankings, the importance of web-pages is computed from the stationary probability vector of the random process of a web surfer who keeps visiting a large set of web-pages connected by hyperlinks. The link structure of the World Wide Web is represented by a directed graph, the so-called web link graph, and its corresponding adjacency matrix $G \in \N^{n \times n}$ where $n$ denotes the number of pages and $G_{ij}$ is nonzero (being 1) only if the \emph{jth} page has a hyperlink pointing to the \emph{ith} page. The transition probability matrix $P \in \R^{n \times n}$ of the random process has entries as described in \ref{eq:transition}.
\begin{equation}\label{eq:transition}
P(i,j) =
\begin{cases}
\displaystyle \frac{1}{\sum_{k=1}^n G_{kj}} & \text{if } G_{ij} \neq 0 \\
0 & \text{otherwise}
\end{cases}
\end{equation}
\noindent The entire random process needs a unique stationary distribution. To ensure that this property is satisfied, the transition matrix $P$ is usually modified to be an irreducible stochastic matrix $A$ (called the Google matrix) as follows:
% \noindent To ensure that the random process has a unique stationary distribution and it will not stagnate, the transition matrix P is usually modified to be an irreducible stochastic matrix $A$ (called the Google matrix) as follows
\begin{equation}\label{eq:google}
A = \alpha \tilde P + (1 - \alpha)v e^T
\end{equation}
\noindent In \ref{eq:google} we have defined a new matrix $\tilde P = P + vd^T$, where $d \in \N^{n \times 1}$ is a binary vector tracing the indices of the dangling web pages with no hyperlinks, i.e., $d(i) = 1$ if the \emph{i-th} page has no hyperlinks, $v \in \R^{n \times 1}$ is a probability vector, $e = [1, 1, ... ,1]^T$ and $0<\alpha<1$ is the so-called damping factor, representing the probability that the surfer moves by clicking a hyperlink rather than in other ways. Mathematically, the PageRank model can be formulated as the problem of finding the positive unit eigenvector $x$ (the so-called PageRank vector) such that
\begin{equation}\label{eq:pr}
Ax = x, \quad \lVert x \rVert = 1, \quad x > 0
\end{equation}
or, equivalently, as the solution of the linear system
\begin{equation}\label{eq:pr2}
(I - \alpha \tilde P)x = (1 - \alpha)v
\end{equation}
\noindent The authors of the paper \cite{SHEN2022126799} emphasize how, in the past decade or so, considerable research attention has been devoted to the efficient solution of problems \ref{eq:pr} and \ref{eq:pr2}, especially when $n$ is very large. For moderate values of the damping factor, e.g. for $\alpha = 0.85$ as initially suggested by Google for search engine rankings, solution strategies based on the simple Power method have proved to be very effective. However, when $\alpha$ approaches 1, as is required in some applications, the convergence rates of classical stationary iterative methods, including the Power method, tend to deteriorate sharply, and more robust algorithms need to be used. \vspace*{0.4cm}
\noindent In the reference paper that we are using for this project, the authors focus their attention on the area of PageRank computations with the same network structure but multiple damping factors. For example, in the Random Alpha PageRank model used in the design of anti-spam mechanisms \cite{Constantine2009Random}, the rankings corresponding to many different damping factors close to 1 need to be computed simultaneously. They explain that the problem can be expressed mathematically as solving a sequence of linear systems
\begin{equation}\label{eq:pr3}
(I - \alpha_i \tilde P)x_i = (1 - \alpha_i)v \quad \alpha_i \in (0, 1) \quad \forall i \in \{1, 2, ..., s\}
\end{equation}
As we know, standard PageRank algorithms applied to \ref{eq:pr3} would solve the $s$ linear systems independently. Although these solutions can be performed in parallel, the process would still demand large computational resources for high-dimension problems.
This consideration motivated the authors to search for novel methods with reduced algorithmic and memory complexity, to afford the solution of larger problems on moderate computing resources. They suggest rewriting the PageRank problem with multiple damping factors given at once \ref{eq:pr3} as a sequence of shifted linear systems of the form:
\begin{equation}
\Big(\frac{1}{\alpha_i}I - \tilde P \Big)x^{(i)} = \frac{1 - \alpha_i}{\alpha_i}v \quad \forall i \in \{1, 2, ..., s\} \quad 0 < \alpha_i < 1
\end{equation}
We know from the literature that shifted Krylov methods may still suffer from slow convergence when the damping factor approaches 1, requiring larger search spaces to converge with satisfactory speed. To overcome this problem, \cite{SHEN2022126799} suggests combining stationary iterative methods and shifted Krylov subspace methods. The authors derive an implementation of the Power method that solves the PageRank problem with multiple damping factors at almost the same computational cost as the standard Power method for solving one single system. They also demonstrate that this shifted Power method generates collinear residual vectors. Based on this result, they use the shifted Power iterations to provide smooth initial solutions for running shifted Krylov subspace methods such as \texttt{GMRES}. Besides, they discuss how to apply a seed system choosing strategy and extrapolation techniques to further speed up the iterative process.
% As an attempt of a possible remedy in this situation, we present a framework that combines. shifted stationary iterative methods and shifted Krylov subspace methods. In detail, we derive the implementation of the Power method that solves the PageRank problem with multiple damping factors at almost the same computational cost of the standard Power method for solving one single system. Furthermore, we demonstrate that this shifted Power method generates collinear residual vectors. Based on this result, we use the shifted Power iterations to provide smooth initial solutions for running shifted Krylov subspace methods such as GMRES. Besides, we discuss how to apply seed system choosing strategy and extrapolation techniques to further speed up the iterative process.
\subsection{Overview of the classical PageRank problem}
The Power method is considered one of the algorithms of choice for solving either the eigenvalue formulation \ref{eq:pr} or the linear system formulation \ref{eq:pr2} of the PageRank problem, as it was originally used by Google. The Power iteration writes as
\begin{equation}\label{eq:power}
x_{k+1} = Ax_k = \alpha \tilde P x_k + (1 - \alpha)v
\end{equation}
The convergence behavior is determined mainly by the ratio between the two largest eigenvalues of $A$. When $\alpha$ gets closer to $1$, though, the convergence can slow down significantly. \\
\noindent As stated in \cite{SHEN2022126799}, the number of iterations required to reduce the initial residual down to a tolerance $\tau$, measured as $\tau = \lVert Ax_k - x_k \rVert = \lVert x_{k+1} - x_k \rVert$, can be estimated as $\frac{\log_{10} \tau}{\log_{10} \alpha}$. The authors provide an example: when $\tau = 10^{-8}$, the Power method requires about 175 steps to converge for $\alpha = 0.9$, but the iteration count rapidly grows to 1833 for $\alpha = 0.99$. Therefore, for values of the damping parameter very close to 1, more robust alternatives to the simple Power algorithm should be used.
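\noindent For reference, below is a minimal dense-NumPy sketch of the Power iteration \ref{eq:power}; the names \texttt{P\_tilde} and \texttt{v} are illustrative assumptions, not the repository implementation.
\begin{verbatim}
import numpy as np

def power_pagerank(P_tilde, v, alpha=0.85, tau=1e-8, max_iter=2000):
    # x_{k+1} = alpha * P~ @ x_k + (1 - alpha) * v
    x = v.copy()
    for k in range(max_iter):
        x_new = alpha * (P_tilde @ x) + (1 - alpha) * v
        # stop on the residual ||x_{k+1} - x_k||
        if np.linalg.norm(x_new - x, 1) < tau:
            return x_new, k + 1
        x = x_new
    return x, max_iter
\end{verbatim}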

Binary file not shown.

170
tex/main.tex vendored

@ -1,5 +1,5 @@
-\documentclass[12pt]{article}
+\documentclass[11pt]{article}
-\usepackage[margin=1in]{geometry}
+\usepackage[margin=1.2in]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
@ -17,6 +17,21 @@
\usepackage{lipsum}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{mathtools}
\usepackage{nccmath}
\usepackage[most]{tcolorbox}
\newtcolorbox[auto counter]{problem}[1][]{%
enhanced,
breakable,
colback=white,
colbacktitle=white,
coltitle=black,
fonttitle=\bfseries,
boxrule=.6pt,
titlerule=.2pt,
toptitle=3pt,
bottomtitle=3pt,
title=GitHub repository of this project}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
@ -42,153 +57,16 @@ for solving PageRank with multiple damping factors}
\tableofcontents
\clearpage
-\section{Introduction}
-[... the inline Introduction and shifted-Power-method sections removed here are identical to the text now living in tex/intro.tex and tex/shifted.tex ...]
+\include{intro.tex}
+\include{shifted.tex}
+\include{shift_GMRES.tex}
+\include{num.tex}
\clearpage
\bibliographystyle{unsrt}
\bibliography{ref}
\nocite{*}
\end{document}

81
tex/num.tex vendored

@ -0,0 +1,81 @@
\section{Numerical experiments}\label{sec:exp}
In this experiment, we test the performance of the shifted Power method against the conventional Power method for solving PageRank problems with multiple damping factors, namely $\{ \alpha_1 = 0.85, ~\alpha_2 = 0.86, ~...~ ,~ \alpha_{15} = 0.99 \}$, on the \texttt{web-Stanford} and \texttt{web-BerkStan} datasets. The \texttt{web-Stanford} dataset is a directed graph with $|V| = 281,903$ nodes and $|E| = 2,312,497$ edges, and the \texttt{web-BerkStan} dataset is a directed graph with $|V| = 685,230$ nodes and $|E| = 7,600,595$ edges. The datasets are available at \url{http://snap.stanford.edu/data/web-Stanford.html} and \url{http://snap.stanford.edu/data/web-BerkStan.html} respectively, and are stored in the \texttt{.txt} edge-list format. The characteristics of the datasets are summarized in Table \ref{tab:datasets}.
% create a table with cols: Name, Number of Nodes, Number of edges, Density, Average Number of zeros (per row)
\begin{table}[h]
\centering
\begin{tabular}{|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{Nodes} & \textbf{Edges} & \textbf{Density} \\ \hline
\texttt{web-Stanford} & $281,903$ & $2,312,497$ & $2.9099 \times 10^{-5}$ \\ \hline
\texttt{web-BerkStan} & $685,230$ & $7,600,595$ & $1.6187 \times 10^{-5}$ \\ \hline
\end{tabular}
\caption{Summary of the datasets used in the experiments.}
\label{tab:datasets}
\end{table}
\noindent The personalization vector $v$ has been set to $v = [1, 1, ... , 1]^T/n$. All the experiments are run in Python 3.10 on a 64-bit Arch Linux machine with an AMD Ryzen™ 5 2600 Processor and 16 GB of RAM.
\subsection{Technical details}
\begin{problem}
\centering
\url{https://github.com/lukefleed/ShfitedPowGMRES}
\end{problem}
\noindent In the project github repository we can find an \texttt{algo.py} file where all the functions used in the experiments are implemented. The \texttt{algo.py} file contains the following functions:
\paragraph{load\_data} This function loads the datasets from the \texttt{.txt} edge-list format and returns a networkx graph object. It takes as input a string literal; the options are \texttt{web-stanford} and \texttt{web-BerkStan}.
\paragraph{pagerank} This function computes the PageRank vector of a given graph. It takes as input the following parameters:
\begin{itemize}
\item \texttt{G:} a networkx graph object.
\item \texttt{alpha:} Damping parameter for PageRank, default=$0.85$.
\item \texttt{personalization:} The "personalization vector", a dictionary mapping a subset of graph nodes to their personalization values. At least one personalization value must be non-zero. If not specified, each node's personalization value will be $1/N$, where $N$ is the number of nodes in \texttt{G}.
\item \texttt{max\_iter:} The maximum number of iterations in power method eigenvalue solver. Default is $200$.
\item \texttt{nstart:} Starting value of PageRank iteration for each node. Default is $None$.
\item \texttt{tol:} Error tolerance used to check convergence in power method solver. Default is $10^{-6}$.
\item \texttt{weight:} Edge data key corresponding to the edge weight. If None, then uniform weights are assumed. Default is $None$.
\item \texttt{dangling:} The outedges to be assigned to any "dangling" nodes, i.e., nodes without any outedges. The dict key is the node the outedge points to and the dict value is the weight of that outedge. By default, dangling nodes are given outedges according to the personalization vector (uniform if not specified).
\end{itemize}
This function is strongly based on the \texttt{pagerank\_scipy} function of the networkx library; a hypothetical usage sketch follows.
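\noindent A hypothetical usage sketch, assuming only the interface described above (the damping values in the loop are illustrative):
\begin{verbatim}
# hypothetical usage of the helpers in algo.py
from algo import load_data, pagerank

G = load_data("web-stanford")  # networkx graph object
for alpha in [0.85, 0.90, 0.95, 0.99]:
    x = pagerank(G, alpha=alpha, tol=1e-6, max_iter=200)
\end{verbatim}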
\paragraph{shifted\_pow\_pagerank} This is the implementation of Algorithm \ref{alg:algo1}, with the difference that it uses the $l_1$ norm, since the $l_2$ norm is still not implemented for sparse matrices in SciPy.
\vspace{0.5cm}
\noindent There is also another function called \texttt{pagerank\_numpy}, whose eigenvector calculation uses NumPy's interface to the \texttt{LAPACK} eigenvalue solvers. This is the fastest and most accurate approach for small graphs. Unfortunately, the eigenvector calculation is not stable for large graphs, so the \texttt{pagerank\_numpy} function is not used in the experiments.
\subsection{Convergence results for the Shifted Power method}
In the PageRank formulation with multiple damping factors, the iterative solution of each $i$-th linear system is started from the initial guess $x_0^{(i)} = v$ and stopped when either the solution $x_k^{(i)}$ satisfies
\begin{equation*}
\frac{\lVert (1 - \alpha_i)v - (I - \alpha_i \tilde P) x_k^{(i)} \rVert_2}{\lVert x_k^{(i)} \rVert_2} < 10^{-6}
\end{equation*}
or the number of matrix-vector products exceeds $200$. \vspace*{0.5cm}
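\noindent In code, this stopping test reads roughly as follows; a sketch assuming a sparse matrix \texttt{Pt} for $\tilde P$ and dense NumPy vectors:
\begin{verbatim}
import numpy as np

def converged(Pt, v, x_k, alpha, tol=1e-6):
    # relative residual of (I - alpha * P~) x = (1 - alpha) v
    r = (1 - alpha) * v - (x_k - alpha * (Pt @ x_k))
    return np.linalg.norm(r) / np.linalg.norm(x_k) < tol
\end{verbatim}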
\noindent In this experiment we test the performance of the shifted Power method against the conventional Power method for solving PageRank problems with multiple damping factors.
% create a table to store the results on each dataset for the two methods. We are interest in the mv and cpu time
\begin{table}[h]
\centering
\begin{tabular}{|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{Method} & \textbf{CPU Time (s)} & \textbf{mv} \\ \hline
\texttt{web-Stanford} & \texttt{Power} & $71.7$ & $70$ \\ \hline
\texttt{web-Stanford} & \texttt{Shifted Power} & $665.4$ & $56$ \\ \hline
\hline
\texttt{web-BerkStan} & \texttt{Power} & $202.1$ & $49$ \\ \hline
\texttt{web-BerkStan} & \texttt{Shifted Power} & $1342.9$ & $73$ \\ \hline
\end{tabular}
\caption{Summary of the experiments.}
\label{tab:results}
\end{table}
\noindent The results presented in Table \ref{tab:results} contrast somewhat with what the paper \cite{SHEN2022126799} reports. In their experiments the CPU time of the shifted Power method is lower than that of the standard Power method, while in our experiments it is far higher. Furthermore, theoretically, the number of matrix-vector products should be lower for the shifted Power method; in particular, it should equal that of the standard PageRank algorithm run with the largest damping factor. However, in our experiments the number of matrix-vector products is higher for the shifted Power method on the \texttt{web-BerkStan} dataset and lower on the \texttt{web-Stanford} dataset. \vspace*{0.5cm}
\noindent There may be many reasons for these differences. The most plausible one is the difference in programming language and implementation, combined with a possible misunderstanding of the pseudo-code presented in \cite{SHEN2022126799}. My standard PageRank function is a slightly modified version of the networkx library function \texttt{pagerank\_scipy}, so I suppose it is better optimized than the shifted Power method implementation that I wrote. Also, the \texttt{web-BerkStan} network is very different from the \texttt{web-Stanford} one: its adjacency matrix has far more rows full of zeros ($4744$ vs $172$). This might negatively affect the shifted Power method on networks with many dangling nodes. \vspace*{0.5cm}
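\noindent A quick way to count those zero rows (the dangling nodes) with networkx is sketched below; the file path is an assumption:
\begin{verbatim}
import networkx as nx

# dangling nodes are pages with no out-links,
# i.e. zero rows of the adjacency matrix
G = nx.read_edgelist("../data/web-BerkStan.txt",
                     create_using=nx.DiGraph(), nodetype=int)
print(sum(1 for u in G.nodes() if G.out_degree(u) == 0))
\end{verbatim}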

11
tex/ref.bib vendored

@ -24,14 +24,3 @@ abstract = {}
month=1,
day=1,
}
# cite this repo made by Luca Lombardo https://github.com/lukefleed/ShfitedPowGMRES
@misc{ShfitedPowGMRES,
author = {Lombardo, Luca},
title = {Shifted power method for solving PageRank with multiple damping factors},
year = {2022},
publisher = {},
journal = {GitHub repository},
howpublished = {\url{https://github.com/lukefleed/ShfitedPowGMRES}}
}

75
tex/shift_GMRES.tex vendored

@ -0,0 +1,75 @@
\clearpage
\section{Shifted power-GMRES method}
In this section we'll cover the approach that the authors of \cite{SHEN2022126799} used to combine the shifted Power method with the fast shifted \texttt{GMRES} method, creating a hybrid algorithm for solving complex PageRank problems with multiple damping factors.
\subsection{Restarted GMRES method}
The Restarted GMRES method (hereafter referred to as GMRES in short) is a non-symmetric Krylov subspace solver based on the Arnoldi decomposition procedure, which the authors sketch in the following algorithm:
\begin{algorithm}[H]
\caption{Arnoldi}
\label{alg:arnoldi}
\begin{algorithmic}[1]
\Require $A, v_0, m$
\Ensure $V_m, H_m, v_{m+1}, h_{m+1,m}, \beta, j$
\State Compute $\beta = \lVert v_0 \rVert$
\State $v_1 = v_0/ \beta$
\For {$j = 1:m$}
\State Compute $w = Av_j$
\For {$i = 1:j$}
\State Compute $h_{i,j} = v_i^T w$
\State Compute $w = w - h_{i,j} v_i$
\EndFor
\State $h_{j+1,j} = \lVert w \rVert$
\If {$h_{j+1,j} = 0$}
\State $m = j$,
\State $v_{m+1} = 0$
\State \textbf{break}
\Else
\State $v_{j+1} = w / h_{j+1,j}$
\EndIf
\EndFor
\end{algorithmic}
\end{algorithm}
\noindent Where $A \in \R^{n\times n}$ and $v_0 \in \R ^{n \times 1}$ is the initial vector. After $m$ iterations, the Arnoldi procedure produces the orthonormal basis $V_m = [v_1, \dots, v_m]$, the upper Hessenberg matrix $H_m \in \R^{m\times m}$, the residual vector $v_{m+1} \in \R^{n \times 1}$ and the residual norm $h_{m+1,m} \in \R$. Starting from $v_0 = b - Ax_0$ with an initial guess $x_0$, after running $m$ steps of Algorithm \ref{alg:arnoldi} the \texttt{GMRES} method produces the approximate solution $\tilde x$ of the linear system $Ax = b$ that minimizes the residual norm $\lVert b - A \tilde x \rVert$ in the Krylov subspace of dimension $m$. \vspace*{0.4cm}
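\noindent A compact dense-NumPy sketch of Algorithm \ref{alg:arnoldi}, for illustration only (the repository code works on sparse matrices):
\begin{verbatim}
import numpy as np

def arnoldi(A, v0, m):
    n = A.shape[0]
    V = np.zeros((n, m + 1))   # orthonormal basis of the Krylov subspace
    H = np.zeros((m + 1, m))   # upper Hessenberg matrix
    beta = np.linalg.norm(v0)
    V[:, 0] = v0 / beta
    for j in range(m):
        w = A @ V[:, j]
        for i in range(j + 1):             # modified Gram-Schmidt
            H[i, j] = V[:, i] @ w
            w = w - H[i, j] * V[:, i]
        H[j + 1, j] = np.linalg.norm(w)
        if H[j + 1, j] == 0:               # happy breakdown
            return V[:, :j + 1], H[:j + 2, :j + 1], beta, j
        V[:, j + 1] = w / H[j + 1, j]
    return V, H, beta, m
\end{verbatim}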
\noindent We know that the accuracy of the approximate solution $\tilde x$ of \texttt{GMRES} depends heavily on the dimension $m$ of the search space. The authors of \cite{SHEN2022126799} propose to use the \texttt{GMRES} method as a preconditioner for the shifted Power method presented in the previous section. The core idea of the method is to run standard GMRES on a seed system and to approximate the other solutions as by-products. The theoretical basis is the shift-invariance property of the Krylov subspace, which enables us to use only one Krylov subspace for all the shifted systems, provided that the residual vectors are collinear to one another. The algorithm proposed by the authors is presented in Algorithm \ref{alg:shifted_GMRES}.
\begin{algorithm}[H]
\caption{Shifted GMRES}
\label{alg:shifted_GMRES}
\begin{algorithmic}[1]
\Require $\tilde P, v, m, \alpha_i, maxit, x_0^i ~~ (1 \leq i \leq s)$
\Ensure $x^i, res_i ~~(1 \leq i \leq s), mv$
\State Set $r_0^i = \frac{1-\alpha_i}{\alpha_i} v - \Big(\frac{1}{\alpha_i} I - \tilde P \Big) x_0^i$, $iter = 1$
\State Set $res_i = \alpha_i \lVert v \rVert ~~ (1 \leq i \leq s)$
\State Set mv = 0
\While {$\max (res_i) \geq \tau~~ \&\& ~~ iter \leq maxit$}
\State Find $k$ that satisfies $res_k = \max (res_i)$
\State Compute $\gamma^i = \frac{res_i \alpha_k}{res_k \alpha_i}$ for all $i \neq k$
\State Run Arnoldi by $ [V_m, \bar H_m^k, v_{m+1}, \bar h_{m+1,m}, \beta, j] = Arnoldi(\frac{1}{\alpha_k}I - \tilde P, r_0^k, m)$
\State Set $mv = mv + j$
\State Compute $y_k$, the minimizer of $\lVert \beta e_1 - \bar H_m^k y_k \rVert_2$
\State Compute $x^k = x_0^k + V_m y_k$
\State Compute $res_k = \alpha_k \lVert \beta e_1 - \bar H_m^k y^k \rVert$
\For {i = 1, 2, \dots , k-1, k+1, \dots , s}
\If {$res_i \geq \tau$}
\State Set $\bar H_m^i = \bar H_m^k + \Big( \frac{1-\alpha_i}{\alpha_i} - \frac{1-\alpha_k}{\alpha_k} \Big) I_m$
\State Solve $y_i$ and $\gamma_i$ from $\begin{bmatrix} \bar H_m^i & z \end{bmatrix} \begin{bmatrix} y^i \\ \gamma^i \end{bmatrix} = \gamma^i \beta e_1$
\State Set $x^i = x_0^i + V_m y^i$
\State Set $res_i = \frac{\alpha_i}{\alpha_k} \gamma_k^i res_k$
\EndIf
\EndFor
\State Set $iter = iter + 1$
\State Set $x_0^i = x^i$
\EndWhile
\end{algorithmic}
\end{algorithm}
\noindent Where $z = \beta e_1 - \bar H_m^k y^k$ is computed for the seed system. In line 15, by solving this small-size system we obtain the vector $y^i$ and the scalar $\gamma^i$ that ensure the collinearity of the shifted residuals.
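\noindent Lines 14--15 can be sketched as follows; a NumPy sketch under the assumption that the bracketed system is square of size $m+1$, with \texttt{gamma0} standing for the collinearity factor $\gamma^i$ computed at line 6:
\begin{verbatim}
import numpy as np

def shifted_small_solve(Hk, z, beta, a_i, a_k, gamma0):
    m = Hk.shape[1]
    # line 14: H_m^i = H_m^k + ((1-a_i)/a_i - (1-a_k)/a_k) I_m
    sigma = (1 - a_i) / a_i - (1 - a_k) / a_k
    Hi = Hk + sigma * np.eye(m + 1, m)
    # line 15: solve [H_m^i  z] [y; gamma] = gamma0 * beta * e1
    e1 = np.zeros(m + 1)
    e1[0] = 1.0
    sol = np.linalg.solve(np.hstack([Hi, z.reshape(-1, 1)]),
                          gamma0 * beta * e1)
    return sol[:m], sol[m]   # y^i and the new gamma^i
\end{verbatim}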
\paragraph{Problems:} The implementation of this algorithm has been very problematic. The key of this algorithm is the use of the \emph{seed choosing strategy} described in \cite{SHEN2022126799}. However, during my tests, after the second iteration the value of $k$ remains the same and the $res$ vector does not change. This obviously leads to a stall, where the program runs without updating the values until it reaches the maximum number of iterations allowed. This problem is still under investigation. For completeness, I have anyway provided a notebook with the code of the algorithm in the github repository, even if it's still not working. I think the problem is related to some misunderstanding of the pseudo-code provided in the paper, but I have not been able to find it yet. For this reason, there are no test results for this algorithm in the following section.

74
tex/shifted.tex vendored

@ -0,0 +1,74 @@
\section{The shifted power method for PageRank computations}
In this section we'll see the extensions of stationary iterative methods for the solution of PageRank problems with multiple damping factors, as presented in \cite{SHEN2022126799}. We are interested in knowing whether, for each method, there exists an implementation such that the computational cost of solving the PageRank problem with multiple damping factors is comparable to that of solving the ordinary PageRank problem with a single damping factor.
\subsection{The implementation of the shifted power method}
Inspired by the way shifted Krylov subspaces save computational cost, the authors of \cite{SHEN2022126799} investigate whether the stationary iterative methods duplicate any calculations across the multiple linear systems of this problem class, so that the duplicated computations can be performed once and reused for all systems. It's some sort of dynamic programming approach. Firstly, they analyze the Power method applied to the sequence of linear systems in \ref{eq:pr2}. It computes at the \emph{k-th} iteration approximate solutions $x_k^{(i)} ~ (1 \leq i \leq s)$ of the form
\begin{equation}
x_k^{(i)} = \alpha_i^k \tilde P^k x_0^{(i)} + (1 - \alpha_i) \sum_{j=0}^{k-1} \alpha_i^j \tilde P^j v
\end{equation}
If the $s$ systems in \ref{eq:pr2} are solved synchronously, this means that all the $x^{(i)}_k$ are computed only after all previous approximations $x^{(j)}_{k-1}$ are available. We can now rearrange the computation efficiently as reported in \cite{SHEN2022126799}:
\begin{itemize}
\item at the first iteration
\begin{itemize}
\item compute and store $\mu_1 = \tilde P x_0$ and $\mu_2 = v$;
\item compute and store $x_1^{(i)} = \alpha_i \mu_1 + (1-\alpha_i)\mu_2;$
\end{itemize}
\item at any other subsequent iteration $k>1$
\begin{itemize}
\item compute and store $x_k^{(i)} := (1-\alpha_i)\sum_{j=0}^{k-2} \alpha_i^j \tilde P^j v = x_{k-1}^{(i)} - \alpha_i^{k-1} \mu_1$;
\item compute and store $\mu_1 = \tilde P \mu_1$ and $\mu_2 = \tilde P \mu_2$;
\item compute and store $x_k^{(i)} = \alpha_i \mu_1 + x_k^{(i)} + (1-\alpha_i)\alpha^{k-1}_i \mu_2$.
\end{itemize}
\end{itemize}
This implementation requires at most $2$ matrix-vector products at each step, which is a significant gain compared to the $s$ matrix-vector products required by the standard Power method to compute $x^{(i)}_{k+1}$ , especially when $s \gg 2$. \vspace{0.4cm}
\noindent This was of course still a theoretical explanation. An efficient implementation computes and stores $\mu = \tilde Pv -v$ at the first iteration and then stores
$$\mu = \tilde P^{k-1}(\tilde P v - v) = \tilde P \cdot (\tilde P^{k-2}(\tilde P v - v))$$
at each \emph{k-th} iteration ($k > 1$), finally forming each approximate solution as $x_k^{(i)} = \alpha_i^k \mu + x_{k-1}^{(i)}$. The residual vector $r_k^{(i)}$ associated with the approximate solution $x_k^{(i)}$ has the following expression
\begin{equation}
r_k^{(i)} = A x_k^{(i)} - x_k^{(i)} = x_{k+1}^{(i)} - x_k^{(i)} = \alpha_i^{k+1} \tilde P^k (\tilde P v - v)
\end{equation}
Since in general each of the $s$ linear systems may require a different number of Power iterations to converge, the $s$ residual norms have to be monitored separately to test the convergence. \vspace{0.4cm}
\noindent Now we can summarize the efficient implementation of the Power method presented in this section for solving problem \ref{eq:pr2} in Algorithm \ref{alg:algo1}, as reported in \cite{SHEN2022126799}. From now on, we'll refer to this implementation as the \emph{Shifted-Power method}.
\begin{algorithm}\label{alg:algo1}
\caption{Shifted-Power method for PageRank with multiple damping factors}\label{alg:algo1}
\begin{algorithmic}
\Require $\tilde P, ~v, ~\tau, ~\max_{mv}, ~\alpha_i ~ (1 \leq i \leq s)$
\Ensure $mv,~ x^{(i)},~ r^{(i)} ~ (1 \leq i \leq s)$
\State Compute $\mu = \tilde P v - v$
\State Set $mv =1$
\For {$i = 1:s$}
\State Compute $r^{(i)} = \alpha_i \mu$
\State Compute $Res(i) = \lVert r^{(i)} \rVert$
\If {$Res(i) \geq \tau$}
\State Compute $x^{(i)} = r^{(i)} + v$
\EndIf
\EndFor
\While {$\max(Res \geq \tau)$ and $ mv \leq \max_{mv}$}
\State compute $\mu = \tilde P \mu$
\State $mv = mv + 1$
\For {$i = 1:s$}
\If {$Res(i) \geq \tau$}
\State Compute $r^{(i)} = \alpha_i^{k+1} \mu$
\State Compute $Res(i) = \lVert r^{(i)} \rVert$
\If {$Res(i) \geq \tau$}
\State Compute $x^{(i)} = r^{(i)} + x^{(i)}$
\EndIf
\EndIf
\EndFor
\EndWhile
\end{algorithmic}
\end{algorithm}
\noindent Where $mv$ is an integer that counts the number of matrix-vector products performed by the algorithm. The algorithm stops when either all the residual norms are smaller than the tolerance $\tau$ or the maximum number of matrix-vector products is reached. An implementation of this algorithm written in Python is available in the github repository of this project.
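\noindent For reference, below is a minimal dense-NumPy sketch of Algorithm \ref{alg:algo1}; the repository implementation \texttt{shifted\_pow\_pagerank} works on sparse matrices and uses the $l_1$ norm:
\begin{verbatim}
import numpy as np

def shifted_power(Pt, v, alphas, tau=1e-6, max_mv=200):
    mu = Pt @ v - v                        # shared by all damping factors
    mv, k = 1, 1
    x = {a: v + a * mu for a in alphas}    # x_1^{(i)} = v + alpha_i * mu
    res = {a: a * np.linalg.norm(mu, 1) for a in alphas}
    while max(res.values()) >= tau and mv <= max_mv:
        mu = Pt @ mu                       # the only mat-vec of the sweep
        mv += 1
        k += 1
        for a in alphas:
            if res[a] >= tau:
                r = (a ** k) * mu          # r^{(i)} = alpha_i^k * mu
                res[a] = np.linalg.norm(r, 1)
                if res[a] >= tau:
                    x[a] = x[a] + r
    return x, res, mv
\end{verbatim}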
\clearpage

@ -1,286 +0,0 @@
#!/usr/bin/env python3

# Importing the libraries
import os
import wget
import gzip
import time
import warnings
import scipy as sp
import numpy as np
import pandas as pd
import networkx as nx
from os.path import exists
from scipy.sparse import *
from scipy.sparse.linalg import norm
import plotly.graph_objs as go

# silence the pandas FutureWarning raised by DataFrame.append
warnings.simplefilter(action='ignore', category=FutureWarning)

class Utilities:
    # Importing the dataset
    def load_data():
        # Loading the dataset
        dataset = int(input("Choose the dataset:\n [1] web-Stanford (use this one for now)\n [2] web-BerkStan: \nEnter an option: "))
        if dataset == 1:
            if exists('../data/web-Stanford.txt'):
                dataset = '../data/web-Stanford.txt'
            else:
                print("\nThe file doesn't exist, download it from https://snap.stanford.edu/data/web-Stanford.html")
                # if there is no folder data, create it
                if not exists('../data'):
                    os.makedirs('../data')
                # Downloading the dataset
                url = 'https://snap.stanford.edu/data/web-Stanford.txt.gz'
                wget.download(url, '../data/web-Stanford.txt.gz')
                # Unzipping the dataset
                with gzip.open('../data/web-Stanford.txt.gz', 'rb') as f_in:
                    with open('../data/web-Stanford.txt', 'wb') as f_out:
                        f_out.write(f_in.read())
                # delete the zipped file
                os.remove('../data/web-Stanford.txt.gz')
                dataset = '../data/web-Stanford.txt'
                print("\nDataset downloaded\n")
        elif dataset == 2:
            if exists('../data/web-BerkStan.txt'):
                dataset = '../data/web-BerkStan.txt'
            else:
                print("\nThe file doesn't exist, download it from https://snap.stanford.edu/data/web-BerkStan.html")
                # if there is no folder data, create it
                if not exists('../data'):
                    os.makedirs('../data')
                # Downloading the dataset
                url = 'https://snap.stanford.edu/data/web-BerkStan.txt.gz'
                wget.download(url, '../data/web-BerkStan.txt.gz')
                # Unzipping the dataset
                with gzip.open('../data/web-BerkStan.txt.gz', 'rb') as f_in:
                    with open('../data/web-BerkStan.txt', 'wb') as f_out:
                        f_out.write(f_in.read())
                # delete the zipped file
                os.remove('../data/web-BerkStan.txt.gz')
                dataset = '../data/web-BerkStan.txt'
                print("\nDataset downloaded\n")
        return dataset

    # Creating the graph from the dataset
    def create_graph(dataset):
        print("\nCreating the graph...")
        G = nx.read_edgelist(dataset, create_using=nx.DiGraph(), nodetype=int)
        n = G.number_of_nodes()
        print("Graph created based on the dataset\n")
        return G, n

    # The matrix is filled with zeros, and the (i,j) element is 1/(number of nodes connected to i) if the node i is connected to the node j
    def create_matrix(G):
        print("Creating the transition probability matrix...")
        n = G.number_of_nodes() # take n from the graph instead of relying on a global
        P = sp.sparse.lil_matrix((n,n))
        for i in G.nodes():
            for j in G[i]: # G[i] is the list of nodes connected to i, its neighbors
                P[i-1,j-1] = 1/len(G[i])
        print("Transition probability matrix created\n")
        return P

    # The vector is filled with d(i) = 1 if the i-th row of the matrix P is filled with zeros, otherwise it is 0
    def dangling_nodes(P,n):
        print("Creating the list of dangling nodes...")
        d = sp.sparse.lil_matrix((n,1))
        for i in range(n):
            if P[i].sum() == 0:
                d[i] = 1
        print("List of dangling nodes created\n")
        return d

    # For now it is set to equally distribute the probability to all the nodes
    def probability_vector(n):
        print("Creating the probability vector...")
        v = sp.sparse.lil_matrix((n,1))
        for i in range(n):
            v[i] = 1/n
        print("Probability vector created\n")
        return v

    def transition_matrix(P, v, d):
        print("Creating the transition matrix...")
        Pt = P + v @ (d.T)
        print("Transition matrix created\n")
        return Pt

    # it can vary from 0 to 1, the higher the value the higher the probability to jump to a random page
    def alpha():
        a = []
        for i in range(85,100):
            a.append(i/100)
        return a

# Class for plotting the results obtained. For now it can only plot the first algorithm. To be updated once all the algorithms are implemented and the test cases are well defined
class Plotting:
    def tau_over_iterations(df):
        x = df['tau'][::-1].tolist()
        y = df['products m-v'].tolist()
        fig = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'),
                        layout=go.Layout(title='products needed for the convergence', xaxis_title='tau', yaxis_title='products matrix vector'))
        # save the figure as a html file
        fig.write_html("../data/results/algo1/taus_over_prods.html")
        print("The plot has been saved in the folder data/results/algo1")

    def tau_over_time(df):
        x1 = df['tau'][::-1].tolist()
        y1 = df['time'].tolist()
        fig = go.Figure(data=go.Scatter(x=x1, y=y1, mode='lines+markers'),
                        layout=go.Layout(title='Time needed for the convergence', xaxis_title='tau', yaxis_title='time (seconds)'))
        # save the plot in a html file
        fig.write_html("../data/results/algo1/taus_over_time.html")
        print("The plot has been saved in the folder data/results/algo1")

class Algorithms:
    # Power method adapted to the PageRank problem with different damping factors. Referred to as Algorithm 1 in the paper
    def algo1(Pt, v, tau, max_mv, a: list):
        start_time = time.time()
        print("STARTING ALGORITHM 1...")
        u = Pt @ v - v
        mv = 1 # number of matrix-vector multiplications
        n = Pt.shape[0] # problem size, taken from Pt instead of a global
        r = sp.sparse.lil_matrix((n,1))
        Res = np.zeros(len(a)) # dense vector of the s residual norms
        x = sp.sparse.lil_matrix((n,1))
        for i in range(len(a)):
            r = a[i]*(u)
            normed_r = norm(r)
            Res[i] = normed_r
            if Res[i] > tau:
                x = r + v
        while Res.max() > tau and mv < max_mv:
            u = Pt @ u
            mv += 1
            for i in range(len(a)):
                if Res[i] >= tau:
                    r = (a[i]**(mv+1))*(u)
                    Res[i] = norm(r)
                    if Res[i] > tau:
                        x = r + x
        if mv == max_mv:
            print("The algorithm didn't converge in ", max_mv, " iterations")
        else:
            print("The algorithm converged with ", mv, " matrix-vector multiplications executed")
        total_time = time.time() - start_time
        total_time = round(total_time, 2)
        print("The algorithm took ", total_time, " seconds to run\n")
        return mv, x, r, total_time

    # Refers to Algorithm 2 in the paper
    def Arnoldi(A,v0,m):
        v = v0
        beta = norm(v)
        v = v/beta
        H = sp.sparse.lil_matrix((m+1,m))
        V = sp.sparse.lil_matrix((A.shape[0],m+1))
        V[:,0] = v # each column of V is a basis vector
        for j in range(m):
            w = A @ v
            for i in range(j+1): # orthogonalize against all previous basis vectors
                tmp = V[:,i].T @ w # tmp is a 1x1 matrix, so it's O(1) in memory
                H[i,j] = tmp[0,0]
                w = w - H[i,j]*V[:,i]
            H[j+1,j] = norm(w)
            if H[j+1,j] == 0:
                print("Arnoldi breakdown")
                m = j
                v = 0
                break
            else:
                if j < m-1:
                    v = w/H[j+1,j]
                    V[:,j+1] = v
        print(j, " iterations completed")
        print("V = ", V.shape)
        print("H = ", H.shape)
        print("beta = ", beta)
        return V, H, v, beta, j

    def algo4():
        # TO DO
        pass

class runners:
    def ShiftedPowerMethod(tau):
        dataset = Utilities.load_data()
        max_mv = 100
        G, n = Utilities.create_graph(dataset)
        P = Utilities.create_matrix(G)
        d = Utilities.dangling_nodes(P,n)
        v = Utilities.probability_vector(n)
        Pt = Utilities.transition_matrix(P, v, d)
        a = Utilities.alpha()
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)
        print("total time = ", total_time)

# pandas dataframe to store the results
df = pd.DataFrame(columns=['alpha', 'products m-v', 'tau', 'time'])

# Main
if __name__ == "__main__":
    dataset = Utilities.load_data()
    # maximum number of iterations, asked to the user
    max_mv = int(input("\nInsert the maximum number of matrix-vector operations: "))
    G, n = Utilities.create_graph(dataset)
    P = Utilities.create_matrix(G)
    d = Utilities.dangling_nodes(P,n)
    v = Utilities.probability_vector(n)
    Pt = Utilities.transition_matrix(P, v, d)
    a = Utilities.alpha()
    # run the algorithm for different values of tau from 10^-5 to 10^-9 with step 10^-1
    for i in range(5,10):
        tau = 10**(-i)
        print("\ntau = ", tau)
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)
        # store the results in the dataframe
        df = df.append({'alpha': a, 'products m-v': mv, 'tau': tau, 'time': total_time}, ignore_index=True)
    # save the results in a csv file
    df.to_csv('../data/results/algo1/different_tau.csv', index=False)
    # plot the results
    Plotting.tau_over_iterations(df)
    Plotting.tau_over_time(df)
    # print in the terminal the columns of the dataframe iterations, tau and time
    print("Computations done. Here are the results:")
    print("\n", df[['products m-v', 'tau', 'time']])

565031
tmp/testing.ipynb vendored

File diff suppressed because it is too large