diff --git a/src/algo1_testing.ipynb b/src/algo1_testing.ipynb index 8ea4ae2..0e6d9b5 100644 --- a/src/algo1_testing.ipynb +++ b/src/algo1_testing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 35, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -20,6 +20,13 @@ "from scipy.optimize import least_squares" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ALGORITHM 1 TESTING\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -35,14 +42,14 @@ "source": [ "G1 = nx.read_edgelist('../data/web-Stanford.txt', create_using=nx.DiGraph(), nodetype=int)\n", "\n", - "# G2 = nx.read_edgelist('../data/web-BerkStan.txt', create_using=nx.DiGraph(), nodetype=int)" + "G2 = nx.read_edgelist('../data/web-BerkStan.txt', create_using=nx.DiGraph(), nodetype=int)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Creating the transition probability matrix" + "Creating the transition probability matrix" ] }, { @@ -180,7 +187,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Algorithm 1 Shifted-Power method for PageRank with multiple damping factors:" + "Algorithm 1 Shifted-Power method for PageRank with multiple damping factors:" ] }, { @@ -199,8 +206,6 @@ "metadata": {}, "outputs": [], "source": [ - "# this should return mv (the number of iteration needed for the convergence), and two vector called x and r. 
Where x is the vector of the pagerank and r is the residual vector\n", - "\n", "def Algorithm1(Pt, v, tau, max_mv, a: list):\n", " \n", " start_time = time.time()\n", @@ -220,7 +225,7 @@ " x = r + v \n", "\n", " while max(Res) > tau and mv < max_mv:\n", - " u = Pt*u # should it be the same u of the beginning?\n", + " u = Pt*u \n", " mv += 1 \n", "\n", " for i in range(len(a)):\n", @@ -242,6 +247,13 @@ " return mv, x, r, total_time " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Testing setup" + ] + }, { "cell_type": "code", "execution_count": null, @@ -260,8 +272,7 @@ " tau = 10**(-i)\n", " print(\"\\ntau = \", tau)\n", " mv, x, r, total_time = Algorithm1(Pt, v, tau, max_mv, a)\n", - " df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True) \n", - "\n" + " df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True) " ] }, { @@ -278,7 +289,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Plotting the results of the algorithm for different values of tau, and fixed alpha" + "Plotting the results of the algorithm for different values of tau, and fixed alpha" ] }, { @@ -288,170 +299,26 @@ "outputs": [], "source": [ "x = df['tau'][::-1].tolist()\n", - "y = df['iterations'].tolist()\n", + "y = df['products m-v'].tolist()\n", "\n", - "fig1 = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'), \n", - " layout=go.Layout(title='Iterations needed for the convergence', xaxis_title='tau', yaxis_title='iterations'))\n", - " \n", - "# save the plot in a html file\n", - "fig1.write_html(\"../data/results/algo1/taus_over_iterations.html\")\n", + "fig = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'),\n", + " layout=go.Layout(title='products needed for the convergence', xaxis_title='tau', yaxis_title='products matrix vector'))\n", + "\n", + "# save the figure as a html file\n", + "fig.write_html(\"../data/results/algo1/taus_over_prods.html\")\n", + 
"print(\"The plot has been saved in the folder data/results/algo1\")\n", "\n", "##### RESULTS OVER TIME #####\n", "\n", "x1 = df['tau'][::-1].tolist()\n", "y1 = df['time'].tolist()\n", "\n", - "fig2 = go.Figure(data=go.Scatter(x=x1, y=y1, mode='lines+markers'),\n", + "fig = go.Figure(data=go.Scatter(x=x1, y=y1, mode='lines+markers'),\n", " layout=go.Layout(title='Time needed for the convergence', xaxis_title='tau', yaxis_title='time (seconds)'))\n", "\n", "# save the plot in a html file\n", - "fig2.write_html(\"../data/results/algo1/taus_over_time.html\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To view the graph just use the command\n", - "\n", - "```bash\n", - "firefox taus_over_iterations.html \n", - "```\n", - "or \n", - "\n", - "```bash\n", - "firefox taus_over_time.html\n", - "```\n", - "\n", - "_In the right folder_" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def Arnoldi(A, v, m): # defined ad algorithm 2 in the paper\n", - " beta = norm(v)\n", - " print(\"A\")\n", - " v = v/beta\n", - " print(\"B\")\n", - " h = sp.sparse.lil_matrix((m,m))\n", - " print(\"C\")\n", - "\n", - " for j in range(m):\n", - " w = A.dot(v)\n", - " print(\"D\")\n", - " for i in range(j):\n", - " h[i,j] = v.T.dot(w)\n", - " print(\"E\")\n", - " w = w - h[i,j]*v[i]\n", - " print(\"F\")\n", - "\n", - " h[j+1,j] = norm(w)\n", - " print(\"G\")\n", - "\n", - " if h[j+1,j] == 0:\n", - " print(\"The algorithm didn't converge\")\n", - " m = j\n", - " v[m+1] = 0\n", - " break\n", - " else:\n", - " print(\"H\")\n", - " v[j+1] = w**h[j+1,j]\n", - " print(\"I\")\n", - "\n", - " return v, h, m, beta, j" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "A = sp.sparse.rand(100,100, density=0.5, format='lil')\n", - "v = sp.sparse.rand(100,1, density=1, format='lil')\n", - "m = 100" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "v, h, m, beta, j = Arnoldi(A, v, m)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def Algo4(Pt, v, m, a: list, tau, maxit: int, x):\n", - " \n", - " iter = 1\n", - " mv = 0\n", - " e1 = sp.sparse.lil_matrix((1,n))\n", - " e1[0,0] = 1\n", - " x = sp.sparse.lil_matrix((len(a),1))\n", - " I = sp.sparse.eye(n, n, format='lil')\n", - " res = sp.sparse.lil_matrix((len(a),1))\n", - " r = sp.sparse.lil_matrix((n,1))\n", - " y = sp.sparse.lil_matrix((n,1))\n", - "\n", - " for i in range(len(a)):\n", - " r = ((1-a[i])**a[i])*v - ((1**a[i])*I - Pt).dot(x)\n", - " res[i] = a[i]*norm(r)\n", - "\n", - " def Find_k(res, maxit):\n", - " k = 0\n", - " for i in range(len(a)):\n", - " if res[i] == max(res):\n", - " k = i\n", - " break\n", - " return k\n", - "\n", - " def Find_gamma(res, a, k):\n", - " gamma = sp.sparse.lil_matrix((len(a),1))\n", - " for i in range(len(a)):\n", - " if i != k:\n", - " gamma[i] = (res[i]*a[k])/(res[k]*a[i])\n", - " else:\n", - " gamma[i] = 0\n", - " return gamma\n", - "\n", - "\n", - " while max(res) > tau and iter < maxit:\n", - " k = Find_k(res, maxit)\n", - " gamma = Find_gamma(res, a, k)\n", - " v, h, m, beta, j = Arnoldi((1**a[k])*I - Pt, r, m)\n", - " Hbar = sp.sparse.lil_matrix((m+1,m))\n", - " Hbar[0:m,0:m] = h\n", - " Hbar[m+1,0:m] = e1\n", - "\n", - " mv += j\n", - "\n", - " # solve the least squares problem for Hbar*x = beta*e1\n", - " y = sp.sparse.linalg.least_squares(Hbar, beta*e1)\n", - " res[k] = a[k]*norm(beta*e1 - Hbar*y)\n", - " x[k] = x[k] + v*y[k]\n", - "\n", - " for i in range(len(a)):\n", - " if i != k:\n", - " if res[i] >= tau:\n", - " Hbar[i] = Hbar[k] + ((1-a[i])/a[i] - (1-a[k])/a[k])*I\n", - " z = beta*e1 - Hbar*y\n", - " y = sp.sparse.linalg.solve(Hbar, gamma*beta*e1)\n", - " x = x + v*y\n", - " res[i] = a[i]**a[k]*gamma[i]*res[k]\n", - " \n", - " iter += 1\n", - " \n", - " return 
x, res, mv\n" + "fig.write_html(\"../data/results/algo1/taus_over_time.html\")\n", + "print(\"The plot has been saved in the folder data/results/algo1\")\n" ] } ], diff --git a/src/algo2_testing.ipynb b/src/algo2_testing.ipynb new file mode 100644 index 0000000..bd4b836 --- /dev/null +++ b/src/algo2_testing.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import networkx as nx\n", + "import time\n", + "import math\n", + "import pandas as pd\n", + "import scipy as sp\n", + "import plotly.express as px\n", + "import plotly.graph_objs as go\n", + "from scipy.sparse import *\n", + "from scipy import linalg\n", + "from scipy.sparse.linalg import norm\n", + "from scipy.optimize import least_squares" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Algorithm 2 testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def Arnoldi(A, v, m): # defined as Algorithm 2 in the paper\n", + " beta = norm(v)\n", + " print(\"A\")\n", + " v = v/beta\n", + " print(\"B\")\n", + " h = sp.sparse.lil_matrix((m,m))\n", + " print(\"C\")\n", + "\n", + " for j in range(m):\n", + " w = A.dot(v)\n", + " print(\"D\")\n", + " for i in range(j):\n", + " h[i,j] = v.T.dot(w)\n", + " print(\"E\")\n", + " w = w - h[i,j]*v[i]\n", + " print(\"F\")\n", + "\n", + " h[j+1,j] = norm(w)\n", + " print(\"G\")\n", + "\n", + " if h[j+1,j] == 0:\n", + " print(\"The algorithm didn't converge\")\n", + " m = j\n", + " v[m+1] = 0\n", + " break\n", + " else:\n", + " print(\"H\")\n", + " v[j+1] = w**h[j+1,j] # THIS IS WRONG: on a scipy sparse matrix ** is a matrix power, which is what raises the
ERROR \" matrix is not square\"; the Arnoldi normalization is a division instead: w / h[j+1,j]\n", + " print(\"I\")\n", + "\n", + " return v, h, m, beta, j" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Creating a small test case" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "A = sp.sparse.rand(100,100, density=0.5, format='lil')\n", + "v = sp.sparse.rand(100,1, density=0.5, format='lil')\n", + "m = 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "v, h, m, beta, j = Arnoldi(A, v, m)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/algo4_testing.ipynb b/src/algo4_testing.ipynb new file mode 100644 index 0000000..637cc0e --- /dev/null +++ b/src/algo4_testing.ipynb @@ -0,0 +1,174 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import networkx as nx\n", + "import time\n", + "import math\n", + "import pandas as pd\n", + "import scipy as sp\n", + "import plotly.express as px\n", + "import plotly.graph_objs as go\n", + "from scipy.sparse import *\n", + "from scipy import linalg\n", + "from scipy.sparse.linalg import norm\n", + "from scipy.optimize import least_squares" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function is needed in the algorithm.
Note that this is a NON-functioning version, for now it's just a placeholder. Once algo2_testing is completed, this will be updated and I'll work on algo4_testing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def Arnoldi(A, v, m): # defined as Algorithm 2 in the paper\n", + " beta = norm(v)\n", + " print(\"A\")\n", + " v = v/beta\n", + " print(\"B\")\n", + " h = sp.sparse.lil_matrix((m,m))\n", + " print(\"C\")\n", + "\n", + " for j in range(m):\n", + " w = A.dot(v)\n", + " print(\"D\")\n", + " for i in range(j):\n", + " h[i,j] = v.T.dot(w)\n", + " print(\"E\")\n", + " w = w - h[i,j]*v[i]\n", + " print(\"F\")\n", + "\n", + " h[j+1,j] = norm(w)\n", + " print(\"G\")\n", + "\n", + " if h[j+1,j] == 0:\n", + " print(\"The algorithm didn't converge\")\n", + " m = j\n", + " v[m+1] = 0\n", + " break\n", + " else:\n", + " print(\"H\")\n", + " v[j+1] = w**h[j+1,j] # THIS IS WRONG: on a scipy sparse matrix ** is a matrix power, which is what raises the ERROR \" matrix is not square\"; the Arnoldi normalization is a division instead: w / h[j+1,j]\n", + " print(\"I\")\n", + "\n", + " return v, h, m, beta, j" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Algorithm 4 testing\n", + "\n", + "Still a complete mess. Conceptually and technically wrong. I'll work on it once algo2_testing is completed." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def Algo4(Pt, v, m, a: list, tau, maxit: int, x):\n", + " \n", + " iter = 1\n", + " mv = 0\n", + " e1 = sp.sparse.lil_matrix((1,n))\n", + " e1[0,0] = 1\n", + " x = sp.sparse.lil_matrix((len(a),1))\n", + " I = sp.sparse.eye(n, n, format='lil')\n", + " res = sp.sparse.lil_matrix((len(a),1))\n", + " r = sp.sparse.lil_matrix((n,1))\n", + " y = sp.sparse.lil_matrix((n,1))\n", + "\n", + " for i in range(len(a)): # I don't think that this is what was intended in the pseudocode...
\n", + " r = ((1-a[i])**a[i])*v - ((1**a[i])*I - Pt).dot(x)\n", + " res[i] = a[i]*norm(r)\n", + "\n", + " def Find_k(res, maxit):\n", + " k = 0\n", + " for i in range(len(a)):\n", + " if res[i] == max(res):\n", + " k = i\n", + " break\n", + " return k\n", + "\n", + " def Find_gamma(res, a, k):\n", + " gamma = sp.sparse.lil_matrix((len(a),1))\n", + " for i in range(len(a)):\n", + " if i != k:\n", + " gamma[i] = (res[i]*a[k])/(res[k]*a[i])\n", + " else:\n", + " gamma[i] = 0\n", + " return gamma\n", + "\n", + "\n", + " while max(res) > tau and iter < maxit:\n", + " k = Find_k(res, maxit)\n", + " gamma = Find_gamma(res, a, k)\n", + " v, h, m, beta, j = Arnoldi((1**a[k])*I - Pt, r, m)\n", + " Hbar = sp.sparse.lil_matrix((m+1,m))\n", + " Hbar[0:m,0:m] = h\n", + " Hbar[m+1,0:m] = e1\n", + "\n", + " mv += j\n", + "\n", + " y = sp.sparse.linalg.least_squares(Hbar, beta*e1)\n", + " res[k] = a[k]*norm(beta*e1 - Hbar*y)\n", + " x[k] = x[k] + v*y[k]\n", + "\n", + " for i in range(len(a)):\n", + " if i != k:\n", + " if res[i] >= tau:\n", + " Hbar[i] = Hbar[k] + ((1-a[i])/a[i] - (1-a[k])/a[k])*I\n", + " z = beta*e1 - Hbar*y\n", + " y = sp.sparse.linalg.solve(Hbar, gamma*beta*e1)\n", + " x = x + v*y\n", + " res[i] = a[i]**a[k]*gamma[i]*res[k]\n", + " \n", + " iter += 1\n", + " \n", + " return x, res, mv\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/main.py b/src/main.py index 09ea0d0..bfdc970 100755 --- a/src/main.py +++ b/src/main.py 
@@ -22,7 +22,7 @@ class Utilities: # Importing the dataset def load_data(): # Loading the dataset - dataset = int(input("Choose the dataset:\n [1] web-Stanford\n [2] web-BerkStan: \nEnter an option: ")) + dataset = int(input("Choose the dataset:\n [1] web-Stanford (use this one for now)\n [2] web-BerkStan: \nEnter an option: ")) if dataset == 1: if exists('../data/web-Stanford.txt'): @@ -82,7 +82,6 @@ class Utilities: print("Graph created based on the dataset\n") return G, n - # # Creating the transition probability matrix # The matrix is filled with zeros and the (i,j) element is x if the node i is connected to the node j. Where x is 1/(number of nodes connected to i). def create_matrix(G): print("Creating the transition probability matrix...") @@ -103,6 +102,7 @@ class Utilities: print("List of dangling nodes created\n") return d + # For now it is set to equally distribute the probability to all the nodes def probability_vector(n): print("Creating the probability vector...") v = sp.sparse.lil_matrix((n,1)) @@ -117,22 +117,24 @@ class Utilities: print("Transition matrix created\n") return Pt + # damping factor: it can vary from 0 to 1; the higher the value, the lower the probability to jump to a random page (the jump probability is 1 - alpha) def alpha(): a = [] for i in range(85,100): a.append(i/100) return a +# Class for plotting the results obtained. For now it can only plot the first algorithm.
To be updated once all the algorithms are implemented and the test cases are well defined class Plotting: def tau_over_iterations(df): x = df['tau'][::-1].tolist() - y = df['iterations'].tolist() + y = df['products m-v'].tolist() fig = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'), - layout=go.Layout(title='Iterations needed for the convergence', xaxis_title='tau', yaxis_title='iterations')) + layout=go.Layout(title='products needed for the convergence', xaxis_title='tau', yaxis_title='products matrix vector')) # save the figure as a html file - fig.write_html("../data/results/algo1/taus_over_iterations.html") + fig.write_html("../data/results/algo1/taus_over_prods.html") print("The plot has been saved in the folder data/results/algo1") def tau_over_time(df): @@ -147,7 +149,7 @@ class Plotting: print("The plot has been saved in the folder data/results/algo1") class Algorithms: - + # Power method adapted to the PageRank problem with different damping factors. Referred as Algorithm 1 in the paper def algo1(Pt, v, tau, max_mv, a: list): start_time = time.time() @@ -190,7 +192,8 @@ class Algorithms: return mv, x, r, total_time - def Arnoldi(A, v, m): # defined ad algorithm 2 in the paper + # Refers to Algorithm 2 in the paper, it's needed to implement the algorithm 4. It doesn't work yet. Refer to the file testing.ipynb for more details. 
The function below is just a placeholder for now + def Arnoldi(A, v, m): beta = norm(v) v = v/beta h = sp.sparse.lil_matrix((m,m)) @@ -212,7 +215,7 @@ class Algorithms: return v, h, m, beta, j # pandas dataframe to store the results -df = pd.DataFrame(columns=['alpha', 'iterations', 'tau', 'time']) +df = pd.DataFrame(columns=['alpha', 'products m-v', 'tau', 'time']) # Main if __name__ == "__main__": @@ -234,7 +237,7 @@ if __name__ == "__main__": mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a) # store the results in the dataframe - df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True) + df = df.append({'alpha': a, 'products m-v': mv, 'tau': tau, 'time': total_time}, ignore_index=True) # save the results in a csv file df.to_csv('../data/results/algo1/different_tau.csv', index=False) @@ -245,4 +248,4 @@ if __name__ == "__main__": # print in the terminal the columns of the dataframe iterations, tau and time print("Computations done. Here are the results:") - print("\n", df[['iterations', 'tau', 'time']]) + print("\n", df[['products m-v', 'tau', 'time']])