removed old notebooks and useless scripts

main
Luca Lombardo 2 years ago
parent 2df71072a6
commit 3e08fee18d

@@ -1,353 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import networkx as nx\n",
"import time\n",
"import math\n",
"import pandas as pd\n",
"import scipy as sp\n",
"import plotly.express as px\n",
"import plotly.graph_objs as go\n",
"from scipy.sparse import *\n",
"from scipy import linalg\n",
"from scipy.sparse.linalg import norm\n",
"from scipy.optimize import least_squares"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ALGORITHM 1 TESTING\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's create two graphs from the list of edges downloaded from the Snap database. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G1 = nx.read_edgelist('../data/web-Stanford.txt', create_using=nx.DiGraph(), nodetype=int)\n",
"\n",
"G2 = nx.read_edgelist('../data/web-BerkStan.txt', create_using=nx.DiGraph(), nodetype=int)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Creating the transition probability matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# square matrix of size n x n, where n is the number of nodes in the graph. The matrix is filled with zeros and the (i,j) element is x if the node i is connected to the node j. Where x is 1/(number of nodes connected to i).\n",
"\n",
"def create_matrix(G):\n",
" n = G.number_of_nodes()\n",
" P = sp.sparse.lil_matrix((n,n))\n",
" for i in G.nodes():\n",
" for j in G[i]: #G[i] is the list of nodes connected to i, it's neighbors\n",
" P[i-1,j-1] = 1/len(G[i])\n",
" return P"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To ensure that the random process has a unique stationary distribution and it will not stagnate, the transition matrix P is usually modified to be an irreducible stochastic matrix A (called the Google matrix) as follows\n",
"\n",
"$$ A = \\alpha \\tilde{P} + (1-\\alpha)v e^T$$\n",
"\n",
"Where $\\tilde{P}$ is defined as \n",
"\n",
"$$ \\tilde{P} = P + v d^T$$\n",
"\n",
"Where $d \\in \\mathbb{N}^{n \\times 1}$ s a binary vector tracing the indices of dangling web-pages with no hyperlinks, i.e., $d(i ) = 1$ if the `ith` page has no hyperlink, $v \\in \\mathbb{R}^{n \\times 1}$ is a probability vector, $e = [1, 1, . . . , 1]^T$ , and $0 < \\alpha < 1$ is the so-called damping factor that represents the probability in the model that the surfer transfer by clicking a hyperlink rather than other ways"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n = G1.number_of_nodes()\n",
"P = create_matrix(G1) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"the vector `d` solves the dangling nodes problem"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# define d as a nx1 sparse matrix, where n is the number of nodes in the graph. The vector is filled with d(i) = 1 if the i row of the matrix P is filled with zeros, other wise is 0\n",
"\n",
"# d is the vector of dangling nodes\n",
"d = sp.sparse.lil_matrix((n,1))\n",
"for i in range(n):\n",
" if P[i].sum() == 0:\n",
" d[i] = 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The vector v is a probability vector, the sum of its elements bust be one"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# define v as the probability vector of size n x 1, where n is the number of nodes in the graph. The vector is filled with 1/n\n",
"\n",
"v = sp.sparse.lil_matrix((n,1))\n",
"for i in range(n):\n",
" v[i] = 1/n \n",
"\n",
"# maybe I could have done it with a lambda more concisely? idk"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can compute the transition matrix\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Pt = P + v.dot(d.T)\n",
"\n",
"# Pt is a sparse matrix too"
]
},
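{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check (an illustration, not from the paper): every column of $\\tilde{P}$ should now sum to one, since each dangling column has been replaced by $v$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"col_sums = Pt.sum(axis=0)\n",
"print(\"column sums range from\", col_sums.min(), \"to\", col_sums.max())  # both should be ~1.0"
]
},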
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# e is a nx1 sparse matrix filled with ones\n",
"e = sp.sparse.lil_matrix((1,n))\n",
"for i in range(n):\n",
" e[0,i] = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # v*eT is a nxn sparse matrix filled all with 1/n, let's call it B\n",
"\n",
"# B = sp.sparse.lil_matrix((n,n))\n",
"# for i in range(n):\n",
"# for j in range(n):\n",
"# B[i,j] = 1/n\n",
"\n",
"# A = alpha*Pt + (1-alpha)*B"
]
},
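{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal matrix-free sketch (an illustration, not taken from the paper): since $e^T x = \\sum_i x_i$, the product $Ax$ can be formed without ever materializing $v e^T$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def google_matvec(Pt, v, alpha, x):\n",
"    # y = A @ x with A = alpha*Pt + (1-alpha)*v*e^T, using e^T @ x = x.sum()\n",
"    return alpha*(Pt @ x) + (1-alpha)*v*x.sum()"
]
},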
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Algorithm 1 Shifted-Power method for PageRank with multiple damping factors:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pandas dataframe to store the results\n",
"df = pd.DataFrame(columns=['alpha', 'iterations', 'tau', 'time'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def Algorithm1(Pt, v, tau, max_mv, a: list):\n",
" \n",
" start_time = time.time()\n",
"\n",
" u = Pt.dot(v) - v \n",
" mv = 1 # number of matrix vector products\n",
" r = sp.sparse.lil_matrix((n,1)) \n",
" Res = sp.sparse.lil_matrix((len(a),1))\n",
" x = sp.sparse.lil_matrix((n,1)) \n",
"\n",
" for i in range(len(a)):\n",
" r = a[i]*(u) \n",
" normed_r = norm(r)\n",
" Res[i] = normed_r \n",
"\n",
" if Res[i] > tau:\n",
" x = r + v \n",
"\n",
" while max(Res) > tau and mv < max_mv:\n",
" u = Pt*u \n",
" mv += 1 \n",
"\n",
" for i in range(len(a)):\n",
" if Res[i] >= tau: \n",
" r = (a[i]**(mv+1))*(u)\n",
" Res[i] = norm(r)\n",
"\n",
" if Res[i] > tau:\n",
" x = r + x\n",
"\n",
" if mv == max_mv:\n",
" print(\"The algorithm didn't converge in \", max_mv, \" iterations\")\n",
" else:\n",
" print(\"The algorithm converged in \", mv, \" iterations\")\n",
"\n",
" total_time = time.time() - start_time\n",
" print(\"The algorithm took \", total_time, \" seconds\")\n",
" \n",
" return mv, x, r, total_time "
]
},
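{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (an illustration, not part of the paper's experiments): the solution for a single damping factor should agree with networkx's own PageRank up to roughly the stopping tolerances, assuming, as `create_matrix` already does, that the node labels run from 1 to n."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative sanity check (assumes node labels 1..n)\n",
"mv_chk, x_chk, r_chk, t_chk = Algorithm1(Pt, v, 1e-8, 1000, [0.85])\n",
"pr = nx.pagerank(G1, alpha=0.85, tol=1e-10, max_iter=200)\n",
"x_nx = np.array([pr[i+1] for i in range(n)])\n",
"print(\"max difference:\", np.abs(x_chk[0].toarray().ravel() - x_nx).max())"
]
},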
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Testing setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# list of alpha values, from 0.85 to 0.99 with step 0.01\n",
"a = []\n",
"for i in range(85,100):\n",
" a.append(i/100)\n",
"\n",
"max_mv = 1000\n",
"\n",
"# run the algorithm for different values of tau from 10^-5 to 10^-9 with step 10^-1\n",
"for i in range(5,10):\n",
" tau = 10**(-i)\n",
" print(\"\\ntau = \", tau)\n",
" mv, x, r, total_time = Algorithm1(Pt, v, tau, max_mv, a)\n",
" df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save the results in a csv file\n",
"df.to_csv('../data/results/algo1/different_tau.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Plotting the results of the algorithm for different values of tau, and fixed alpha"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x = df['tau'][::-1].tolist()\n",
"y = df['products m-v'].tolist()\n",
"\n",
"fig = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'),\n",
" layout=go.Layout(title='products needed for the convergence', xaxis_title='tau', yaxis_title='products matrix vector'))\n",
"\n",
"# save the figure as a html file\n",
"fig.write_html(\"../data/results/algo1/taus_over_prods.html\")\n",
"print(\"The plot has been saved in the folder data/results/algo1\")\n",
"\n",
"##### RESULTS OVER TIME #####\n",
"\n",
"x1 = df['tau'][::-1].tolist()\n",
"y1 = df['time'].tolist()\n",
"\n",
"fig = go.Figure(data=go.Scatter(x=x1, y=y1, mode='lines+markers'),\n",
" layout=go.Layout(title='Time needed for the convergence', xaxis_title='tau', yaxis_title='time (seconds)'))\n",
"\n",
"# save the plot in a html file\n",
"fig.write_html(\"../data/results/algo1/taus_over_time.html\")\n",
"print(\"The plot has been saved in the folder data/results/algo1\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -1,184 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import networkx as nx\n",
"import time\n",
"import math\n",
"import pandas as pd\n",
"import scipy as sp\n",
"import plotly.express as px\n",
"import plotly.graph_objs as go\n",
"from scipy.sparse import *\n",
"from scipy import linalg\n",
"from scipy.sparse.linalg import norm\n",
"from scipy.optimize import least_squares"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Algorithm 2 testing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# def Arnoldi(A, v, m): \n",
"# beta = norm(v)\n",
"# v = v/beta # dimension of v is n x 1\n",
"# H = sp.sparse.lil_matrix((m,m)) # dimension of H is m x m \n",
"# V = sp.sparse.lil_matrix((A.shape[0],m))\n",
"# V[:,0] = v # each column of V is a vector v\n",
"\n",
"# for j in range(m):\n",
"# print(\"j = \", j)\n",
"# w = A @ v \n",
"# for i in range(j):\n",
"# tmp = v.T @ w \n",
"# H[i,j] = tmp[0,0]\n",
"# w = w - H[i,j]*v \n",
" \n",
"# H[j,j-1] = norm(w) \n",
"\n",
"# if H[j,j-1] == 0: \n",
"# print(\"Arnoldi breakdown\")\n",
"# m = j\n",
"# v = 0\n",
"# break\n",
"# else:\n",
"# if j < m-1:\n",
"# v = w/H[j,j-1]\n",
"# # for i in range(A.shape[0]):\n",
"# # V[i,j+1] = v[i,0]\n",
"# V[:,j+1] = v\n",
"\n",
"# print(j, \" iterations completed\")\n",
"# print(\"V = \", V.shape)\n",
"# print(\"H = \", H.shape)\n",
"# print(\"v = \", v.shape)\n",
"# print(\"beta = \", beta)\n",
"\n",
"# return V, H, v, beta, j"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Defined as Algorithm 2 in the paper. It's needed since it's called by Algorithm 4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def Arnoldi(A,v0,m):\n",
" v = v0\n",
" beta = norm(v)\n",
" v = v/beta\n",
" H = sp.sparse.lil_matrix((m+1,m)) \n",
" V = sp.sparse.lil_matrix((A.shape[0],m+1))\n",
" V[:,0] = v # each column of V is a vector v\n",
"\n",
" for j in range(m):\n",
" # print(\"j = \", j)\n",
" w = A @ v \n",
" for i in range(j):\n",
" tmp = v.T @ w # tmp is a 1x1 matrix, so it's O(1) in memory\n",
" H[i,j] = tmp[0,0] \n",
" w = w - H[i,j]*v \n",
" \n",
" H[j+1,j] = norm(w)\n",
"\n",
" if H[j+1,j] == 0:\n",
" print(\"Arnoldi breakdown\")\n",
" m = j\n",
" v = 0\n",
" break\n",
" else:\n",
" if j < m-1:\n",
" v = w/H[j+1,j]\n",
" V[:,j+1] = v\n",
"\n",
" print(j, \" iterations completed\")\n",
" print(\"V = \", V.shape)\n",
" print(\"H = \", H.shape)\n",
" print(\"v = \", v.shape)\n",
" print(\"beta = \", beta)\n",
"\n",
" return V, H, v, beta, j \n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Creating a small test case"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"m = 100\n",
"n = 110\n",
"A = sp.sparse.rand(n,n, density=0.1, format='lil')\n",
"# generate a probability vector, with all the entries as 1/n\n",
"v = sp.sparse.lil_matrix((n,1))\n",
"for i in range(n):\n",
" v[i] = 1/n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Arnoldi(A,v,m)"
]
}
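{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (an illustration, not from the paper): barring breakdown, the Arnoldi basis is orthonormal, so $V_m^T V_m \\approx I$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check the orthonormality of the Krylov basis computed above\n",
"Gram = (V[:, :m].T @ V[:, :m]).toarray()\n",
"print(\"max deviation from identity:\", np.abs(Gram - np.eye(m)).max())"
]
}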
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -1,291 +0,0 @@
#!/usr/bin/env python3
# Importing the libraries
import os
import wget
import gzip
import time
import warnings
import scipy as sp
import numpy as np
import pandas as pd
import networkx as nx
from os.path import exists
from scipy.sparse import *
from scipy.sparse.linalg import norm
import plotly.graph_objs as go
warnings.simplefilter(action='ignore', category=FutureWarning)  # silence pandas FutureWarnings
class Utilities:
    # Importing the dataset
    def load_data():
        names = {1: 'web-Stanford', 2: 'web-BerkStan'}
        choice = int(input("Choose the dataset:\n [1] web-Stanford (use this one for now)\n [2] web-BerkStan: \nEnter an option: "))
        name = names[choice]
        dataset = '../data/' + name + '.txt'
        if not exists(dataset):
            print("\nThe file doesn't exist, downloading it from https://snap.stanford.edu/data/" + name + ".html")
            # if there is no folder data, create it
            if not exists('../data'):
                os.makedirs('../data')
            # Downloading the dataset
            url = 'https://snap.stanford.edu/data/' + name + '.txt.gz'
            wget.download(url, dataset + '.gz')
            # Unzipping the dataset
            with gzip.open(dataset + '.gz', 'rb') as f_in:
                with open(dataset, 'wb') as f_out:
                    f_out.write(f_in.read())
            # delete the zipped file
            os.remove(dataset + '.gz')
            print("\nDataset downloaded\n")
        return dataset
    # Creating the graph from the dataset
    def create_graph(dataset):
        print("\nCreating the graph...")
        G = nx.read_edgelist(dataset, create_using=nx.DiGraph(), nodetype=int)
        n = G.number_of_nodes()
        print("Graph created based on the dataset\n")
        return G, n

    # P is the transition probability matrix, built column-stochastic: entry (j,i) is
    # 1/outdeg(i) if there is an edge from i to j, and 0 otherwise, so that P @ x
    # propagates a probability vector x
    def create_matrix(G):
        print("Creating the transition probability matrix...")
        n = G.number_of_nodes()  # don't rely on a global n
        P = sp.sparse.lil_matrix((n,n))
        for i in G.nodes():
            for j in G[i]:  # G[i] iterates over the out-neighbors of i
                P[j-1,i-1] = 1/len(G[i])
        print("Transition probability matrix created\n")
        return P
    # d is the dangling-node indicator: d(i) = 1 if node i has no out-links,
    # i.e. if column i of the (column-stochastic) matrix P is zero
    def dangling_nodes(P,n):
        print("Creating the list of dangling nodes...")
        d = sp.sparse.lil_matrix((n,1))
        col_sums = P.sum(axis=0)
        for i in range(n):
            if col_sums[0,i] == 0:
                d[i] = 1
        print("List of dangling nodes created\n")
        return d

    # For now the probability vector equally distributes the probability over all nodes
    def probability_vector(n):
        print("Creating the probability vector...")
        v = sp.sparse.lil_matrix(np.full((n,1), 1/n))
        print("Probability vector created\n")
        return v

    def transition_matrix(P, v, d):
        print("Creating the transition matrix...")
        Pt = P + v @ (d.T)
        print("Transition matrix created\n")
        return Pt
    # damping factors from 0.85 to 0.99: alpha is the probability that the surfer
    # follows a hyperlink rather than jumping to a random page
    def alpha():
        return [i/100 for i in range(85, 100)]
# Class for plotting the results obtained. For now it only covers the first algorithm;
# to be updated once all the algorithms are implemented and the test cases are well defined.
class Plotting:
    def tau_over_iterations(df):
        # reverse both columns so the (x, y) pairs stay aligned
        x = df['tau'][::-1].tolist()
        y = df['products m-v'][::-1].tolist()
        fig = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'),
                        layout=go.Layout(title='products needed for the convergence', xaxis_title='tau', yaxis_title='products matrix vector'))
        # save the figure as an html file
        fig.write_html("../data/results/algo1/taus_over_prods.html")
        print("The plot has been saved in the folder data/results/algo1")

    def tau_over_time(df):
        x1 = df['tau'][::-1].tolist()
        y1 = df['time'][::-1].tolist()
        fig = go.Figure(data=go.Scatter(x=x1, y=y1, mode='lines+markers'),
                        layout=go.Layout(title='Time needed for the convergence', xaxis_title='tau', yaxis_title='time (seconds)'))
        # save the plot in an html file
        fig.write_html("../data/results/algo1/taus_over_time.html")
        print("The plot has been saved in the folder data/results/algo1")
class Algorithms:
    # Power method adapted to the PageRank problem with different damping factors.
    # Referred to as Algorithm 1 in the paper.
    def algo1(Pt, v, tau, max_mv, a: list):
        start_time = time.time()
        print("STARTING ALGORITHM 1...")
        n = Pt.shape[0]  # don't rely on a global n
        u = Pt @ v - v
        mv = 1  # number of matrix-vector multiplications
        Res = np.zeros(len(a))  # residual norm for each damping factor
        x = [sp.sparse.lil_matrix((n,1)) for _ in range(len(a))]  # one solution per damping factor
        for i in range(len(a)):
            r = a[i]*u
            Res[i] = norm(r)
            if Res[i] > tau:
                x[i] = r + v
        while Res.max() > tau and mv < max_mv:
            u = Pt @ u
            mv += 1
            for i in range(len(a)):
                if Res[i] >= tau:
                    r = (a[i]**mv)*u  # residual for a[i] after mv matrix-vector products
                    Res[i] = norm(r)
                    if Res[i] > tau:
                        x[i] = r + x[i]
        if mv == max_mv:
            print("The algorithm didn't converge in ", max_mv, " iterations")
        else:
            print("The algorithm converged with ", mv, " matrix-vector multiplications executed")
        total_time = round(time.time() - start_time, 2)
        print("The algorithm took ", total_time, " seconds to run\n")
        return mv, x, r, total_time
    # Refers to Algorithm 2 in the paper
    def Arnoldi(A, v0, m):
        v = v0
        beta = norm(v)
        v = v/beta
        H = sp.sparse.lil_matrix((m+1,m))
        V = sp.sparse.lil_matrix((A.shape[0],m+1))
        V[:,0] = v  # each column of V is a basis vector of the Krylov subspace
        for j in range(m):
            w = A @ v
            # modified Gram-Schmidt: orthogonalize w against all previous basis vectors
            for i in range(j+1):
                vi = V[:,i]
                tmp = vi.T @ w  # tmp is a 1x1 matrix, so it's O(1) in memory
                H[i,j] = tmp[0,0]
                w = w - H[i,j]*vi
            H[j+1,j] = norm(w)
            if H[j+1,j] == 0:
                print("Arnoldi breakdown")
                m = j
                v = 0
                break
            else:
                if j < m-1:
                    v = w/H[j+1,j]
                    V[:,j+1] = v
        print(j+1, " iterations completed")
        print("V = ", V.shape)
        print("H = ", H.shape)
        print("beta = ", beta)
        return V, H, v, beta, j

    def algo4():
        # TO DO
        pass
class runners:
    def ShiftedPowerMethod(tau):
        dataset = Utilities.load_data()
        max_mv = 100
        G, n = Utilities.create_graph(dataset)
        P = Utilities.create_matrix(G)
        d = Utilities.dangling_nodes(P,n)
        v = Utilities.probability_vector(n)
        Pt = Utilities.transition_matrix(P, v, d)
        a = Utilities.alpha()
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)
        print("total time = ", total_time)

# pandas dataframe to store the results
df = pd.DataFrame(columns=['alpha', 'products m-v', 'tau', 'time'])
# Main
if __name__ == "__main__":
    dataset = Utilities.load_data()
    # maximum number of matrix-vector products, asked of the user
    max_mv = int(input("\nInsert the maximum number of matrix-vector operations: "))
    G, n = Utilities.create_graph(dataset)
    P = Utilities.create_matrix(G)
    d = Utilities.dangling_nodes(P,n)
    v = Utilities.probability_vector(n)
    Pt = Utilities.transition_matrix(P, v, d)
    a = Utilities.alpha()
    # run the algorithm for tau = 1e-5 down to 1e-9 (one decade per step)
    for i in range(5,10):
        tau = 10**(-i)
        print("\ntau = ", tau)
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)
        # store the results in the dataframe (DataFrame.append is deprecated, use concat)
        row = {'alpha': a, 'products m-v': mv, 'tau': tau, 'time': total_time}
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    # save the results in a csv file
    df.to_csv('../data/results/algo1/different_tau.csv', index=False)
    # plot the results
    Plotting.tau_over_iterations(df)
    Plotting.tau_over_time(df)
    # print the result columns in the terminal
    print("Computations done. Here are the results:")
    print("\n", df[['products m-v', 'tau', 'time']])