diff --git a/src/algo1_testing.ipynb b/src/algo1_testing.ipynb index f5f94a9..3629dec 100644 --- a/src/algo1_testing.ipynb +++ b/src/algo1_testing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -148,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -183,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -193,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -242,116 +242,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "tau = 1e-05\n", - "The algorithm converged in 19 iterations\n", - "The algorithm took 
12.630054473876953 seconds\n", - "\n", - "tau = 1e-06\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1733107/908708813.py:13: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", - " df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The algorithm converged in 56 iterations\n", - "The algorithm took 32.28381681442261 seconds\n", - "\n", - "tau = 1e-07\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1733107/908708813.py:13: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", - " df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The algorithm converged in 113 iterations\n", - "The algorithm took 75.9386818408966 seconds\n", - "\n", - "tau = 1e-08\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1733107/908708813.py:13: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", - " df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The algorithm converged in 275 iterations\n", - "The algorithm took 107.68565726280212 seconds\n", - "\n", - "tau = 1e-09\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1733107/908708813.py:13: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. 
Use pandas.concat instead.\n", - " df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The algorithm converged in 454 iterations\n", - "The algorithm took 152.8943109512329 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1733107/908708813.py:13: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", - " df = df.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}, ignore_index=True)\n" - ] - }, - { - "ename": "OSError", - "evalue": "Cannot save file into a non-existent directory: '../data/algo1'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_1733107/908708813.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# save the results in a csv file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/algo1/different_tau.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mto_csv\u001b[0;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, 
escapechar, decimal, errors, storage_options)\u001b[0m\n\u001b[1;32m 3549\u001b[0m )\n\u001b[1;32m 3550\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3551\u001b[0;31m return DataFrameRenderer(formatter).to_csv(\n\u001b[0m\u001b[1;32m 3552\u001b[0m \u001b[0mpath_or_buf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3553\u001b[0m \u001b[0mline_terminator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mline_terminator\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/formats/format.py\u001b[0m in \u001b[0;36mto_csv\u001b[0;34m(self, path_or_buf, encoding, sep, columns, index_label, mode, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, errors, storage_options)\u001b[0m\n\u001b[1;32m 1178\u001b[0m \u001b[0mformatter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfmt\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1179\u001b[0m )\n\u001b[0;32m-> 1180\u001b[0;31m \u001b[0mcsv_formatter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1182\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcreated_buffer\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/formats/csvs.py\u001b[0m in \u001b[0;36msave\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 239\u001b[0m \"\"\"\n\u001b[1;32m 240\u001b[0m \u001b[0;31m# apply compression and byte/text conversion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 241\u001b[0;31m with get_handle(\n\u001b[0m\u001b[1;32m 242\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 692\u001b[0m \u001b[0;31m# Only for write methods\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 693\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"r\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mis_path\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 694\u001b[0;31m \u001b[0mcheck_parent_directory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 695\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 696\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcompression\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mcheck_parent_directory\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 566\u001b[0m \u001b[0mparent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 567\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
def Arnoldi(A, v, m):  # defined as Algorithm 2 in the paper
    """Run up to ``m`` steps of the Arnoldi process with modified Gram-Schmidt.

    Builds an orthonormal basis ``V`` of the Krylov subspace
    ``span{v, A v, ..., A^(m-1) v}`` and the upper Hessenberg matrix ``H``
    that satisfy ``A @ V[:, :m] == V @ H`` (up to rounding).

    Parameters
    ----------
    A : matrix-like of shape (n, n) exposing ``.dot`` (sparse or dense).
    v : starting vector with n entries; sparse ``(n, 1)`` or dense.
    m : requested number of Arnoldi steps.

    Returns
    -------
    V : dense ndarray of shape (n, m_requested + 1); column k is the
        (k+1)-th basis vector. Columns past a breakdown stay zero.
    h : ``lil_matrix`` of shape (m_requested + 1, m_requested).
    m : number of steps actually usable (reduced on breakdown).
    beta : Euclidean norm of the input ``v``.
    j : 1-based index of the last step that was executed.

    Raises
    ------
    ValueError
        If ``v`` is the zero vector (it cannot be normalised).
    """
    # Local imports keep the cell self-contained.
    import numpy as np
    from scipy.sparse import issparse, lil_matrix

    start = v.toarray() if issparse(v) else v
    start = np.asarray(start, dtype=float).ravel()

    beta = np.linalg.norm(start)
    if beta == 0:
        raise ValueError("v must be a non-zero vector")

    n = start.shape[0]
    # The basis vectors are dense in general, so they are stored densely.
    V = np.zeros((n, m + 1))
    V[:, 0] = start / beta
    # One extra row: step j writes the sub-diagonal entry h[j+1, j].
    h = lil_matrix((m + 1, m))

    j = 0
    for j in range(1, m + 1):
        w = np.asarray(A.dot(V[:, j - 1]), dtype=float).ravel()

        # Orthogonalise w against every basis vector built so far.
        for i in range(j):
            h_ij = V[:, i].dot(w)
            h[i, j - 1] = h_ij
            w = w - h_ij * V[:, i]

        h_next = np.linalg.norm(w)
        h[j, j - 1] = h_next

        if h_next == 0:
            # Breakdown: the Krylov subspace is invariant, so no further
            # direction exists and v_{m+1} is left as the zero vector.
            m = j
            break

        V[:, j] = w / h_next

    return V, h, m, beta, j
#!/usr/bin/env python3
"""PageRank experiments with several damping factors on SNAP web graphs.

Running the file as a script asks for a dataset and an iteration cap, builds
the transition matrix, runs ``Algorithms.algo1`` for tolerances 1e-5 ... 1e-9
and stores a CSV plus two HTML plots under ``../data/results/algo1``.
"""

# Importing the libraries
import gzip
import os
import shutil
import time
import warnings
from os.path import exists

import scipy as sp
from scipy.sparse import *  # noqa: F401,F403 - kept from the original file
from scipy.sparse.linalg import norm
import numpy as np
import pandas as pd

# wget, networkx and plotly are imported inside the functions that use them,
# so the numerical helpers stay importable when those packages are missing.

# Silences every FutureWarning for the whole process; it was originally added
# because of the deprecated DataFrame.append call.
warnings.simplefilter(action='ignore', category=FutureWarning)

DATA_DIR = '../data'
RESULTS_DIR = '../data/results/algo1'
SNAP_URL = 'https://snap.stanford.edu/data'
DATASETS = {1: 'web-Stanford', 2: 'web-BerkStan'}
RESULT_COLUMNS = ['alpha', 'iterations', 'tau', 'time']


class utilities:
    """Namespace of helpers that load a graph and build the PageRank inputs."""

    # Importing the dataset
    @staticmethod
    def load_data():
        """Ask which dataset to use and return the path of its edge list.

        The file is downloaded from SNAP and unzipped into ``../data`` when it
        is not there yet.

        Raises:
            ValueError: if the answer is not an integer or not a listed option.
        """
        choice = int(input("Choose the dataset:\n [1] web-Stanford\n [2] web-BerkStan: \nEnter an option: "))
        if choice not in DATASETS:
            raise ValueError(f"Unknown dataset option: {choice}")

        name = DATASETS[choice]
        dataset = f"{DATA_DIR}/{name}.txt"
        if exists(dataset):
            return dataset

        print(f"\nThe file doesn't exist, download it from {SNAP_URL}/{name}.html")

        # if there is no folder data, create it
        os.makedirs(DATA_DIR, exist_ok=True)

        # Downloading the dataset
        import wget

        archive = f"{dataset}.gz"
        wget.download(f"{SNAP_URL}/{name}.txt.gz", archive)

        # Unzipping the dataset in chunks instead of loading it all in memory
        with gzip.open(archive, 'rb') as f_in, open(dataset, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        print("\nDataset downloaded\n")
        return dataset

    # Creating the graph from the dataset
    @staticmethod
    def create_graph(dataset):
        """Read the edge list at ``dataset`` into a directed graph.

        Returns:
            (G, n): the ``networkx.DiGraph`` and its number of nodes.
        """
        import networkx as nx

        print("\nCreating the graph...")
        G = nx.read_edgelist(dataset, create_using=nx.DiGraph(), nodetype=int)
        n = G.number_of_nodes()
        print("Graph created based on the dataset\n")
        return G, n

    # Creating the transition probability matrix
    @staticmethod
    def create_matrix(G, n=None):
        """Build the n x n transition probability matrix of ``G``.

        Entry (i, j) is 1/(number of nodes i links to) when node i links to
        node j and 0 otherwise. Node ``k`` is stored at index ``k - 1``.

        Args:
            G: graph exposing ``nodes()``, ``G[i]`` and ``number_of_nodes()``.
            n: matrix size; defaults to ``G.number_of_nodes()``.

        Returns:
            ``scipy.sparse.lil_matrix`` of shape (n, n).
        """
        # NOTE(review): the k - 1 shift assumes node ids run from 1 to n;
        # an id of 0 would wrap around to the last row/column - confirm.
        if n is None:
            n = G.number_of_nodes()

        print("Creating the transition probability matrix...")
        P = sp.sparse.lil_matrix((n, n))
        for i in G.nodes():
            successors = G[i]  # nodes that i links to
            out_degree = len(successors)
            if out_degree:
                weight = 1 / out_degree
                for j in successors:
                    P[i - 1, j - 1] = weight
        print("Transition probability matrix created\n")
        return P

    @staticmethod
    def dangling_nodes(P, n):
        """Return the (n, 1) indicator vector of the all-zero rows of ``P``.

        ``d[i] = 1`` when row i of ``P`` sums to zero, otherwise 0.
        """
        print("Creating the list of dangling nodes...")
        row_sums = np.asarray(P.sum(axis=1)).ravel()[:n]
        dangling = np.flatnonzero(row_sums == 0)
        d = sp.sparse.coo_matrix(
            (np.ones(dangling.size), (dangling, np.zeros(dangling.size, dtype=int))),
            shape=(n, 1),
        ).tolil()
        print("List of dangling nodes created\n")
        return d

    @staticmethod
    def probability_vector(n):
        """Return the uniform (n, 1) vector whose entries are all ``1/n``."""
        print("Creating the probability vector...")
        if n:
            v = sp.sparse.lil_matrix(np.full((n, 1), 1 / n))
        else:
            v = sp.sparse.lil_matrix((n, 1))
        print("Probability vector created\n")
        return v

    @staticmethod
    def transition_matrix(P, v, d):
        """Return ``P + v d^T``.

        NOTE(review): with ``P`` built row-wise by ``create_matrix`` the usual
        dangling-node correction is ``d v^T``; ``v d^T`` fills columns instead
        of rows. Left unchanged - confirm against the paper's convention.
        """
        print("Creating the transition matrix...")
        Pt = P + v.dot(d.T)
        print("Transition matrix created\n")
        return Pt

    @staticmethod
    def alpha():
        """Return the damping factors 0.85, 0.86, ..., 0.99."""
        return [i / 100 for i in range(85, 100)]


class Plotting:
    """Namespace of helpers that write the result plots as HTML files."""

    @staticmethod
    def _save_line_plot(x, y, title, yaxis_title, filename):
        """Write a lines+markers plot of ``y`` over tau into RESULTS_DIR."""
        import plotly.graph_objs as go

        fig = go.Figure(
            data=go.Scatter(x=x, y=y, mode='lines+markers'),
            layout=go.Layout(title=title, xaxis_title='tau', yaxis_title=yaxis_title),
        )

        # write_html does not create missing folders
        os.makedirs(RESULTS_DIR, exist_ok=True)
        fig.write_html(f"{RESULTS_DIR}/{filename}")
        print("The plot has been saved in the folder data/results/algo1")

    @staticmethod
    def tau_over_iterations(dataframe):
        """Plot the ``iterations`` column of ``dataframe`` against ``tau``."""
        # x and y are taken in the same row order so every tau stays paired
        # with its own measurement.
        Plotting._save_line_plot(
            dataframe['tau'].tolist(),
            dataframe['iterations'].tolist(),
            'Iterations needed for the convergence',
            'iterations',
            'taus_over_iterations.html',
        )

    @staticmethod
    def tau_over_time(df):
        """Plot the ``time`` column of ``df`` against ``tau``."""
        Plotting._save_line_plot(
            df['tau'].tolist(),
            df['time'].tolist(),
            'Time needed for the convergence',
            'time (seconds)',
            'taus_over_time.html',
        )


class Algorithms:
    """Namespace of the PageRank solvers."""

    @staticmethod
    def algo1(Pt, v, tau, max_mv, a: list):
        """Run Algorithm 1 for all damping factors in ``a`` at once.

        Args:
            Pt: sparse transition matrix of shape (n, n).
            v: sparse probability vector of shape (n, 1).
            tau: tolerance on the norm of each residual.
            max_mv: maximum number of matrix-vector products.
            a: list of damping factors.

        Returns:
            (mv, x, r, total_time): matrix-vector products used, the
            accumulated vector, the last residual vector computed, and the
            elapsed seconds rounded to two decimals.
        """
        start_time = time.perf_counter()

        print("STARTING ALGORITHM 1...")
        n = v.shape[0]  # taken from the input instead of a module global
        u = Pt.dot(v) - v
        mv = 1  # number of matrix-vector products
        r = sp.sparse.lil_matrix((n, 1))
        x = sp.sparse.lil_matrix((n, 1))
        residuals = np.zeros(len(a))

        # NOTE(review): a single x is shared by every damping factor, so the
        # returned x mixes their updates - confirm whether one x per alpha
        # was intended.
        for i, alpha_i in enumerate(a):
            r = alpha_i * u
            residuals[i] = norm(r)
            if residuals[i] > tau:
                x = r + v

        # initial=0.0 keeps max() defined when ``a`` is empty
        while residuals.max(initial=0.0) > tau and mv < max_mv:
            u = Pt.dot(u)  # should it be the same u of the beginning?
            mv += 1

            for i, alpha_i in enumerate(a):
                if residuals[i] >= tau:
                    r = (alpha_i ** (mv + 1)) * u
                    residuals[i] = norm(r)
                    if residuals[i] > tau:
                        x = r + x

        # Decide on the residuals: hitting max_mv on the very step that
        # converges must not be reported as a failure.
        if residuals.max(initial=0.0) > tau:
            print("The algorithm didn't converge in ", max_mv, " iterations")
        else:
            print("The algorithm converged in ", mv, " iterations")

        total_time = round(time.perf_counter() - start_time, 2)

        print("The algorithm took ", total_time, " seconds to run\n")

        return mv, x, r, total_time


# pandas dataframe to store the results
df = pd.DataFrame(columns=RESULT_COLUMNS)


def main():
    """Run the tolerance sweep interactively and save the CSV and plots."""
    global df

    dataset = utilities.load_data()
    # maximum number of iterations, asked to the user
    max_mv = int(input("Insert the maximum number of iterations: "))

    G, n = utilities.create_graph(dataset)
    P = utilities.create_matrix(G, n)
    d = utilities.dangling_nodes(P, n)
    v = utilities.probability_vector(n)
    Pt = utilities.transition_matrix(P, v, d)
    a = utilities.alpha()

    # run the algorithm for tau = 10^-5, 10^-6, ..., 10^-9
    rows = []
    for i in range(5, 10):
        tau = 10**(-i)
        print("\ntau = ", tau)
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)
        rows.append({'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time})

    # store the results in the dataframe (DataFrame.append is gone in pandas 2)
    df = pd.DataFrame(rows, columns=RESULT_COLUMNS)

    # save the results in a csv file; to_csv does not create missing folders
    os.makedirs(RESULTS_DIR, exist_ok=True)
    df.to_csv(f"{RESULTS_DIR}/different_tau.csv", index=False)

    # plot the results
    Plotting.tau_over_iterations(df)
    Plotting.tau_over_time(df)

    # print in the terminal the columns of the dataframe iterations, tau and time
    print("\n", df[['iterations', 'tau', 'time']])


# Main
if __name__ == "__main__":
    main()