refined the algo1, starting to test algo2
parent
990d59e3ab
commit
aa7364c261
@ -0,0 +1,221 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Importing the libraries
|
||||||
|
import os
|
||||||
|
import wget
|
||||||
|
import gzip
|
||||||
|
import time
|
||||||
|
import warnings
|
||||||
|
import scipy as sp
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import networkx as nx
|
||||||
|
import plotly.graph_objs as go
|
||||||
|
from scipy.sparse import *
|
||||||
|
from scipy.sparse.linalg import norm
|
||||||
|
from os.path import exists
|
||||||
|
|
||||||
|
|
||||||
|
warnings.simplefilter(action='ignore', category=FutureWarning)
|
||||||
|
# Silence pandas FutureWarnings (e.g. the DataFrame.append deprecation noise)
|
||||||
|
|
||||||
|
class utilities:
    """Helpers for fetching the SNAP web-graph datasets and building the
    sparse matrices/vectors used by the PageRank algorithms below."""

    @staticmethod
    def _download_if_missing(name):
        """Ensure ../data/<name>.txt exists and return its path.

        When the file is absent, create ../data if needed, download the
        gzipped SNAP dump and decompress it next to the archive.
        """
        path = '../data/%s.txt' % name
        if not exists(path):
            print("\nThe file doesn't exist, download it from https://snap.stanford.edu/data/%s.html" % name)

            # if there is no folder data, create it
            if not exists('../data'):
                os.makedirs('../data')

            # Downloading the dataset
            url = 'https://snap.stanford.edu/data/%s.txt.gz' % name
            wget.download(url, path + '.gz')

            # Unzipping the dataset
            with gzip.open(path + '.gz', 'rb') as f_in:
                with open(path, 'wb') as f_out:
                    f_out.write(f_in.read())

            print("\nDataset downloaded\n")
        return path

    @staticmethod
    def load_data():
        """Ask the user which dataset to use and return the path to its
        edge-list file, downloading it first when necessary.

        Raises ValueError for an option other than 1 or 2 (the original
        silently returned the raw integer, which broke downstream).
        """
        choice = int(input("Choose the dataset:\n [1] web-Stanford\n [2] web-BerkStan: \nEnter an option: "))
        if choice == 1:
            return utilities._download_if_missing('web-Stanford')
        elif choice == 2:
            return utilities._download_if_missing('web-BerkStan')
        raise ValueError("Invalid option: %d (expected 1 or 2)" % choice)

    @staticmethod
    def create_graph(dataset):
        """Read `dataset` as an integer edge list and return (DiGraph, node count)."""
        print("\nCreating the graph...")
        G = nx.read_edgelist(dataset, create_using=nx.DiGraph(), nodetype=int)
        n = G.number_of_nodes()
        print("Graph created based on the dataset\n")
        return G, n

    @staticmethod
    def create_matrix(G):
        """Build the sparse transition-probability matrix of G.

        P[i-1, j-1] = 1/outdegree(i) for every edge i -> j.
        NOTE(review): this assumes node labels are 1..n with no gaps —
        confirm against the SNAP datasets, whose ids may be sparse.
        """
        print("Creating the transition probability matrix...")
        # `n` used to be read from a module-level global; derive it from G.
        n = G.number_of_nodes()
        P = sp.sparse.lil_matrix((n, n))
        for i in G.nodes():
            succ = G[i]  # neighbors (successors) of i
            w = 1 / len(succ)
            for j in succ:
                P[i - 1, j - 1] = w
        print("Transition probability matrix created\n")
        return P

    @staticmethod
    def dangling_nodes(P, n):
        """Return an (n,1) sparse indicator: d[i] = 1 iff row i of P is all zeros."""
        print("Creating the list of dangling nodes...")
        d = sp.sparse.lil_matrix((n, 1))
        for i in range(n):
            if P[i].sum() == 0:
                d[i] = 1
        print("List of dangling nodes created\n")
        return d

    @staticmethod
    def probability_vector(n):
        """Return the uniform (n,1) sparse probability vector, v[i] = 1/n."""
        print("Creating the probability vector...")
        v = sp.sparse.lil_matrix((n, 1))
        for i in range(n):
            v[i] = 1 / n
        print("Probability vector created\n")
        return v

    @staticmethod
    def transition_matrix(P, v, d):
        """Return Pt = P + v @ d.T, patching the dangling nodes.

        NOTE(review): the classic PageRank correction is d @ v.T (each
        dangling ROW replaced by v); this computes v @ d.T instead —
        confirm which orientation the algorithms downstream expect.
        """
        print("Creating the transition matrix...")
        Pt = P + v.dot(d.T)
        print("Transition matrix created\n")
        return Pt

    @staticmethod
    def alpha():
        """Return the damping factors to test: 0.85, 0.86, ..., 0.99."""
        return [i / 100 for i in range(85, 100)]
|
||||||
|
|
||||||
|
class Plotting:
    """Plot helpers for the algo1 tau sweep; each writes an interactive
    HTML figure under ../data/results/algo1."""

    @staticmethod
    def tau_over_iterations(dataframe):
        """Plot iterations-to-convergence against tau.

        Expects a dataframe with 'tau' and 'iterations' columns.
        Bug fix: the original rebound its argument to the global `df`
        (`dataframe = df`) and ignored what the caller passed in.
        """
        # tau values were appended from 1e-5 down to 1e-9; reverse for an ascending x-axis
        x = dataframe['tau'][::-1].tolist()
        y = dataframe['iterations'].tolist()

        fig = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'),
                        layout=go.Layout(title='Iterations needed for the convergence', xaxis_title='tau', yaxis_title='iterations'))

        # save the figure as a html file
        fig.write_html("../data/results/algo1/taus_over_iterations.html")
        print("The plot has been saved in the folder data/results/algo1")

    @staticmethod
    def tau_over_time(df):
        """Plot wall-clock time-to-convergence against tau.

        Expects a dataframe with 'tau' and 'time' columns.
        """
        x1 = df['tau'][::-1].tolist()
        y1 = df['time'].tolist()

        fig = go.Figure(data=go.Scatter(x=x1, y=y1, mode='lines+markers'),
                        layout=go.Layout(title='Time needed for the convergence', xaxis_title='tau', yaxis_title='time (seconds)'))

        # save the plot in a html file
        fig.write_html("../data/results/algo1/taus_over_time.html")
        print("The plot has been saved in the folder data/results/algo1")
|
||||||
|
|
||||||
|
class Algorithms:
    """Power-method variants computing PageRank for several damping factors at once."""

    @staticmethod
    def algo1(Pt, v, tau, max_mv, a: list):
        """Shifted power method: share the matrix-vector products u = Pt^k v
        across every damping factor in `a`.

        Parameters:
            Pt     -- (n, n) sparse transition matrix
            v      -- (n, 1) sparse probability vector
            tau    -- tolerance on the residual norm
            max_mv -- cap on the number of matrix-vector products
            a      -- list of damping factors, e.g. [0.85, ..., 0.99]

        Returns (mv, x, r, total_time): products used, last accumulated
        solution, last residual, and elapsed wall time in seconds.
        """
        start_time = time.time()

        print("STARTING ALGORITHM 1...")

        # Problem size used to come from a module-level global; take it from v.
        n = v.shape[0]

        u = Pt.dot(v) - v
        mv = 1  # number of matrix-vector products so far

        # Residual norms are plain scalars, so keep them in a list
        # (the original stored them in a sparse (len(a), 1) matrix).
        Res = [0.0] * len(a)
        r = sp.sparse.lil_matrix((n, 1))
        x = sp.sparse.lil_matrix((n, 1))

        # First-order term for every damping factor.
        for i in range(len(a)):
            r = a[i] * u
            Res[i] = norm(r)
            if Res[i] > tau:
                x = r + v

        # Keep multiplying by Pt until every alpha has converged or the budget runs out.
        while max(Res) > tau and mv < max_mv:
            u = Pt * u  # NOTE(review): reuses u across alphas, as in the original — confirm intended
            mv += 1

            for i in range(len(a)):
                if Res[i] >= tau:  # only update alphas that have not converged yet
                    r = (a[i] ** (mv + 1)) * u
                    Res[i] = norm(r)
                    if Res[i] > tau:
                        x = r + x

        if mv == max_mv:
            print("The algorithm didn't converge in ", max_mv, " iterations")
        else:
            print("The algorithm converged in ", mv, " iterations")

        total_time = round(time.time() - start_time, 2)

        print("The algorithm took ", total_time, " seconds to run\n")

        return mv, x, r, total_time
|
||||||
|
|
||||||
|
# pandas dataframe to store the results of each tau run
df = pd.DataFrame(columns=['alpha', 'iterations', 'tau', 'time'])

# Main
if __name__ == "__main__":
    dataset = utilities.load_data()

    # maximum number of iterations, asked to the user
    max_mv = int(input("Insert the maximum number of iterations: "))

    # Build the graph and all the PageRank ingredients.
    G, n = utilities.create_graph(dataset)
    P = utilities.create_matrix(G)
    d = utilities.dangling_nodes(P, n)
    v = utilities.probability_vector(n)
    Pt = utilities.transition_matrix(P, v, d)
    a = utilities.alpha()

    # run the algorithm for different values of tau from 10^-5 to 10^-9 with step 10^-1
    for i in range(5, 10):
        tau = 10 ** (-i)
        print("\ntau = ", tau)
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)

        # store the results in the dataframe
        # (DataFrame.append was removed in pandas 2.0 — use pd.concat instead)
        row = pd.DataFrame([{'alpha': a, 'iterations': mv, 'tau': tau, 'time': total_time}])
        df = pd.concat([df, row], ignore_index=True)

    # make sure the output folder exists, then save the results in a csv file
    os.makedirs('../data/results/algo1', exist_ok=True)
    df.to_csv('../data/results/algo1/different_tau.csv', index=False)

    # plot the results
    Plotting.tau_over_iterations(df)
    Plotting.tau_over_time(df)

    # print in the terminal the columns of the dataframe iterations, tau and time
    print("\n", df[['iterations', 'tau', 'time']])
|
Loading…
Reference in New Issue