minor fixes

2 years ago · 097fa7cf9f
parent 3e08fee18d
commit 097fa7cf9f
5 changed files with 565321 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -275,4 +275,6 @@ TSWLatexianTemp*
 # Makeindex log files
 *.lpz
-data/*.txt
+data/
 __pycache__/
--- a/tex/main.pdf
+++ b/tex/main.pdf
--- a/tex/main.tex
+++ b/tex/main.tex
@ -35,7 +35,7 @@ for solving PageRank with multiple damping factors}
 \maketitle
 \begin{abstract}
-    \noindent In the years following its publication in 1968, the PageRank model has been studied deeply to be extended in fields such as chemistry, biology and social network analysis. The aim of this project is the implementation of a modified version of the Power method to solve the PageRank problem with multiple damping factors. The proposed method is based on the combination of the Power method with the shifted \texttt{GMRES} method.
+    \noindent In the years following its publication in 1998, the PageRank model has been studied deeply to be extended in fields such as chemistry, biology and social network analysis. The aim of this project is the implementation of a modified version of the Power method to solve the PageRank problem with multiple damping factors. The proposed method is based on the combination of the Power method with the shifted \texttt{GMRES} method.
    %  During the years since it's first version, the PageRank model has been extended to many fields far beyond search engine rankings, such as chemistry, biology, bioinformatics, social network analysis, to name a few. Due to the large dimension of PageRank problems, in the past decade or so, considerable research efforts have been devoted to their efficient solution especially for the difficult cases where the damping factors are close to 1. However, there exists few research work concerning about the solution of the case where several PageRank problems with the same network structure and various damping factors need to be solved. In this paper, we generalize the Power method to solving the PageRank problem with multiple damping factors. We demonstrate that the solution has almost the equative cost of solving the most difficult PageRank system of the sequence, and the residual vectors of the PageRank systems after running this method are collinear. Based upon these results, we develop a more efficient method that combines this Power method with the shifted GMRES method. For further accelerating the solving phase, we present a seed system choosing strategy combined with an extrapolation technique, and analyze their effect. Numerical experiments demonstrate the potential of the proposed iterative solver for accelerating realistic PageRank computations with multiple damping factors.
 \end{abstract}
--- a/tmp/deprecated_main.py
+++ b/tmp/deprecated_main.py
@ -0,0 +1,286 @@
 #!/usr/bin/env python3
 # Importing the libraries
 import os
 import wget
 import gzip
 import time
 import warnings
 import scipy as sp
 import numpy as np
 import pandas as pd
 import networkx as nx
 from os.path import exists
 from scipy.sparse import *
 from scipy.sparse.linalg import norm
 import plotly.graph_objs as go
 warnings.simplefilter(action='ignore', category=FutureWarning)
 # some stupid pandas function that doesn't work
 class Utilities:
    # Importing the dataset
    def load_data():
        # Loading the dataset
        dataset = int(input("Choose the dataset:\n  [1] web-Stanford (use this one for now)\n  [2] web-BerkStan: \nEnter an option: "))
        if dataset == 1:
            if exists('../data/web-Stanford.txt'):
                dataset = '../data/web-Stanford.txt'
            else:
                print("\nThe file doesn't exist, download it from https://snap.stanford.edu/data/web-Stanford.html")
                # if there is no folder data, create it
                if not exists('../data'):
                    os.makedirs('../data')
                # Downloading the dataset
                url = 'https://snap.stanford.edu/data/web-Stanford.txt.gz'
                wget.download(url, '../data/web-Stanford.txt.gz')
                # Unzipping the dataset
                with gzip.open('../data/web-Stanford.txt.gz', 'rb') as f_in:
                    with open('../data/web-Stanford.txt', 'wb') as f_out:
                        f_out.write(f_in.read())
                # delete the zipped file
                os.remove('../data/web-Stanford.txt.gz')
                dataset = '../data/web-Stanford.txt'
                print("\nDataset downloaded\n")
        elif dataset == 2:
            if exists('../data/web-BerkStan.txt'):
                dataset = '../data/web-BerkStan.txt'
            else:
                print("\nThe file doesn't exist, download it from https://snap.stanford.edu/data/web-BerkStan.html")
                # if there is no folder data, create it
                if not exists('../data'):
                    os.makedirs('../data')
                # Downloading the dataset
                url = 'https://snap.stanford.edu/data/web-BerkStan.txt.gz'
                wget.download(url, '../data/web-BerkStan.txt.gz')
                # Unzipping the dataset
                with gzip.open('../data/web-BerkStan.txt.gz', 'rb') as f_in:
                    with open('../data/web-BerkStan.txt', 'wb') as f_out:
                        f_out.write(f_in.read())
                # delete the zipped file
                os.remove('../data/web-BerkStan.txt.gz')
                dataset = '../data/web-BerkStan.txt'
                print("\nDataset downloaded\n")
        return dataset
    # Creating the graph from the dataset
    def create_graph(dataset):
        print("\nCreating the graph...")
        G = nx.read_edgelist(dataset, create_using=nx.DiGraph(), nodetype=int)
        n = G.number_of_nodes()
        print("Graph created based on the dataset\n")
        return G, n
    # The matrix is filled with zeros and the (i,j) element is x if the node i is connected to the node j. Where x is 1/(number of nodes connected to i).
    def create_matrix(G):
        print("Creating the transition probability matrix...")
        P = sp.sparse.lil_matrix((n,n))
        for i in G.nodes():
            for j in G[i]: #G[i] is the list of nodes connected to i, it's neighbors
                P[i-1,j-1] = 1/len(G[i])
        print("Transition probability matrix created\n")
        return P
    # The vector is filled with d(i) = 1 if the i row of the matrix P is filled with zeros, other wise is 0
    def dangling_nodes(P,n):
        print("Creating the list of dangling nodes...")
        d = sp.sparse.lil_matrix((n,1))
        for i in range(n):
            if P[i].sum() == 0:
                d[i] = 1
        print("List of dangling nodes created\n")
        return d
    # For now it is set to equally distribute the probability to all the nodes
    def probability_vector(n):
        print("Creating the probability vector...")
        v = sp.sparse.lil_matrix((n,1))
        for i in range(n):
            v[i] = 1/n
        print("Probability vector created\n")
        return v
    def transition_matrix(P, v, d):
        print("Creating the transition matrix...")
        Pt = P + v @ (d.T)
        print("Transition matrix created\n")
        return Pt
    # it can vary from 0 to 1, the higher the value the more the probability to jump to a random page
    def alpha():
        a = []
        for i in range(85,100):
            a.append(i/100)
        return a
 # Class for plotting the results obtained. For now it can only the first algorithm. To be updated once all the algorithms are implemented and the test cases are well defined
 class Plotting:
    def tau_over_iterations(df):
        x = df['tau'][::-1].tolist()
        y = df['products m-v'].tolist()
        fig = go.Figure(data=go.Scatter(x=x, y=y, mode='lines+markers'),
                                    layout=go.Layout(title='products needed for the convergence', xaxis_title='tau', yaxis_title='products matrix vector'))
        # save the figure as a html file
        fig.write_html("../data/results/algo1/taus_over_prods.html")
        print("The plot has been saved in the folder data/results/algo1")
    def tau_over_time(df):
        x1 = df['tau'][::-1].tolist()
        y1 = df['time'].tolist()
        fig = go.Figure(data=go.Scatter(x=x1, y=y1, mode='lines+markers'),
                                        layout=go.Layout(title='Time needed for the convergence', xaxis_title='tau', yaxis_title='time (seconds)'))
        # save the plot in a html file
        fig.write_html("../data/results/algo1/taus_over_time.html")
        print("The plot has been saved in the folder data/results/algo1")
 class Algorithms:
    # Power method adapted to the PageRank problem with different damping factors. Referred as Algorithm 1 in the paper
    def algo1(Pt, v, tau, max_mv, a: list):
        start_time = time.time()
        print("STARTING ALGORITHM 1...")
        u = Pt @ v - v
        mv = 1 # number of matrix-vector multiplications
        r = sp.sparse.lil_matrix((n,1))
        Res = sp.sparse.lil_matrix((len(a),1))
        x = sp.sparse.lil_matrix((n,1))
        for i in range(len(a)):
            r = a[i]*(u)
            normed_r = norm(r)
            Res[i] = normed_r
            if Res[i] > tau:
                x = r + v
        while max(Res) > tau and mv < max_mv:
            u = Pt @ u
            mv += 1
            for i in range(len(a)):
                if Res[i] >= tau:
                    r = (a[i]**(mv+1))*(u)
                    Res[i] = norm(r)
                    if Res[i] > tau:
                        x = r + x
        if mv == max_mv:
            print("The algorithm didn't converge in ", max_mv, " iterations")
        else:
            print("The algorithm converged with ", mv, " matrix-vector multiplications executed")
        total_time = time.time() - start_time
        total_time = round(total_time, 2)
        print("The algorithm took ", total_time, " seconds to run\n")
        return mv, x, r, total_time
    # Refers to Algorithm 2 in the paper
    def Arnoldi(A,v0,m):
        v = v0
        beta = norm(v)
        v = v/beta
        H = sp.sparse.lil_matrix((m+1,m))
        V = sp.sparse.lil_matrix((A.shape[0],m+1))
        V[:,0] = v # each column of V is a vector v
        for j in range(m):
            # print("j = ", j)
            w = A @ v
            for i in range(j):
                tmp = v.T @ w # tmp is a 1x1 matrix, so it's O(1) in memory
                H[i,j] = tmp[0,0]
                w = w - H[i,j]*v
            H[j+1,j] = norm(w)
            if H[j+1,j] == 0:
                print("Arnoldi breakdown")
                m = j
                v = 0
                break
            else:
                if j < m-1:
                    v = w/H[j+1,j]
                    V[:,j+1] = v
        print(j, " iterations completed")
        print("V = ", V.shape)
        print("H = ", H.shape)
        print("v = ", v.shape)
        print("beta = ", beta)
        return V, H, v, beta, j
    def algo4():
        # TO DO
        pass
 class runners:
    def ShiftedPowerMethod(tau):
        dataset = Utilities.load_data()
        max_mv = 100
        G, n = Utilities.create_graph(dataset)
        P = Utilities.create_matrix(G)
        d = Utilities.dangling_nodes(P,n)
        v = Utilities.probability_vector(n)
        Pt = Utilities.transition_matrix(P, v, d)
        a = Utilities.alpha()
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)
        print("total time = ", total_time)
 # pandas dataframe to store the results
 df = pd.DataFrame(columns=['alpha', 'products m-v', 'tau', 'time'])
 # Main
 if __name__ == "__main__":
    dataset = Utilities.load_data()
    # maximum number of iterations, asked to the user
    max_mv = int(input("\nInsert the maximum number of matrix-vector operations: "))
    G, n = Utilities.create_graph(dataset)
    P = Utilities.create_matrix(G)
    d = Utilities.dangling_nodes(P,n)
    v = Utilities.probability_vector(n)
    Pt = Utilities.transition_matrix(P, v, d)
    a = Utilities.alpha()
    # run the algorithm for different values of tau from 10^-5 to 10^-9 with step 10^-1
    for i in range(5,10):
        tau = 10**(-i)
        print("\ntau = ", tau)
        mv, x, r, total_time = Algorithms.algo1(Pt, v, tau, max_mv, a)
        # store the results in the dataframe
        df = df.append({'alpha': a, 'products m-v': mv, 'tau': tau, 'time': total_time}, ignore_index=True)
    # save the results in a csv file
    df.to_csv('../data/results/algo1/different_tau.csv', index=False)
    # plot the results
    Plotting.tau_over_iterations(df)
    Plotting.tau_over_time(df)
    # print in the terminal the columns of the dataframe iterations, tau and time
    print("Computations done. Here are the results:")
    print("\n", df[['products m-v', 'tau', 'time']])
--- a/tmp/testing.ipynb
+++ b/tmp/testing.ipynb