From 79416d93d6460b7d5afeb0e5f70af2b0d0dd31e1 Mon Sep 17 00:00:00 2001
From: Luca Lombardo
Date: Mon, 7 Feb 2022 20:33:30 +0100
Subject: [PATCH] documentation

---
 README.md  | 37 ++++++++++++++++++++++++++++++-------
 filtro.py  |  5 +++--
 kenobi.cpp | 10 +++++-----
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 9fe5793..e2c8e56 100644
--- a/README.md
+++ b/README.md
@@ -267,12 +267,12 @@ Then we exclude the add with `.push_back` this two integers at the end of the ve
 
 That's where I tried to experiment a little bit. The original idea for optimizing the algorithm was to take a uniformly random subset of actors. This method has a problem: no matter how cleverly you pick this _random_ subset, you are going to exclude some important actors. And I would never want to exclude Ewan McGregor from anything!
 
-So I found this [paper](https://arxiv.org/abs/1704.01077) and I decided that this where the way to go
+So I found this [paper](https://arxiv.org/abs/1704.01077) and I decided that this was the way to go.
 
 ### The problem
 
 Given a connected graph $G = (V, E)$, the closeness centrality of a vertex $v$ is defined as
-$$ \frac{n-1}{\displaystyle \sum_{\omega \in V} d(v,w)} $$
+$$ C(v) = \frac{n-1}{\displaystyle \sum_{w \in V} d(v,w)} $$
 
 The idea behind this definition is that a central node should be very efficient in spreading information to all other nodes: for this reason, a node is central if the average number of links
@@ -288,7 +288,7 @@ In order to compute the $k$ vertices with largest closeness, the textbook algori
 $c(v)$ for each $v$ and returns the $k$ largest found values. The main bottleneck of this approach
 is the computation of $d(v, w)$ for each pair of vertices $v$ and $w$ (that is, solving the All
 Pairs Shortest Paths or APSP problem). This can be done in two ways: either by using fast
-matrix multiplication, in time $O(n^{2.373} \log n)$ _[Zwick 2002; Williams 2012]_, or by performing _a breadth-first search_ (in short, BFS) from each vertex $v \in V$ , in time $O(mn)$, where $n = |V|$ and $m = |E|$. Usually, the BFS approach is preferred because the other approach contains big constants hidden in the O notation, and because real-world networks are usually sparse, that is, $m$ is not much bigger than n$$. However, also this approach is too time-consuming if the input graph is very big
+matrix multiplication, in time $O(n^{2.373} \log n)$ _[Zwick 2002; Williams 2012]_, or by performing a _breadth-first search_ (BFS for short) from each vertex $v \in V$, in time $O(mn)$, where $n = |V|$ and $m = |E|$. Usually, the BFS approach is preferred because the matrix-multiplication approach hides big constants in the O notation, and because real-world networks are usually sparse, that is, $m$ is not much bigger than $n$. However, even this approach is too time-consuming if the input graph is very big.
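+
+To make the textbook approach concrete, here is a minimal sketch of one such BFS: it computes $c(v)$ for a single vertex on a plain adjacency-list graph (illustrative only; the names and types are hypothetical, not the ones used in kenobi.cpp). Running it from every vertex gives the $O(mn)$ algorithm described above.
+
+```cpp
+#include <queue>
+#include <vector>
+
+// Closeness of a single vertex via one BFS, assuming a connected,
+// unweighted graph stored as adjacency lists.
+double closeness_of(const std::vector<std::vector<int>>& adj, int v) {
+    const long long n = adj.size();
+    std::vector<int> dist(n, -1);  // -1 marks vertices not reached yet
+    std::queue<int> q;
+    dist[v] = 0;
+    q.push(v);
+    long long sum = 0;             // sum of d(v, w) over all reached w
+    while (!q.empty()) {
+        int u = q.front(); q.pop();
+        sum += dist[u];
+        for (int w : adj[u])
+            if (dist[w] == -1) { dist[w] = dist[u] + 1; q.push(w); }
+    }
+    return sum > 0 ? (double)(n - 1) / (double)sum : 0.0;  // c(v) = (n-1)/sum
+}
+```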
 
 ### Preliminaries
@@ -317,8 +317,9 @@ and hence $f(w) \leq L(w) < f (w) ~ ~ \forall w \in V \setminus \{v_1, ..., v_l
 
 Let's write the algorithm in pseudo-code, but keep in mind that we will modify it a little in the actual code.
 
 ```cpp
- Input : A graph G = (V, E)
- Output: Top k nodes with highest closeness and their closeness values c(v)
+Input : A graph G = (V, E)
+Output: Top k nodes with highest closeness and their closeness values c(v)
+
 global L, Q ← computeBounds(G);
 global Top ← [ ];
 global Farn;
 for each v ∈ V do Farn[v] ← None;
 while Q is not empty do
     v ← Q.extractMin();
     if |Top| ≥ k and L[v] > Farn[Top[k]] then return Top;
     Farn[v] ← updateBounds(v); // This function might also modify L
     add v to Top, and sort Top according to Farn;
     update Q according to the new bounds;
 ```
@@ -348,7 +349,11 @@ The crucial point of the algorithm is the definition of the lower bounds, that i
 What we are changing in this code is that, since $L=0$ is never updated, we do not need to define it. We will just loop over each vertex, in the order the map prefers. We do not need to define `Q` either, as we will loop over each vertex anyway, so the order does not matter.
 
-#### Multi-threaded implementation
+The lower bound on the farness of a vertex, maintained during its BFS, is
+
+$$ \frac{1}{n-1} (\sigma_{d-1} + n_d \cdot d) $$
+
+where $\sigma_{d-1}$ is the sum of the distances to the vertices already visited (all at distance at most $d-1$) and $n_d$ is the number of vertices not yet reached, each of which must be at distance at least $d$. The sketch below shows how this bound lets us cut a BFS short.
+
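+Concretely, here is a simplified, single-threaded sketch (hypothetical names; kenobi.cpp organizes this differently and across several threads) of a BFS that aborts as soon as the bound proves the vertex cannot enter the top k:
+
+```cpp
+#include <utility>
+#include <vector>
+
+// Returns the exact farness of v, or -1.0 if the BFS was pruned because even
+// the lower bound exceeds kth_farness (the k-th smallest farness found so
+// far; pass +infinity while fewer than k vertices have been processed).
+// Assumes a connected, unweighted graph stored as adjacency lists.
+double bounded_farness(const std::vector<std::vector<int>>& adj, int v,
+                       double kth_farness) {
+    const long long n = adj.size();
+    std::vector<bool> enqueued(n, false);  // vertices already discovered
+    std::vector<int> frontier{v}, next;
+    enqueued[v] = true;
+    long long sum = 0;      // sigma: sum of distances to discovered vertices
+    long long reached = 1;
+    for (long long d = 1; !frontier.empty(); d++) {
+        // Every vertex not discovered yet is at distance >= d, hence:
+        double lower = (double)(sum + (n - reached) * d) / (double)(n - 1);
+        if (lower > kth_farness) return -1.0;  // v cannot enter the top k
+        next.clear();
+        for (int u : frontier)
+            for (int w : adj[u])
+                if (!enqueued[w]) {
+                    enqueued[w] = true;
+                    sum += d;       // w sits at distance d from v
+                    reached++;
+                    next.push_back(w);
+                }
+        std::swap(frontier, next);
+    }
+    return (double)sum / (double)(n - 1);  // exact (normalized) farness of v
+}
+```
+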
+## Results
+
+Tested on a Razer Blade 15 (2018) with an i7-8750H (6 cores, 12 threads) and 16 GB of DDR4-2666 RAM. The algorithm takes full advantage of all 12 threads.
+
+| MIN_MOVIES | k   | Time to filter | Time to compute the top k (kenobi) |
+|------------|-----|----------------|------------------------------------|
+| 42         | 100 | 1m 30s         | 3m 48s                             |
+| 31         | 100 | 1m 44s         | 8m 14s                             |
+| 20         | 100 | 3m             | 19m 34s                            |
+
+How the filtered files change in relation to MIN_MOVIES:
+
+| MIN_MOVIES | Attori.txt elements | FilmFiltrati.txt elements | Relazioni.txt elements |
+|------------|---------------------|---------------------------|------------------------|
+| 42         | 7921                | 266337                    | 545848                 |
+| 31         | 13632               | 325087                    | 748580                 |
diff --git a/filtro.py b/filtro.py
index 55802ca..da0fbe7 100755
--- a/filtro.py
+++ b/filtro.py
@@ -5,7 +5,7 @@ import numpy as np
 import os
 import csv
 
-MIN_MOVIES = 30 # Only keep relations for actors that have made more than this many movies
+MIN_MOVIES = 42 # Only keep relations for actors that have made more than this many movies
 
 #-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
@@ -79,4 +79,5 @@ df_attori.to_csv('data/Attori.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar
 df_film.to_csv('data/FilmFiltrati.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'primaryTitle'], header=False, index=False)
 df_relazioni.to_csv('data/Relazioni.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'nconst'], header=False, index=False)
 
-# Takes about 1 min 30 s with MIN_MOVIES = 42
+# Takes about 1 min 30 s with MIN_MOVIES = 42 ----> kenobi with k=100 took 3m 48s
+# Takes about 3 min with MIN_MOVIES = 20 ----> kenobi with k=100 took 19m 34s
diff --git a/kenobi.cpp b/kenobi.cpp
index 089d3a8..e493819 100644
--- a/kenobi.cpp
+++ b/kenobi.cpp
@@ -189,7 +189,7 @@ vector<pair<int, double>> closeness(const size_t k) {
 	mutex top_actors_mutex; // The threads write to top_actors, so another thread reading top_actors at the same time may find it in an invalid state (if the read happens while the other thread is still writing)
 	threads.reserve(N_THREADS);
 	for (int i = 0; i < N_THREADS; i++) {
-		// Lancio i thread
+		// Launching the threads
 		threads.push_back(thread([&top_actors,&top_actors_mutex,&k](int start) {
 			vector<bool> enqueued(MAX_ACTOR_ID, false); // Vector to track which vertices were put in the queue during the BFS
 			// We loop over each vertex
@@ -265,7 +265,7 @@ vector<pair<int, double>> closeness(const size_t k) {
 	}
 
 	for (auto& thread : threads)
-		// Aspetto che tutti i thread abbiano finito
+		// Waiting for all threads to finish
 		thread.join();
 
 	return top_actors;
@@ -332,7 +332,7 @@ vector<pair<int, double>> harmonic(const size_t k) {
 			// cout << actor_id << " " << A[actor_id].name << " SKIPPED" << endl;
 			continue;
 		}
-		// BFS is over, we compute the farness
+		// BFS is over, we compute the centrality
 		double harmonic_centrality = sum_reverse_distances;
 		if (!isfinite(harmonic_centrality)) continue;
@@ -369,7 +369,7 @@ int main()
 	// ------------------------------------------------------------- //
 
-	// FUNZIONE CERCA FILMclos
+	// MOVIE SEARCH FUNCTION
 	// cout << "Cerca film: ";
 	// string titolo;
@@ -400,7 +400,7 @@ int main()
 	// ------------------------------------------------------------- //
 
 	cout << "Grafo, grafo delle mie brame... chi è il più centrale del reame?\n" <