new chapters

main
Luca Lombardo 3 years ago
parent 8eca30ecdf
commit d6291259d0

@@ -6,7 +6,7 @@ import numpy as np
 import os
 import csv
-MIN_MOVIES = 42  # Only keep relations for actors that have made more than this many movies
+MIN_MOVIES = 70  # Only keep relations for actors that have made more than this many movies
 #-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
 def colored(r, g, b, text):
@@ -29,8 +29,7 @@ def download_url(url):
 urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",
         "https://datasets.imdbws.com/title.principals.tsv.gz",
-        "https://datasets.imdbws.com/title.basics.tsv.gz",
-        "https://datasets.imdbws.com/title.ratings.tsv.gz"]
+        "https://datasets.imdbws.com/title.basics.tsv.gz"]
 for url in urls:
     download_url(url)

@@ -61,8 +61,8 @@ df_film = pd.merge(df_film, df_ratings, "left", on="tconst")
 del df_ratings
 df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',
               inplace=True)
-VOTES_MEAN = df_film['numVotes'].mean()
-df_film.query('numVotes > @VOTES_MEAN', inplace=True)
+VOTES = df_film['numVotes'].mean()
+df_film.query('numVotes > @VOTES', inplace=True)
 filtered_tconsts = df_film["tconst"].to_list()
 print("Filtering relations...")

Binary file not shown.

@@ -75,7 +75,7 @@ void DataRead()
 void BuildGraph()
 {
-    ifstream relations("data/Relazioni.txt");
+    ifstream relations("../data/data_actor_graph/Relazioni.txt");
     string s;
     const string space = "\t";
@@ -224,7 +224,7 @@ vector<pair<int, double>> closeness(const size_t k) {
     // Waiting for all threads to finish
     thread.join();
-    ofstream output_file("visualization/top_actors_c.txt");
+    ofstream output_file("actor_bench/top_actors_c_70.txt");
     for (const auto& [actor_id, farness] : top_actors) {
         output_file << actor_id << "\t" << A[actor_id].name << "\t" << 1.0/farness << endl;
     }
@@ -315,7 +315,7 @@ vector<pair<int, double>> harmonic(const size_t k) {
     for (auto& thread : threads)
         thread.join();
-    ofstream output_file("visualization/top_actors_h.txt");
+    ofstream output_file("actor_bench/top_actors_h_70.txt");
     for (const auto& [actor_id, harmonic] : top_actors) {
         output_file << actor_id << "\t" << A[actor_id].name << "\t" << harmonic << endl;
     }

Binary file not shown.

@@ -1,4 +1,5 @@
Understanding and investigating social structures is essential in the modern world. Through the use of networks and graph theory we can find the most central elements in a community. In particular, given a connected graph $G=(V,E)$, the closeness centrality of a vertex $v$ is defined as $\frac{n-1}{\sum_{w \in V} d(v,w)}$. This measure can be seen as the efficiency of a node in passing information to all the other nodes in the graph. In this paper we discuss an algorithm for finding the top-$k$ most central elements in web-scale graphs, together with its results. As a case study, we use the IMDB collaboration network, building two completely different graphs and analyzing their properties.
% Given a connected graph $G=(V,E)$, the closeness centrality of a vertex $v$ is defined as $\frac{n-1}{\sum_{w \in V} d(v,w)}$. This measure is widely used in the analysis of real-world complex networks, and the problem of selecting the $k$ most central vertices has been deeply analysed in the last decade. However, this problem is computationally not easy, especially for large networks. I propose an algorithm for selecting the $k$ most central nodes in a graph: I experimentally show that this algorithm improves significantly both the textbook algorithm, which is based on computing the distance between all pairs of vertices, and the state of the art. Finally, as a case study, I compute the $10$ most central actors in the IMDB collaboration network, where two actors are linked if they played together in a movie.
% Reword this, taken from the paper

@@ -5,36 +5,34 @@ In a connected graph, given a node $v \in V$, we can define its farness as
\begin{equation}
    f(v) = \frac{1}{c(v)} = \frac{1}{n-1} \displaystyle \sum_{w \in V} d(v,w)
\end{equation}
where $c(v)$ is the closeness centrality defined in \eqref{closeness}. Since we are working with a disconnected graph, a natural generalization of this formula is
\begin{equation}\label{wrong-farness}
    f(v) = \frac{1}{c(v)} = \frac{1}{r(v)-1} \displaystyle \sum_{w \in V} d(v,w)
\end{equation}
where $r(v) = |R(v)|$ is the cardinality of the set of nodes reachable from $v$. To avoid problems during the computation, this formula still needs to be modified. Let's assume that the node $v$ we are considering has just one link, at distance $1$, to another node $w$ with \emph{out-degree} $0$. If we use formula \eqref{wrong-farness} we get a misleading result: $v$ would appear to be very central, even though it is obviously very peripheral. To avoid this problem, we can generalize formula \eqref{wrong-farness}, normalizing as suggested in \texttt{[Lin 1976; Wasserman and Faust 1994; Boldi and Vigna 2013; 2014; Olsen et al. 2014]}
\begin{equation}\label{farness}
    f(v) = \frac{n-1}{(r(v)-1)^2} \sum_{w \in R(v)} d(v,w)
\end{equation}
with the convention that in a $\frac{0}{0}$ case we set the closeness of $v$ to $0$.
\subsection{The lower bound technique}
During the computation of the farness we have to compute, for each node, its distance to all the other nodes reachable from it. Since we are dealing with millions of nodes, this is not possible in a reasonable time. In order to compute only the top-$k$ most central nodes, we need a way to avoid running a full BFS for nodes that cannot be in the top-$k$. \s
\noindent The idea is to keep track of a lower bound on the farness of each node that we process. This allows us to kill the BFS before it reaches the end if the lower bound tells us that the node will not be in the top-$k$. More precisely:
\begin{itemize}
    \item The algorithm computes the farness of the first $k$ nodes, saving them in a vector \texttt{top-actors}. From now on, this vector is full.
    \item Then, for every subsequent vertex, it maintains the lower bound
    \begin{equation}\label{lower-bound}
        \frac{n-1}{(n-1)^2} (\sigma_{d-1} + n_d \cdot d)
    \end{equation}
    where $\sigma_d$ is the partial sum in \eqref{farness} at exploration level $d$ and $n_d$ is the number of nodes at distance $d$. The lower bound \eqref{lower-bound} is updated every time the BFS moves to a new exploration level. If, at a change of level, the lower bound of the vertex under consideration is bigger than the farness of the $k$-th element of \texttt{top-actors}, we can kill the BFS. The reason is simple: \texttt{top-actors} holds the current top-$k$ nodes in order, and farness is inversely proportional to closeness centrality. So if at that level the lower bound is already bigger than the last element of the vector, there is no need to compute the remaining levels of the BFS, since the node would not be added to \texttt{top-actors} anyway. \s
    The bound \eqref{lower-bound} is a worst-case scenario, and that makes it a valid lower bound. If we are at exploration level $d$, we have already computed the sum in \eqref{farness} up to level $d-1$; we then account for the current level by assuming, in the worst case, that $v$ is linked to all the nodes at distance $d$. We also set $r(v)=n$, as if the graph were strongly connected and all vertices were reachable from $v$.
\end{itemize}
\textsc{Write pseudocode}
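
In place of the still-missing pseudocode, here is a minimal runnable sketch of the pruned BFS in Python. The function name, the adjacency-dict input and the \texttt{kth\_farness} parameter are illustrative assumptions; the repository's actual implementation is the multi-threaded C\texttt{++} one shown elsewhere in this diff.

import math
from collections import deque

def farness_with_cutoff(graph, v, n, kth_farness):
    """Farness of v per equation (3), or None if the BFS is killed early.

    graph: dict node -> iterable of neighbours
    n: total number of nodes; kth_farness: farness of the current k-th
    entry of top-actors (math.inf while fewer than k have been computed).
    """
    visited = {v}
    queue = deque([(v, 0)])
    sigma = 0       # partial sum of distances; equals sigma_{d-1} at a level change
    reachable = 1   # r(v), counting v itself
    level = 0
    while queue:
        node, dist = queue.popleft()
        if dist > level:                  # entering BFS level d = dist
            level = dist
            n_d = len(queue) + 1          # nodes known to sit at distance d
            lower_bound = (sigma + n_d * level) / (n - 1)
            if lower_bound > kth_farness:
                return None               # cannot enter the top-k: kill the BFS
        sigma += dist
        if node != v:
            reachable += 1
        for w in graph[node]:
            if w not in visited:
                visited.add(w)
                queue.append((w, dist + 1))
    if reachable == 1:
        return math.inf                   # 0/0 convention: closeness is 0
    return (n - 1) * sigma / (reachable - 1) ** 2

# e.g. on the path graph 0-1-2: farness_with_cutoff({0: [1], 1: [0, 2], 2: [1]}, 0, 3, math.inf) == 1.5
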

@@ -0,0 +1,35 @@
\section{An overview of the code}
The algorithm implemented is multi-threaded and written in C\texttt{++}.
\subsection{Data structures}
In this case we are working with two simple \texttt{struct}s for the classes \emph{Film} and \emph{Actor}
\lstinputlisting[language=c++]{code/struct.cpp}
\s
\nd Then we need two dictionaries, built like this
\lstinputlisting[language=c++]{code/map.cpp}
\s
\nd We are considering the files \texttt{Attori.txt} and \texttt{FilmFiltrati.txt}; we don't need the relations file for now. Once we have read these two files, we loop over each one, brutally filling the two dictionaries created before. If a line is empty, we skip it. We use a try-catch approach: even though good practice is to catch only specific errors, since we are reporting everything on the terminal it makes sense to \emph{catch} any error.
\lstinputlisting[language=c++]{code/data.cpp}
\s
Now we can use the file \texttt{Relazioni.txt}. As before, we loop over all the lines of this file, creating the variables
\begin{itemize}
    \item \texttt{id\textunderscore film}: index key of each movie
    \item \texttt{id\textunderscore attore}: index key of each actor
\end{itemize}
\nd If they both exist, we update the list of indices of movies that the actor/actress played in. In the same way, we update the list of indices of actors/actresses that played in the movie with that id.
\lstinputlisting[language=c++]{code/graph.cpp}
\s
Now that we have defined how to build this graph, we have to implement the algorithm that will return the top-$k$ central elements. \s
\nd The code can be found here: \url{https://github.com/lukefleed/imdb-graph}
\s
\begin{center}
    \qrcode[height=1in]{https://github.com/lukefleed/imdb-graph}
\end{center}

@ -0,0 +1,39 @@
void DataRead()
{
ifstream actors("data/Attori.txt");
ifstream movies("data/FilmFiltrati.txt");
string s,t;
const string space /* the final frontier */ = "\t";
for (int i = 1; getline(actors,s); i++)
{
if (s.empty())
continue;
try {
Actor TmpObj;
int id = stoi(s.substr(0, s.find(space)));
TmpObj.name = s.substr(s.find(space)+1);
A[id] = TmpObj; // Python notation, works with C++17
if (id > MAX_ACTOR_ID)
MAX_ACTOR_ID = id;
} catch (...) {
cout << "Could not read the line " << i << " of Actors file" << endl;
}
}
for (int i = 1; getline(movies,t); i++)
{
if (t.empty())
continue;
try{
Film TmpObj;
int id = stoi(t.substr(0, t.find(space)));
TmpObj.name = t.substr(t.find(space)+1);
F[id] = TmpObj;
} catch (...) {
cout << "Could not read the line " << i << " of Film file" << endl;
}
}
}

@@ -0,0 +1,21 @@
void BuildGraph()
{
    ifstream relations("data/Relazioni.txt");
    string s;
    const string space = "\t";

    for (int i = 1; getline(relations, s); i++) {
        if (s.empty())
            continue;
        try {
            int id_film = stoi(s.substr(0, s.find(space)));
            int id_attore = stoi(s.substr(s.find(space) + 1));
            if (A.count(id_attore) && F.count(id_film)) { // skip filtered-out movies and actors
                A[id_attore].film_indices.push_back(id_film);
                F[id_film].actor_indicies.push_back(id_attore);
            }
        } catch (...) {
            cout << "Could not read line " << i << " of the Relations file" << endl;
        }
    }
}

@@ -0,0 +1,2 @@
map<int, Actor> A; // Dictionary {actor_id (key): Actor (value)}
map<int, Film> F;  // Dictionary {film_id (key): Film (value)}

@@ -0,0 +1,9 @@
struct Film {
    string name;
    vector<int> actor_indicies;
};

struct Actor {
    string name;
    vector<int> film_indices;
};

@@ -2,17 +2,18 @@
The algorithm shown before can be applied to any dataset on which it is possible to build a graph. In this case we consider the data taken from the \emph{Internet Movie Database} (IMDB).
\subsection{Data Structure}
All the data used can be downloaded here: \url{https://datasets.imdbws.com/} \s
\noindent In particular we are interested in 4 files
\begin{itemize}
    \item \texttt{title.basics.tsv}
    \item \texttt{title.principals.tsv}
    \item \texttt{name.basics.tsv}
    \item \texttt{title.ratings.tsv}
\end{itemize}
Let's have a closer look at these 4 files:
\subsubsection*{title.basics.tsv}
\emph{Contains the following information for titles:}
\begin{itemize}
    \item \texttt{tconst} (string) - alphanumeric unique identifier of the title
@@ -26,7 +27,7 @@
    \item \texttt{genres} (string array) - includes up to three genres associated with the title
\end{itemize}
\subsubsection*{title.principals.tsv}
\emph{Contains the principal cast/crew for titles:}
\begin{itemize}
    \item \texttt{tconst} (string) - alphanumeric unique identifier of the title
@@ -37,7 +38,7 @@
    \item \texttt{characters} (string) - the name of the character played if applicable
\end{itemize}
\subsubsection*{name.basics.tsv}
\emph{Contains the following information for names:}
\begin{itemize}
    \item \texttt{nconst} (string) - alphanumeric unique identifier of the name/person
@@ -48,10 +49,20 @@
    \item \texttt{knownForTitles} (array of tconsts) - titles the person is known for
\end{itemize}
\subsubsection*{title.ratings.tsv}
\emph{Contains the following information for titles:}
\begin{itemize}
    \item \texttt{tconst} (string) - alphanumeric unique identifier of the title
    \item \texttt{averageRating} - weighted average of all the individual user ratings
    \item \texttt{numVotes} - number of votes the title has received
\end{itemize}
\newpage
\subsection{Filtering}
This is a crucial step for the algorithm in this particular case study. The raw data contains a huge amount of useless information that would just have a negative impact on performance during the computation. We are going to see in detail all the modifications made to each file. All these operations have been implemented using \texttt{python} and the \texttt{pandas} library. \s
Since we want to build two different graphs, some considerations apply only to a specific case. If nothing is said, the filtering of that file is the same for both graphs.
\subsubsection{name.basics.tsv}
@@ -62,7 +73,7 @@ For this file we only need the following columns
    \item \texttt{primaryTitle}
    \item \texttt{primaryProfession}
\end{itemize}
Since all the actor ids start with the string \texttt{nm0}, we can remove it to clean the output. Furthermore, a lot of actors/actresses do more than one job (director etc.). To avoid excluding important actors, we consider all the ones that have the string \texttt{actor/actress} in their profession. In this way, both someone classified as \texttt{actor} and someone classified as \texttt{actor, director} is taken into consideration. \s
\noindent Then we can generate the final filtered file \texttt{Attori.txt}, which has only two columns: \texttt{nconst} and \texttt{primaryName}
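
A hedged \texttt{pandas} sketch of this filtering step (column names come from the IMDb documentation above; the file handling details are assumptions, and the commit's actual script may differ):

import pandas as pd

df = pd.read_csv("name.basics.tsv.gz", sep="\t", na_values="\\N",
                 usecols=["nconst", "primaryName", "primaryProfession"])
# Keep anyone whose professions mention actor or actress, so that
# "actor,director" is kept as well as plain "actor".
df = df[df["primaryProfession"].str.contains("actor|actress", na=False)]
df["nconst"] = df["nconst"].str[2:].astype(int)   # strip the "nm0..." prefix
df[["nconst", "primaryName"]].to_csv("Attori.txt", sep="\t",
                                     index=False, header=False)
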
@@ -77,7 +88,7 @@ For this file we only need the following columns
    \item \texttt{isAdult}
    \item \texttt{titleType}
\end{itemize}
Since all the movie ids start with the string \texttt{t0}, we can remove it to clean the output. In this case, we also want to remove all the adult movies. This step can be optional if we are interested only in the closeness and harmonic centrality: even though the actors and actresses of the adult industry tend to make a lot of movies together, this won't alter the centrality results. As we know, a higher closeness centrality can be seen as the ability of a node to spread information efficiently through the network. Including the adult industry would lead to the creation of a very dense and isolated neighborhood, but none of those nodes would get a higher closeness centrality, because they only spread information within their own community. This phenomenon will be discussed more deeply in the analysis of the visualized graph. \s
\noindent We can also notice that there is a lot of \emph{junk} in IMDb. To avoid dealing with useless data, we only consider the non-adult movies in this whitelist
@@ -87,19 +98,42 @@
    \item \texttt{tvMovie}
    \item \texttt{tvMiniSeries}
\end{itemize}
The reason to consider only these categories is purely to optimize performance during the computation. On IMDb each episode is listed as a single element: to remove episodes without losing the most important relations, we only consider the category \texttt{tvSeries}, which lists a TV series as a single element, not divided into multiple episodes. In this way we lose some of the relations with minor actors that may appear in just a few episodes, but we preserve the relations between the protagonists of the show. \s
\noindent Then we can generate the final filtered file \texttt{FilmFiltrati.txt}, which has only two columns: \texttt{tconst} and \texttt{primaryTitle}
\subsubsection{title.principals.tsv}
This file is needed for the analysis of both graphs, but it is treated slightly differently for each of them. For both we only need the following columns
\begin{itemize}
    \item \texttt{tconst}
    \item \texttt{nconst}
    \item \texttt{category}
\end{itemize}
\noindent As before, we clean the output, removing unnecessary strings. \s
\textsc{Actors Graph}
\s
\noindent Using the data obtained before, we create an array of unique actor ids (\texttt{nconst}) and an array of how many times each appears (\texttt{counts}). This gives us the number of movies each actor appears in, and here comes the core of the optimization for this graph. Let's define a constant \texttt{MINMOVIES}: the minimum number of movies that an actor needs to have made in their career to be considered in this graph. The reason for doing this is purely computational: if an actor/actress has made fewer than a reasonable number of movies, there is a low probability that they play an important role in our graph during the computation of the centralities. \s
\textsc{Movies Graph} \s
\noindent For this graph we don't need any optimization on this file. We just clean the output and leave the rest as it is. \s
\nd At the end, for both graphs, we can finally generate the file \texttt{Relazioni.txt}, containing the columns \texttt{tconst} and \texttt{nconst}
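
A sketch of the \texttt{MINMOVIES} cut described above, using the unique-ids/counts approach the text mentions (illustrative \texttt{pandas}/\texttt{numpy} code; the commit's script names the constant \texttt{MIN\_MOVIES} and sets it to 70):

import numpy as np
import pandas as pd

MIN_MOVIES = 70
df = pd.read_csv("title.principals.tsv.gz", sep="\t",
                 usecols=["tconst", "nconst", "category"])
df = df[df["category"].isin(["actor", "actress"])]
# Unique actor ids and the number of titles each one appears in
ids, counts = np.unique(df["nconst"], return_counts=True)
prolific = set(ids[counts > MIN_MOVIES])
df = df[df["nconst"].isin(prolific)]          # keep only prolific actors
# df[["tconst", "nconst"]] is then written out as Relazioni.txt
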
\subsubsection{title.ratings.tsv}
This file is needed only for the analysis of the movie graph; it is not even downloaded for the analysis of the actors graph. We only need the following columns
\begin{itemize}
    \item \texttt{tconst}
    \item \texttt{numVotes}
\end{itemize}
\nd The idea behind the optimization made with this file is the same one used before with the \texttt{MINMOVIES} technique: we want to avoid computing movies that, with high probability, are not central. To do that, we consider the number of votes that each movie has received on the IMDB website and introduce a constant \texttt{VOTES}, keeping only the movies with a higher number of votes. During the analysis we will change this value to see how it affects the list of the top-$k$ most central movies. \s
\nd In this case we don't have to generate a new file; we can apply this condition directly to \texttt{FilmFiltrati.txt}
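
The corresponding \texttt{pandas} sketch, mirroring the filtering chain in this commit's script (the threshold constant is called \texttt{VOTES} and is set to the mean of \texttt{numVotes}; file names and loading details are assumptions):

import pandas as pd

df_film = pd.read_csv("title.basics.tsv.gz", sep="\t", na_values="\\N",
                      usecols=["tconst", "primaryTitle", "isAdult", "titleType"])
df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',
              inplace=True)
df_ratings = pd.read_csv("title.ratings.tsv.gz", sep="\t",
                         usecols=["tconst", "numVotes"])
df_film = pd.merge(df_film, df_ratings, "left", on="tconst")
VOTES = df_film["numVotes"].mean()            # threshold: mean number of votes
df_film.query("numVotes > @VOTES", inplace=True)
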

@@ -1,5 +1,5 @@
\section{Introduction}
A graph $G=(V,E)$ is a pair of sets, where $V = \{v_1,...,v_n\}$ is the set of \emph{nodes} and $E \subseteq V \times V, ~ E = \{(v_i,v_j),...\}$ is the set of \emph{edges} (with $|E| = m \leq n^2$). \s
In this paper we discuss the problem of identifying the most central nodes in a network using the measure of \emph{closeness centrality}. Given a connected graph, the closeness centrality of a node $v \in V$ is defined as the reciprocal of the sum of the lengths of the shortest paths between the node and all other nodes in the graph. Normalizing, we obtain the following formula:
@@ -7,20 +7,25 @@ In this paper we discuss the problem of identifying the most central nodes in a
    c(v) = \frac{n-1}{\displaystyle \sum_{w \in V} d(v,w)}
\end{equation}
where $n$ is the cardinality of $V$ and $d(v,w)$ is the distance between $v,w \in V$. This is a very powerful tool for the analysis of a network: it ranks each node, telling us which ones are most efficient in spreading information to all the other nodes in the graph. As mentioned before, the denominator of this definition gives us the lengths of the shortest paths from the node to the others. This means that, for a node to be central, the average number of links needed to reach another node has to be low. The goal of this paper is to compute the $k$ vertices with the highest closeness centrality. \s
\noindent As a case study we use the collaboration network of the \emph{Internet Movie Database} (IMDB). We are going to consider two different graphs. For the first one we define an undirected graph $G=(V,E)$ where
\begin{itemize}
    \item the vertices $V$ are the actors and the actresses
    \item the undirected edges in $E$ link the actors and the actresses if they played together in a movie
\end{itemize}
For the second one we do the opposite: we define an undirected graph $G=(V,E)$ where
\begin{itemize}
    \item the vertices $V$ are the movies
    \item the undirected edges in $E$ link two movies if they have an actor or actress in common
\end{itemize}
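
A small sketch of how both graphs can be derived from the same relation file (\texttt{Relazioni.txt}, one tab-separated movie-id/actor-id pair per line); the code below is illustrative, not the repository's:

from itertools import combinations

actors_of_movie = {}        # movie_id -> [actor_id, ...]
with open("Relazioni.txt") as rel:
    for line in rel:
        if line.strip():
            movie_id, actor_id = map(int, line.split())
            actors_of_movie.setdefault(movie_id, []).append(actor_id)

actor_edges = set()         # first graph: actors sharing a movie
movies_of_actor = {}        # actor_id -> [movie_id, ...]
for movie, actors in actors_of_movie.items():
    for a, b in combinations(sorted(actors), 2):
        actor_edges.add((a, b))
    for a in actors:
        movies_of_actor.setdefault(a, []).append(movie)

movie_edges = set()         # second graph: movies sharing an actor
for actor, titles in movies_of_actor.items():
    for f1, f2 in combinations(sorted(titles), 2):
        movie_edges.add((f1, f2))
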
\subsection{The Problem}
Since we are dealing with a web-scale network, any brute-force algorithm would require years to finish. The main difficulty here is the computation of the distance $d(v,w)$ in \eqref{closeness}. This is a well-known problem: the \emph{All Pairs Shortest Paths (APSP) problem}. \s
\noindent We can solve the APSP problem either using fast matrix multiplication or, as done in this paper, implementing a breadth-first search (BFS) method. There are several reasons to prefer the second approach over the first for this type of problem. \s
\noindent A graph is a data structure, and we can describe it in different ways; choosing one over another can have an enormous impact on performance. In this case, we need to remember the type of graph we are dealing with: a very big and sparse one. Fast matrix multiplication represents the graph as an $n\times n$ matrix whose entry $(i,j)$ is zero if the nodes $i,j$ are not linked and 1 (or a generic number, if weighted) otherwise. This method requires $O(n^2)$ space in memory, which is an enormous quantity for a web-scale graph. Furthermore, the time complexity is $O(n^{2.373} \log n)$ \texttt{[Zwick 2002; Williams 2012]}. \s
\noindent Using the BFS method, the space complexity is $O(n+m)$, which is much lower than with the previous method. In terms of time, the complexity is $O(nm)$. Unfortunately, this is not enough to compute all the distances in a reasonable time, and it has also been proven that this method cannot be improved. In this paper I propose an exact algorithm to compute the top-$k$ nodes with the highest closeness centrality.
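
A back-of-the-envelope comparison of the two representations (the node and edge counts below are hypothetical orders of magnitude, not figures from this paper):

n = 2_000_000                  # nodes (hypothetical web-scale graph)
m = 100_000_000                # edges (hypothetical)
matrix_bytes = n * n           # dense adjacency matrix, 1 byte per entry
lists_bytes = 4 * (n + 2 * m)  # adjacency lists, 4-byte ids, both endpoints
print(f"matrix: {matrix_bytes / 1e12:.0f} TB")   # ~4 TB
print(f"lists:  {lists_bytes / 1e9:.1f} GB")     # ~0.8 GB
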

Binary file not shown.

@@ -12,6 +12,8 @@
\usepackage{imakeidx}
\usepackage{algpseudocode}
\usepackage{hyperref}
\usepackage{textcomp}
\usepackage{qrcode}
\newcommand{\N}{\mathbb{N}}
\newcommand{\Z}{\mathbb{Z}}
@@ -26,6 +28,9 @@
\newcommand{\Zn}{\Z/n\Z}
\newcommand{\Zp}{\Z_p}
\newcommand{\Zmn}{\Z/mn\Z}
\newcommand{\s}{\vspace*{0.4 cm}}
\newcommand{\nd}{\noindent}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
@@ -53,9 +58,9 @@
\lstset{style=mystyle}
\title{An exact and fast algorithm for computing top-k closeness centrality}
\author{Luca Lombardo}
\date{University of Pisa - Department of Mathematics}
\begin{document}
@@ -70,6 +75,6 @@
\include{introduction.tex}
\include{algorithm.tex}
\include{data.tex}
\include{code.tex}
\end{document}

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
from itertools import combinations

from pyvis.network import Network

net = Network(height='100%', width='100%', directed=False,
              bgcolor='#1e1f29', font_color='white')

actors_to_keep = []
farness_to_keep = []
with open('data/top_actors_c.txt') as ifs:
    for line in ifs:
        if line.strip():
            actor_id, farness = line.split(maxsplit=1)
            actors_to_keep.append(int(actor_id))
            farness_to_keep.append(float(farness))

with open('data/Attori.txt') as ifs:
    for line in ifs:
        if line.strip():
            actor_id, actor_name = line.split(maxsplit=1)
            actor_id = int(actor_id)
            if actor_id in actors_to_keep:
                # farness recorded for this actor in top_actors_c.txt
                farness = farness_to_keep[actors_to_keep.index(actor_id)]
                net.add_node(actor_id, label=actor_name,
                             size=pow(5, 1.0/(farness*2)))

movies = {}  # {movie_id: [actor_id, ...]}
with open('data/Relazioni.txt') as ifs:
    for line in ifs:
        if line.strip():
            movie_id, actor_id = line.split(maxsplit=1)
            actor_id = int(actor_id)
            movie_id = int(movie_id)
            if actor_id not in net.node_ids:
                continue
            if movie_id in movies:
                movies[movie_id].append(actor_id)
            else:
                movies[movie_id] = [actor_id]

edges = set()  # set of unique tuples (actor_id, actor_id)
for movie_id, actors in movies.items():
    actors.sort()
    for actor_id_1, actor_id_2 in combinations(actors, 2):
        edges.add((actor_id_1, actor_id_2))
for actor_id_1, actor_id_2 in edges:
    net.add_edge(actor_id_1, actor_id_2)

# net.hrepulsion(node_distance=500, central_gravity=0.3, spring_length=500, spring_strength=0.05, damping=0.2)
# net.repulsion(node_distance=500, central_gravity=0.3, spring_length=200, spring_strength=0.05, damping=0.2)
# net.show_buttons()
net.set_options("""
var options = {
  "nodes": {
    "borderWidthSelected": 3
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "physics": {
    "repulsion": {
      "centralGravity": 8.95,
      "springLength": 500,
      "springConstant": 0.015,
      "nodeDistance": 600,
      "damping": 0.67
    },
    "minVelocity": 0.75,
    "solver": "repulsion"
  }
}
""")
net.show('html-files/closeness-graph.html')

@@ -1,75 +0,0 @@
#!/usr/bin/env python3
from itertools import combinations

from pyvis.network import Network

net = Network(height='100%', width='100%', directed=False,
              bgcolor='#1e1f29', font_color='white')

actors_to_keep = []
harmonic_to_keep = []
with open('data/top_actors_h.txt') as ifs:
    for line in ifs:
        if line.strip():
            actor_id, harmonic = line.split(maxsplit=1)
            actors_to_keep.append(int(actor_id))
            harmonic_to_keep.append(float(harmonic))

with open('data/Attori.txt') as ifs:
    for line in ifs:
        if line.strip():
            actor_id, actor_name = line.split(maxsplit=1)
            actor_id = int(actor_id)
            if actor_id in actors_to_keep:
                # harmonic centrality recorded for this actor in top_actors_h.txt
                harmonic = harmonic_to_keep[actors_to_keep.index(actor_id)]
                net.add_node(actor_id, label=actor_name, size=harmonic/350)

movies = {}  # {movie_id: [actor_id, ...]}
with open('data/Relazioni.txt') as ifs:
    for line in ifs:
        if line.strip():
            movie_id, actor_id = line.split(maxsplit=1)
            actor_id = int(actor_id)
            movie_id = int(movie_id)
            if actor_id not in net.node_ids:
                continue
            if movie_id in movies:
                movies[movie_id].append(actor_id)
            else:
                movies[movie_id] = [actor_id]

edges = set()  # set of unique tuples (actor_id, actor_id)
for movie_id, actors in movies.items():
    actors.sort()
    for actor_id_1, actor_id_2 in combinations(actors, 2):
        edges.add((actor_id_1, actor_id_2))
for actor_id_1, actor_id_2 in edges:
    net.add_edge(actor_id_1, actor_id_2)

# net.hrepulsion(node_distance=500, central_gravity=0.3, spring_length=500, spring_strength=0.05, damping=0.2)
# net.repulsion(node_distance=500, central_gravity=0.3, spring_length=200, spring_strength=0.05, damping=0.2)
# net.show_buttons()
net.set_options("""
var options = {
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "physics": {
    "repulsion": {
      "springLength": 1205,
      "nodeDistance": 1190
    },
    "maxVelocity": 23,
    "minVelocity": 0.75,
    "solver": "repulsion"
  }
}
""")
net.show('html-files/harmonic-graph.html')