#!/usr/bin/env python3
"""Prepare the input files of the actors graph from the public IMDb dumps.

The script downloads the IMDb ``.tsv.gz`` dumps into the current directory
(dumps that are already present are skipped), keeps the non-adult titles of a
few whitelisted types and the performers credited in at least ``MIN_MOVIES``
of them, and writes three tab-separated files into
``../data/data_actor_graph``:

* ``Attori.txt``        -- actor id, actor name
* ``FilmFiltrati.txt``  -- title id, title
* ``Relazioni.txt``     -- title id, actor id
"""
import gzip
import requests
import pandas as pd
import numpy as np
import os
import csv

MIN_MOVIES = 42  # Keep only actors credited in at least this many of the filtered titles
DOWNLOAD_TIMEOUT = 60  # Seconds to wait on the server before giving up on a download
ALLOWED_TITLE_TYPES = ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]
OUTPUT_DIR = "../data/data_actor_graph"
# Options shared by the three output files: tab-separated, unquoted, no header, no index.
CSV_OPTIONS = dict(sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', header=False, index=False)


#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
def colored(r, g, b, text):
    """Wrap *text* in a 24-bit ANSI colour escape, then switch the colour to white."""
    return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text)


def download_url(url):
    """Download *url* into the current directory, unless the file is already there.

    The data is streamed into ``<name>.part`` and renamed only once the
    transfer has completed, so an interrupted run never leaves behind a
    truncated file that a later run would mistake for a finished download.

    Returns the URL when a download took place, ``None`` when it was skipped.
    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    print("Downloading:", url)
    file_name = url[url.rfind("/") + 1:]
    if os.path.isfile(file_name):
        print(colored(0, 170, 0, "Already downloaded: skipping"))
        return None

    partial_name = file_name + ".part"
    try:
        # The timeout keeps a stalled connection from hanging the script forever.
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            response.raise_for_status()
            with open(partial_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=4096):
                    f.write(chunk)
        os.replace(partial_name, file_name)
    finally:
        # After a successful rename the partial file is gone and this is a no-op.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    return url


def numeric_id(raw_id):
    """Return the numeric part of an IMDb id such as ``nm0000123`` or ``tt0000123``.

    The two-letter prefix is sliced off explicitly: ``str.lstrip`` takes a
    *set* of characters, not a prefix, so it only worked by accident.
    """
    return int(raw_id[2:])


urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",
        "https://datasets.imdbws.com/title.principals.tsv.gz",
        "https://datasets.imdbws.com/title.basics.tsv.gz",
        "https://datasets.imdbws.com/title.ratings.tsv.gz"]

for url in urls:
    download_url(url)

os.makedirs(OUTPUT_DIR, exist_ok=True)  # Creates missing parent folders; does nothing if it already exists

#------------------------------FILTERING------------------------------#

print("Filtering actors...")
df_attori = pd.read_csv(
    'name.basics.tsv.gz', sep='\t', compression='gzip',
    usecols=['nconst', 'primaryName', 'primaryProfession'],  # Only these columns are needed
    dtype={'primaryName': 'U', 'primaryProfession': 'U'},  # 'U' = Unicode string
    converters={'nconst': numeric_id})
# Many people have more than one profession ("actor,director", ...): keep everyone
# whose profession list mentions "actor" or "actress". Rows with no profession are dropped.
is_performer = df_attori['primaryProfession'].str.contains('actor|actress', na=False)
df_attori = df_attori[is_performer]

print("Filtering films...")
df_film = pd.read_csv(
    'title.basics.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'],  # Only these columns are needed
    dtype={'primaryTitle': 'U', 'titleType': 'U'},  # 'U' = Unicode string
    converters={'tconst': numeric_id, 'isAdult': lambda flag: flag != "0"})
# IMDb lists many kinds of titles: keep the non-adult ones whose type is in the whitelist.
is_wanted_title = ~df_film['isAdult'].astype(bool) & df_film['titleType'].isin(ALLOWED_TITLE_TYPES)
df_film = df_film[is_wanted_title]

print("Filtering relations...")
df_relazioni = pd.read_csv(
    'title.principals.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'nconst', 'category'],  # Only these columns are needed
    dtype={'category': 'U'},  # 'U' = Unicode string
    converters={'nconst': numeric_id, 'tconst': numeric_id})
is_acting_credit = df_relazioni['category'].isin(['actor', 'actress'])
is_kept_title = df_relazioni['tconst'].isin(df_film['tconst'])
df_relazioni = df_relazioni[is_acting_credit & is_kept_title]
# Unique actor ids and how many credits each one has among the kept titles.
nconsts, counts = np.unique(df_relazioni["nconst"].to_numpy(), return_counts=True)
filtered_nconsts = nconsts[counts >= MIN_MOVIES]
df_relazioni = df_relazioni[df_relazioni['nconst'].isin(filtered_nconsts)]

# Now keep only the films and actors that have at least one relation left.
print("Re-filtering actors...")
df_attori = df_attori[df_attori['nconst'].isin(df_relazioni['nconst'].unique())]
print("Re-filtering films...")
df_film = df_film[df_film['tconst'].isin(df_relazioni['tconst'].unique())]

# Write the filtered files
df_attori.to_csv(f'{OUTPUT_DIR}/Attori.txt', columns=['nconst', 'primaryName'], **CSV_OPTIONS)

df_film.to_csv(f'{OUTPUT_DIR}/FilmFiltrati.txt', columns=['tconst', 'primaryTitle'], **CSV_OPTIONS)

df_relazioni.to_csv(f'{OUTPUT_DIR}/Relazioni.txt', columns=['tconst', 'nconst'], **CSV_OPTIONS)
#!/usr/bin/env python3
"""Prepare the input files of the movie graph from the public IMDb dumps.

The script downloads the IMDb ``.tsv.gz`` dumps into the current directory
(dumps that are already present are skipped), keeps the non-adult titles of a
few whitelisted types whose number of votes is above the mean, and writes
three tab-separated files into ``../data/data_movie_graph``:

* ``Attori.txt``        -- actor id, actor name
* ``FilmFiltrati.txt``  -- title id, title
* ``Relazioni.txt``     -- title id, actor id
"""
import gzip
import requests
import pandas as pd
import numpy as np
import os
import csv

DOWNLOAD_TIMEOUT = 60  # Seconds to wait on the server before giving up on a download
ALLOWED_TITLE_TYPES = ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]
OUTPUT_DIR = "../data/data_movie_graph"
# Options shared by the three output files: tab-separated, unquoted, no header, no index.
CSV_OPTIONS = dict(sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', header=False, index=False)


#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
def colored(r, g, b, text):
    """Wrap *text* in a 24-bit ANSI colour escape, then switch the colour to white."""
    return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text)


def download_url(url):
    """Download *url* into the current directory, unless the file is already there.

    The data is streamed into ``<name>.part`` and renamed only once the
    transfer has completed, so an interrupted run never leaves behind a
    truncated file that a later run would mistake for a finished download.

    Returns the URL when a download took place, ``None`` when it was skipped.
    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    print("Downloading:", url)
    file_name = url[url.rfind("/") + 1:]
    if os.path.isfile(file_name):
        print(colored(0, 170, 0, "Already downloaded: skipping"))
        return None

    partial_name = file_name + ".part"
    try:
        # The timeout keeps a stalled connection from hanging the script forever.
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            response.raise_for_status()
            with open(partial_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=4096):
                    f.write(chunk)
        os.replace(partial_name, file_name)
    finally:
        # After a successful rename the partial file is gone and this is a no-op.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    return url


def numeric_id(raw_id):
    """Return the numeric part of an IMDb id such as ``nm0000123`` or ``tt0000123``.

    The two-letter prefix is sliced off explicitly: ``str.lstrip`` takes a
    *set* of characters, not a prefix, so it only worked by accident.
    """
    return int(raw_id[2:])


urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",
        "https://datasets.imdbws.com/title.principals.tsv.gz",
        "https://datasets.imdbws.com/title.basics.tsv.gz",
        "https://datasets.imdbws.com/title.ratings.tsv.gz"]

for url in urls:
    download_url(url)

os.makedirs(OUTPUT_DIR, exist_ok=True)  # Creates missing parent folders; does nothing if it already exists

#------------------------------FILTERING------------------------------#

print("Filtering actors...")
df_attori = pd.read_csv(
    'name.basics.tsv.gz', sep='\t', compression='gzip',
    usecols=['nconst', 'primaryName', 'primaryProfession'],
    dtype={'primaryName': 'U', 'primaryProfession': 'U'},  # 'U' = Unicode string
    converters={'nconst': numeric_id})
# Keep everyone whose profession list mentions "actor" or "actress";
# rows with no profession are dropped.
is_performer = df_attori['primaryProfession'].str.contains('actor|actress', na=False)
df_attori = df_attori[is_performer]


print("Filtering movies...")
df_film = pd.read_csv(
    'title.basics.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'],  # Only these columns are needed
    dtype={'primaryTitle': 'U', 'titleType': 'U'},  # 'U' = Unicode string
    converters={'tconst': numeric_id, 'isAdult': lambda flag: flag != "0"})
df_ratings = pd.read_csv(
    'title.ratings.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'numVotes'],
    dtype={'numVotes': 'u8'},  # Unsigned 64-bit integer
    converters={'tconst': numeric_id})
# Left join: titles without a rating keep a missing numVotes.
df_film = pd.merge(df_film, df_ratings, "left", on="tconst")
del df_ratings
is_wanted_title = ~df_film['isAdult'].astype(bool) & df_film['titleType'].isin(ALLOWED_TITLE_TYPES)
df_film = df_film[is_wanted_title]
# Keep only the titles with more votes than the mean of the remaining ones;
# titles without a vote count compare as False and are dropped too.
VOTES_MEAN = df_film['numVotes'].mean()
df_film = df_film[df_film['numVotes'] > VOTES_MEAN]

print("Filtering relations...")
df_relazioni = pd.read_csv(
    'title.principals.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'nconst', 'category'],  # Only these columns are needed
    dtype={'category': 'U'},  # 'U' = Unicode string
    converters={'nconst': numeric_id, 'tconst': numeric_id})
is_acting_credit = df_relazioni['category'].isin(['actor', 'actress'])
is_kept_title = df_relazioni['tconst'].isin(df_film['tconst'])
df_relazioni = df_relazioni[is_acting_credit & is_kept_title]


# Write the filtered files
df_attori.to_csv(f'{OUTPUT_DIR}/Attori.txt', columns=['nconst', 'primaryName'], **CSV_OPTIONS)

df_film.to_csv(f'{OUTPUT_DIR}/FilmFiltrati.txt', columns=['tconst', 'primaryTitle'], **CSV_OPTIONS)

df_relazioni.to_csv(f'{OUTPUT_DIR}/Relazioni.txt', columns=['tconst', 'nconst'], **CSV_OPTIONS)
// g++ -Wall -pedantic -std=c++17 -Ofast -pthread kenobi.cpp -o kenobi
// NOTE(review): the header names of the original #include lines were lost in this copy of
// the file; the list below is the set required by the code that is visible here.
#include <algorithm> // find_if, min
#include <cmath>     // pow, isfinite
#include <cstdlib>   // srand
#include <ctime>     // time
#include <fstream>   // ifstream, ofstream
#include <iostream>
#include <limits>
#include <map>
#include <mutex>
#include <queue>
#include <string>    // getline, stoi
#include <thread>
#include <utility>   // pair
#include <vector>

using namespace std;

// NOTE(review): the template arguments below were stripped from this copy and have been
// reconstructed from how the members are used (ids are the ints parsed with stoi).
struct Film {
    string name;
    vector<int> actor_indicies; // ids of the actors credited in this film
};

struct Actor {
    string name;
    vector<int> film_indices; // ids of the films this actor is credited in
};

map<int, Actor> A; // Dictionary {actor_id (key): Actor (value)}
map<int, Film> F; // Dictionary {film_id (key): Film (value)}
int MAX_ACTOR_ID = -1; // Here DataRead() puts the largest actor_id loaded from Attori.txt

const int N_THREADS = 12; // Number of threads to use for some functions

// Load actors and films from the tab-separated files written by the filter script.
// Each line is "<id>\t<name>". Fills A and F and updates MAX_ACTOR_ID.
// Lines that cannot be parsed are reported on stdout and skipped.
void DataRead()
{
    ifstream actors("../data/data_actor_graph/Attori.txt");
    ifstream movies("../data/data_actor_graph/FilmFiltrati.txt");
    // A missing file used to produce an empty graph without any message.
    if (!actors)
        cerr << "Could not open ../data/data_actor_graph/Attori.txt" << endl;
    if (!movies)
        cerr << "Could not open ../data/data_actor_graph/FilmFiltrati.txt" << endl;

    string s,t;
    const string space /* the final frontier */ = "\t";

    for (int i = 1; getline(actors,s); i++)
    {
        if (s.empty()) // skip empty lines, they can happen
            continue;
        try {
            Actor TmpObj; // Temporary object for the actor
            int id = stoi(s.substr(0, s.find(space)));
            TmpObj.name = s.substr(s.find(space)+1);
            A[id] = TmpObj; // Inserts the actor, or overwrites a previous one with the same id
            if (id > MAX_ACTOR_ID)
                MAX_ACTOR_ID = id;
        } catch (...) {
            cout << "Could not read the line " << i << " of Actors file" << endl;
        }
    }

    for (int i = 1; getline(movies,t); i++)
    {
        if (t.empty())
            continue;

        try {
            Film TmpObj;
            int id = stoi(t.substr(0, t.find(space)));
            TmpObj.name = t.substr(t.find(space)+1);
            F[id] = TmpObj;
        } catch (...) {
            cout << "Could not read the line " << i << " of Film file" << endl;
        }
    }
}

// Read the "<film_id>\t<actor_id>" relations and link actors and films both ways.
// Relations that mention an actor or a film that was not loaded are ignored.
void BuildGraph()
{
    // Same folder DataRead() loads from, which is where the filter script writes Relazioni.txt
    // (the old "data/Relazioni.txt" pointed somewhere else).
    ifstream relations("../data/data_actor_graph/Relazioni.txt");
    if (!relations)
        cerr << "Could not open ../data/data_actor_graph/Relazioni.txt" << endl;
    string s;
    const string space = "\t";

    for (int i=1; getline(relations,s); i++){ // Loop over the relations
        if (s.empty())
            continue;
        try {
            int id_film = stoi(s.substr(0, s.find(space))); // Index of the movie
            int id_attore = stoi(s.substr(s.find(space)+1)); // Index of the actor
            if (A.count(id_attore) && F.count(id_film)) { // Do not consider the filtered ones
                A[id_attore].film_indices.push_back(id_film);
                F[id_film].actor_indicies.push_back(id_attore);
            }
        } catch (...) {
            cout << "Could not read the line " << i << " of Releations file" << endl;
        }
    }
}

// Print the first max_n_actors actors, each with its films and, for every film, the co-stars.
void PrintGraph(size_t max_n_actors = 3)
{
    const size_t n = min(max_n_actors, A.size()); // There could be fewer actors than max_n_actors
    size_t i = 0;
    for (const auto& [id_attore, attore] : A) {
        if (i >= n) // Also covers an empty request (n == 0)
            break;
        cout << id_attore << " (" << attore.name << ")";
        if (!attore.film_indices.empty()) {
            cout << ":\n";
            for (int id_film : attore.film_indices) {
                // at() instead of operator[]: a lookup must never insert while A is being iterated
                const Film& film = F.at(id_film);
                cout << "\t- " << id_film << " (" << film.name << ")\n";
                for (int id_attore_adj : film.actor_indicies)
                    if (id_attore_adj != id_attore)
                        cout << "\t\t* " << id_attore_adj << " (" << A.at(id_attore_adj).name << ")\n";
            }
        }
        cout << endl;

        i++; // Count how many have been printed
    }
}

// Find a movie by the title. Gives -1 if there is no match
int FindFilm(string title)
{
    for (const auto& [id, film] : F)
        if (film.name == title)
            return id;
    return -1;
}

// Find an actor by the name. Gives -1 if there is no match
int FindActor(string name)
{
    for (const auto& [id, actor] : A)
        if (actor.name == name)
            return id;
    return -1;
}
+ top_actors.reserve(k+1); // We need exactly k items, no more and no less. + + vector threads; + mutex top_actors_mutex; // The threads write to top_actors, so another thread reading top_actors at the same time may find it in an invalid state (if the read happens while the other thread is still writing) + threads.reserve(N_THREADS); + for (int i = 0; i < N_THREADS; i++) { + // Launching the threads + threads.push_back(thread([&top_actors,&top_actors_mutex,&k](int start) { + vector enqueued(MAX_ACTOR_ID, false); // Vector to see which vertices with put in the queue during the BSF + // We loop over each vertex + for (int actor_id = start; actor_id <= MAX_ACTOR_ID; actor_id += N_THREADS) { + if (!A.count(actor_id)) // The actor must exist, otherwise A[actor_id] would attempt to write A, and this may produce a race condition if multiple threads do it at the same time + continue; + + // We just compute the farness of our vertex using a BFS + queue> q; // FIFO of pairs (actor_index, distance from our vertex). 
+ for (size_t i = 0; i < enqueued.size(); i++) + enqueued[i] = false; + int r = 0; // |R|, where R is the set of vertices reachable from our vertex + long long int sum_distances = 0; // Sum of the distances to other nodes + int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS + q.push(make_pair(actor_id, 0)); // This vertex, which is at distance 0 + enqueued[actor_id] = true; + bool skip = false; + while (!q.empty()) { + auto [bfs_actor_id, distance] = q.front(); // Prendo l'elemento in cima alla coda + q.pop(); + // Try to set a lower bound on the farness + if (distance > prev_distance) { + top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + if (top_actors.size() == k) { // We are in the first item of the next exploration level + // We assume r = A.size(), the maximum possible value + double farness_lower_bound = 1.0 / ((double)A.size() - 1) * (sum_distances + q.size() * distance); + if (top_actors[k-1].second <= farness_lower_bound) { // Stop the BFS + skip = true; + top_actors_mutex.unlock(); // Release the ownership + break; + } + } + top_actors_mutex.unlock(); // Release the ownership + } + // We compute the farness of our vertex actor_id + r++; + sum_distances += distance; + // We loop on each actor on each film that bfs_actor_id played in, and add them to the queue + for (int bfs_film_id : A[bfs_actor_id].film_indices) { + for (int adj_actor_id : F[bfs_film_id].actor_indicies) { + if (!enqueued[adj_actor_id]) { + // The adjacent vertices have distance +1 with respect to the current vertex + q.push(make_pair(adj_actor_id, distance+1)); + enqueued[adj_actor_id] = true; + } + } + } + } + if (skip) { + cout << actor_id << " " << A[actor_id].name << " SKIPPED" << endl; + continue; + } + // BFS is over, we compute the farness + double farness; + if (r <= 1) // Avoid computing something/0 + farness = numeric_limits::infinity(); + else + farness = (double)(A.size()-1) / 
pow((double)r-1, 2) * (double)sum_distances; + + top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + // Insert the actor in top_actors, before the first element with farness >= than our actor's (i.e. sorted insertion) + auto index = find_if(top_actors.begin(), top_actors.end(), + [&farness](const pair& p) { return p.second > farness; }); + top_actors.insert(index, make_pair(actor_id, farness)); + if (top_actors.size() > k) + top_actors.pop_back(); + top_actors_mutex.unlock(); // Release the ownerhsip (we are done with top_actors) + + cout << actor_id << " " << A[actor_id].name << "\n\tCC: " << 1.0/farness << endl; + // top_actors_lock gets destroyed after this line, releasing the mutex + } + }, i)); + } + + for (auto& thread : threads) + // Waiting for all threads to finish + thread.join(); + + ofstream output_file("visualization/top_actors_c.txt"); + for (const auto& [actor_id, farness] : top_actors) { + output_file << actor_id << "\t" << 1.0/farness << endl; + } + + return top_actors; + +} + +vector> harmonic(const size_t k) { // + + vector> top_actors; // Each pair is (actor_index, harmonic centrality). + top_actors.reserve(k+1); // We need exactly k items, no more and no less. 
+ + vector threads; + mutex top_actors_mutex; // To prevent simultaneous accesses to top_actors + threads.reserve(N_THREADS); + for (int i = 0; i < N_THREADS; i++) { + threads.push_back(thread([&top_actors,&top_actors_mutex,&k](int start) { + vector enqueued(MAX_ACTOR_ID, false); // Vector to see which vertices with put in the queue during the BSF + // We loop over each vertex + for (int actor_id = start; actor_id <= MAX_ACTOR_ID; actor_id += N_THREADS) { + if (!A.count(actor_id)) // The actor must exist, otherwise A[actor_id] would attempt to write A, and this may produce a race condition if multiple threads do it at the same time + continue; + // if |Top| ≥ k and L[v] > Farn[Top[k]] then return Top; => We can not exploit the lower bound of our vertex to stop the loop, as we are not updating lower bounds L. + // We just compute the farness of our vertex using a BFS + queue> q; // FIFO of pairs (actor_index, distance from our vertex). + for (size_t i = 0; i < enqueued.size(); i++) + enqueued[i] = false; + int r = 0; // |R|, where R is the set of vertices reachable from our vertex + double sum_reverse_distances = 0; // Sum of the distances to other nodes + int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS + q.push(make_pair(actor_id, 0)); + enqueued[actor_id] = true; + bool skip = false; + while (!q.empty()) { + auto [bfs_actor_id, distance] = q.front(); + q.pop(); + // Try to set an upper bound on the centrality + if (distance > prev_distance) { + top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + if (top_actors.size() == k) { // We are in the first item of the next exploration level + double harmonic_centrality_upper_bound = sum_reverse_distances + q.size() / (double)distance + (A.size() - r - q.size()) / (double)(distance + 1); + if (top_actors[k-1].second >= harmonic_centrality_upper_bound) { // Stop the BFS + skip = true; + top_actors_mutex.unlock(); // Release the 
ownership + break; + } + } + top_actors_mutex.unlock(); // Release the ownership + } + // We compute the farness of our vertex actor_id + r++; + if (distance != 0) + sum_reverse_distances += 1.0/distance; + // We loop on the adjacencies of bfs_actor_id and add them to the queue + for (int bfs_film_id : A[bfs_actor_id].film_indices) { + for (int adj_actor_id : F[bfs_film_id].actor_indicies) { + if (!enqueued[adj_actor_id]) { + // The adjacent vertices have distance +1 with respect to the current vertex + q.push(make_pair(adj_actor_id, distance+1)); + enqueued[adj_actor_id] = true; + } + } + } + } + if (skip) { + cout << actor_id << " " << A[actor_id].name << " SKIPPED" << endl; + continue; + } + // BFS is over, we compute the centrality + double harmonic_centrality = sum_reverse_distances; + if (!isfinite(harmonic_centrality)) + continue; + + top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + // Insert the actor in top_actors, before the first element with farness >= than our actor's (i.e. 
sorted insertion) + auto index = find_if(top_actors.begin(), top_actors.end(), + [&harmonic_centrality](const pair& p) { return p.second < harmonic_centrality; }); + top_actors.insert(index, make_pair(actor_id, harmonic_centrality)); + if (top_actors.size() > k) + top_actors.pop_back(); + cout << actor_id << " " << A[actor_id].name << "\n\tHC: " << harmonic_centrality << endl; + top_actors_mutex.unlock(); // Release the ownership + } + }, i)); + } + + for (auto& thread : threads) + thread.join(); + + ofstream output_file("visualization/top_actors_h.txt"); + for (const auto& [actor_id, harmonic] : top_actors) { + output_file << actor_id << "\t" << harmonic << endl; + } + + + return top_actors; +} + + +int main() +{ + srand(time(NULL)); + + DataRead(); + BuildGraph(); + cout << "Numero film: " << F.size() << endl; + cout << "Numero attori: " << A.size() << endl; + PrintGraph(); + + // ------------------------------------------------------------- // + + // FUNZIONE CERCA FILM + + // cout << "Cerca film: "; + // string titolo; + // getline(cin, titolo); + // int id_film = FindFilm(titolo); + // cout << id_film << "(" << F[id_film].name << ")"; + // if (!F[id_film].actor_indicies.empty()) { + // cout << ":"; + // for (int id_attore : F[id_film].actor_indicies) + // cout << " " << id_attore << "(" << A[id_attore].name << ")"; + // } + // cout << endl; + + // // FUNZIONE CERCA ATTORE + + // cout << "Cerca attore: "; + // string attore; + // getline(cin, attore); + // int id_attore = FindActor(attore); + // cout << id_attore << "(" << A[id_attore].name << ")"; + // if (!A[id_attore].film_indices.empty()) { + // cout << ":"; + // for (int id_attore : A[id_attore].film_indices) + // cout << " " << id_attore << "(" << F[id_film].name << ")"; // Non worka ancora + // } + // cout << endl; + + // ------------------------------------------------------------- // + + cout << "Grafo, grafo delle mie brame... 
// NOTE(review): the header names of the original #include lines were lost in this copy of
// the file; the list below is the set required by the code that is visible here.
#include <algorithm> // find_if, min
#include <cmath>     // pow, isfinite
#include <cstdlib>   // srand
#include <ctime>     // time
#include <fstream>   // ifstream, ofstream
#include <iostream>
#include <limits>
#include <map>
#include <mutex>
#include <queue>
#include <string>    // getline, stoi
#include <thread>
#include <utility>   // pair
#include <vector>

using namespace std;

// NOTE(review): the template arguments below were stripped from this copy and have been
// reconstructed from how the members are used (ids are the ints parsed with stoi).
struct Film {
    string name;
    vector<int> actor_indicies; // ids of the actors credited in this film
};

struct Actor {
    string name;
    vector<int> film_indices; // ids of the films this actor is credited in
};

map<int, Actor> A; // Dictionary {actor_id (key): Actor (value)}
map<int, Film> F; // Dictionary {film_id (key): Film (value)}
int MAX_MOVIE_ID = -1; // Here DataRead() puts the largest film_id loaded from FilmFiltrati.txt

const int N_THREADS = 12; // Number of threads to use for some functions

// Load actors and films from the tab-separated files written by the filter script.
// Each line is "<id>\t<name>". Fills A and F and updates MAX_MOVIE_ID.
// Lines that cannot be parsed are reported on stdout and skipped.
void DataRead()
{
    ifstream actors("../data/data_movie_graph/Attori.txt");
    ifstream movies("../data/data_movie_graph/FilmFiltrati.txt");
    // A missing file used to produce an empty graph without any message.
    if (!actors)
        cerr << "Could not open ../data/data_movie_graph/Attori.txt" << endl;
    if (!movies)
        cerr << "Could not open ../data/data_movie_graph/FilmFiltrati.txt" << endl;

    string s,t;
    const string space /* the final frontier */ = "\t";

    for (int i = 1; getline(actors,s); i++)
    {
        if (s.empty()) // skip empty lines, they can happen
            continue;
        try {
            Actor TmpObj; // Temporary object for the actor
            int id = stoi(s.substr(0, s.find(space)));
            TmpObj.name = s.substr(s.find(space)+1);
            A[id] = TmpObj; // Inserts the actor, or overwrites a previous one with the same id

        } catch (...) {
            cout << "Could not read the line " << i << " of Actors file" << endl;
        }
    }

    for (int i = 1; getline(movies,t); i++)
    {
        if (t.empty())
            continue;

        try {
            Film TmpObj;
            int id = stoi(t.substr(0, t.find(space)));
            TmpObj.name = t.substr(t.find(space)+1);
            F[id] = TmpObj;
            if (id > MAX_MOVIE_ID)
                MAX_MOVIE_ID = id;
        } catch (...) {
            cout << "Could not read the line " << i << " of Film file" << endl;
        }
    }
}

// Read the "<film_id>\t<actor_id>" relations and link actors and films both ways.
// Relations that mention an actor or a film that was not loaded are ignored.
void BuildGraph()
{
    // Same folder DataRead() loads from, which is where the filter script writes Relazioni.txt
    // (the old "data/Relazioni.txt" pointed somewhere else).
    ifstream relations("../data/data_movie_graph/Relazioni.txt");
    if (!relations)
        cerr << "Could not open ../data/data_movie_graph/Relazioni.txt" << endl;
    string s;
    const string space = "\t";

    for (int i=1; getline(relations,s); i++){ // Loop over the relations
        if (s.empty())
            continue;
        try {
            int id_film = stoi(s.substr(0, s.find(space))); // Index of the movie
            int id_attore = stoi(s.substr(s.find(space)+1)); // Index of the actor
            if (A.count(id_attore) && F.count(id_film)) { // Do not consider the filtered ones
                A[id_attore].film_indices.push_back(id_film);
                F[id_film].actor_indicies.push_back(id_attore);
            }
        } catch (...) {
            cout << "Could not read the line " << i << " of Releations file" << endl;
        }
    }
}

// Print the first max_n_movie films, each followed by the actors credited in it.
void PrintGraph(size_t max_n_movie = 200)
{
    const size_t n = min(max_n_movie, F.size()); // There could be fewer films than max_n_movie
    size_t i = 0;
    for (const auto& [id_film, film] : F) {
        if (i >= n) // Also covers an empty request (n == 0)
            break;
        cout << id_film << " (" << film.name << ")";
        if (!film.actor_indicies.empty()) {
            cout << ":\n";
            for (int id_attore : film.actor_indicies) {
                // at() instead of operator[]: a lookup must never insert into the map
                cout << "\t- " << id_attore << " (" << A.at(id_attore).name << ")\n";
            }
        }
        cout << endl;

        i++; // Count how many have been printed
    }
}

// Find a movie by the title. Gives -1 if there is no match
int FindFilm(string title)
{
    for (const auto& [id, film] : F)
        if (film.name == title)
            return id;
    return -1;
}

// Find an actor by the name. Gives -1 if there is no match
int FindActor(string name)
{
    for (const auto& [id, actor] : A)
        if (actor.name == name)
            return id;
    return -1;
}
+ + vector threads; + mutex top_movies_mutex; // The threads write to top_movies, so another thread reading top_movies at the same time may find it in an invalid state (if the read happens while the other thread is still writing) + threads.reserve(N_THREADS); + for (int i = 0; i < N_THREADS; i++) { + // Launching the threads + threads.push_back(thread([&top_movies,&top_movies_mutex,&k](int start) { + vector enqueued(MAX_MOVIE_ID, false); // Vector to see which vertices with put in the queue during the BSF + // We loop over each vertex + for (int film_id = start; film_id <= MAX_MOVIE_ID; film_id += N_THREADS) { + if (!F.count(film_id)) // The movie must exist, otherwise F[film_id] would attempt to write F, and this may produce a race condition if multiple threads do it at the same time + continue; + + // We just compute the farness of our vertex using a BFS + queue> q; // FIFO of pairs (film_index, distance from our vertex). + for (size_t i = 0; i < enqueued.size(); i++) + enqueued[i] = false; + int r = 0; // |R|, where R is the set of vertices reachable from our vertex + long long int sum_distances = 0; // Sum of the distances to other nodes + int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS + q.push(make_pair(film_id, 0)); // This vertex, which is at distance 0 + enqueued[film_id] = true; + bool skip = false; + while (!q.empty()) { + auto [bfs_film_id, distance] = q.front(); // Prendo l'elemento in cima alla coda + q.pop(); + // Try to set a lower bound on the farness + if (distance > prev_distance) { + top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + if (top_movies.size() == k) { // We are in the first item of the next exploration level + // We assume r = A.size(), the maximum possible value + double farness_lower_bound = 1.0 / ((double)F.size() - 1) * (sum_distances + q.size() * distance); + //cout << "LB: \x1b[36m" << farness_lower_bound << "\x1b[0m" << endl; + if 
(top_movies[k-1].second <= farness_lower_bound) { // Stop the BFS + skip = true; + top_movies_mutex.unlock(); // Release the ownership + break; + } + } + top_movies_mutex.unlock(); // Release the ownership + } + // We compute the farness of our vertex actor_id + r++; + sum_distances += distance; + // We loop on each actor on each film that bfs_actor_id played in, and add them to the queue + for (int bfs_actor_id : F[bfs_film_id].actor_indicies) { + for (int adj_film_id : A[bfs_actor_id].film_indices) { + if (!enqueued[adj_film_id]) { + // The adjacent vertices have distance +1 with respect to the current vertex + q.push(make_pair(adj_film_id, distance+1)); + enqueued[adj_film_id] = true; + } + } + } + } + if (skip) { + cout << film_id << " " << F[film_id].name << " SKIPPED" << endl; + continue; + } + // BFS is over, we compute the farness + double farness; + if (r <= 1) // Avoid computing something/0 + farness = numeric_limits::infinity(); + else + farness = (double)(F.size()-1) / pow((double)r-1, 2) * (double)sum_distances; + + top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + // Insert the actor in top_movies, before the first element with farness >= than our actor's (i.e. 
sorted insertion) + auto index = find_if(top_movies.begin(), top_movies.end(), + [&farness](const pair& p) { return p.second > farness; }); + top_movies.insert(index, make_pair(film_id, farness)); + if (top_movies.size() > k) + top_movies.pop_back(); + top_movies_mutex.unlock(); // Release the ownerhsip (we are done with top_movies) + + cout << film_id << " " << F[film_id].name << "\n\t\x1b[36m"CC: " << 1.0/farness "<< endl; + // top_actors_lock gets destroyed after this line, releasing the mutex + } + }, i)); + } + + for (auto& thread : threads) + // Waiting for all threads to finish + thread.join(); + + ofstream output_file("../visualization/movie_graph/data/top_movies_c.txt"); + for (const auto& [film_id, farness] : top_movies) { + output_file << film_id << "\t" << 1.0/farness << endl; + } + + return top_movies; + +} + +vector> harmonic(const size_t k) { // + + vector> top_movies; // Each pair is (actor_index, harmonic centrality). + top_movies.reserve(k+1); // We need exactly k items, no more and no less. + + vector threads; + mutex top_movies_mutex; // To prevent simultaneous accesses to top_movies + threads.reserve(N_THREADS); + for (int i = 0; i < N_THREADS; i++) { + threads.push_back(thread([&top_movies,&top_movies_mutex,&k](int start) { + vector enqueued(MAX_MOVIE_ID, false); // Vector to see which vertices with put in the queue during the BSF + // We loop over each vertex + for (int film_id = start; film_id <= MAX_MOVIE_ID; film_id += N_THREADS) { + if (!F.count(film_id)) // The actor must exist, otherwise A[actor_id] would attempt to write A, and this may produce a race condition if multiple threads do it at the same time + continue; + // if |Top| ≥ k and L[v] > Farn[Top[k]] then return Top; => We can not exploit the lower bound of our vertex to stop the loop, as we are not updating lower bounds L. + // We just compute the farness of our vertex using a BFS + queue> q; // FIFO of pairs (actor_index, distance from our vertex). 
+ for (size_t i = 0; i < enqueued.size(); i++) + enqueued[i] = false; + int r = 0; // |R|, where R is the set of vertices reachable from our vertex + double sum_reverse_distances = 0; // Sum of the distances to other nodes + int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS + q.push(make_pair(film_id, 0)); + enqueued[film_id] = true; + bool skip = false; + while (!q.empty()) { + auto [bfs_film_id, distance] = q.front(); + q.pop(); + // Try to set an upper bound on the centrality + if (distance > prev_distance) { + top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + if (top_movies.size() == k) { // We are in the first item of the next exploration level + double harmonic_centrality_upper_bound = sum_reverse_distances + q.size() / (double)distance + (F.size() - r - q.size()) / (double)(distance + 1); + if (top_movies[k-1].second >= harmonic_centrality_upper_bound) { // Stop the BFS + skip = true; + top_movies_mutex.unlock(); // Release the ownership + break; + } + } + top_movies_mutex.unlock(); // Release the ownership + } + // We compute the farness of our vertex actor_id + r++; + if (distance != 0) + sum_reverse_distances += 1.0/distance; + // We loop on the adjacencies of bfs_actor_id and add them to the queue + for (int bfs_actor_id : F[bfs_film_id].actor_indicies) { + for (int adj_film_id : A[bfs_actor_id].film_indices) { + if (!enqueued[adj_film_id]) { + // The adjacent vertices have distance +1 with respect to the current vertex + q.push(make_pair(adj_film_id, distance+1)); + enqueued[adj_film_id] = true; + } + } + } + } + if (skip) { + cout << film_id << " " << F[film_id].name << " SKIPPED" << endl; + continue; + } + // BFS is over, we compute the centrality + double harmonic_centrality = sum_reverse_distances; + if (!isfinite(harmonic_centrality)) + continue; + + top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it + // 
Insert the actor in top_movies, before the first element with farness >= than our actor's (i.e. sorted insertion) + auto index = find_if(top_movies.begin(), top_movies.end(), + [&harmonic_centrality](const pair& p) { return p.second < harmonic_centrality; }); + top_movies.insert(index, make_pair(film_id, harmonic_centrality)); + if (top_movies.size() > k) + top_movies.pop_back(); + cout << film_id << " " << F[film_id].name << "\n\tHC: " << harmonic_centrality << endl; + top_movies_mutex.unlock(); // Release the ownership + } + }, i)); + } + + for (auto& thread : threads) + thread.join(); + + ofstream output_file("../visualization/movie_graph/data/top_movies_h.txt"); + for (const auto& [film_id, harmonic] : top_movies) { + output_file << film_id << "\t" << harmonic << endl; + } + + + return top_movies; +} + + +int main() +{ + srand(time(NULL)); + + DataRead(); + BuildGraph(); + cout << "Numero film: " << F.size() << endl; + cout << "Numero attori: " << A.size() << endl; + PrintGraph(); + + // ------------------------------------------------------------- // + + // FUNZIONE CERCA FILM + + // cout << "Cerca film: "; + // string titolo; + // getline(cin, titolo); + // int id_film = FindFilm(titolo); + // cout << id_film << "(" << F[id_film].name << ")"; + // if (!F[id_film].actor_indicies.empty()) { + // cout << ":"; + // for (int id_attore : F[id_film].actor_indicies) + // cout << " " << id_attore << "(" << A[id_attore].name << ")"; + // } + // cout << endl; + + // // FUNZIONE CERCA ATTORE + + // cout << "Cerca attore: "; + // string attore; + // getline(cin, attore); + // int id_attore = FindActor(attore); + // cout << id_attore << "(" << A[id_attore].name << ")"; + // if (!A[id_attore].film_indices.empty()) { + // cout << ":"; + // for (int id_attore : A[id_attore].film_indices) + // cout << " " << id_attore << "(" << F[id_film].name << ")"; // Non worka ancora + // } + // cout << endl; + + // ------------------------------------------------------------- // + + cout 
<< "Grafo, grafo delle mie brame... chi è il più centrale del reame?\n" <