created a new graph over the movies, reorganized folders

main
Luca Lombardo 3 years ago
parent 515a7d19eb
commit a07533193e

@ -0,0 +1,86 @@
#!/usr/bin/env python3
import gzip
import requests
import pandas as pd
import numpy as np
import os
import csv
MIN_MOVIES = 42 # Only keep relations for actors that appear in at least this many movies
#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
def colored(r, g, b, text):
return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text)
def download_url(url):
print("Downloading:", url)
file_name_start_pos = url.rfind("/") + 1
file_name = url[file_name_start_pos:]
if os.path.isfile(file_name):
print(colored(0,170,0,"Already downloaded: skipping"))
return
r = requests.get(url, stream=True)
r.raise_for_status()
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size=4096):
f.write(chunk)
return url
urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",
"https://datasets.imdbws.com/title.principals.tsv.gz",
"https://datasets.imdbws.com/title.basics.tsv.gz",
"https://datasets.imdbws.com/title.ratings.tsv.gz"]
for url in urls:
download_url(url)
os.makedirs("../data/data_actor_graph", exist_ok=True) # Generate (recursively) folders, ignores the comand if they already exists
#------------------------------FILTERING------------------------------#
print("Filtering actors...")
df_attori = pd.read_csv(
'name.basics.tsv.gz', sep='\t', compression='gzip',
usecols=['nconst', 'primaryName', 'primaryProfession'], # Consider only these columns
dtype={'primaryName': 'U', 'primaryProfession': 'U'}, # Both are read as Unicode strings
converters={'nconst': lambda x: int(x.lstrip("nm0"))}) # All actor IDs start with "nm0"; strip the prefix to keep an integer ID
df_attori.query('primaryProfession.str.contains("actor") or primaryProfession.str.contains("actress")', inplace=True)
# Many people hold more than one job (director, producer, etc.), so we keep every name whose profession list contains the string "actor" or "actress". This way someone classified as "actor" and someone classified as "actor,director" are both included.
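# (primaryProfession is a comma-separated list, e.g. "actor,producer,soundtrack", so a substring check is enough)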
print("Filtering films...")
df_film = pd.read_csv(
'title.basics.tsv.gz', sep='\t', compression='gzip',
usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'], # Consider only these columns
dtype={'primaryTitle': 'U', 'titleType': 'U'}, # Both are read as Unicode strings
converters={'tconst': lambda x: int(x.lstrip("t0")), 'isAdult': lambda x: x != "0"}) # All title IDs start with "tt"; strip the prefix to keep an integer ID, and parse the adult flag as a boolean
df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',
inplace=True) # IMDb lists many junk title types; keep only non-adult titles whose type is in this whitelist
filtered_tconsts = df_film["tconst"].to_list()
print("Filtering relations...")
df_relazioni = pd.read_csv(
'title.principals.tsv.gz', sep='\t', compression='gzip',
usecols=['tconst', 'nconst','category'], # Consider only these columns
dtype={'category': 'U'}, # Read as a Unicode string
converters={'nconst': lambda x: int(x.lstrip("nm0")), 'tconst': lambda x: int(x.lstrip("t0"))}) # Strip the ID prefixes as above
df_relazioni.query('(category == "actor" or category == "actress") and tconst in @filtered_tconsts', inplace=True)
# Returns an array of unique actor ids (nconsts) and an array of how many times they appear (counts) => the number of movies they appear in
nconsts, counts = np.unique(df_relazioni["nconst"].to_numpy(), return_counts=True)
filtered_nconsts = nconsts[counts>=MIN_MOVIES]
df_relazioni.query("nconst in @filtered_nconsts", inplace=True)
# Now keep only films and actors that have at least one relation
print("Re-filtering actors...")
nconsts_with_relations = df_relazioni["nconst"].unique()
df_attori.query("nconst in @nconsts_with_relations", inplace=True)
print("Re-filtering films...")
tconsts_with_relations = df_relazioni["tconst"].unique()
df_film.query("tconst in @tconsts_with_relations", inplace=True)
# Write the filtered files
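# Each output is a headerless, tab-separated file: Attori.txt holds "<nconst>\t<primaryName>",
# FilmFiltrati.txt holds "<tconst>\t<primaryTitle>", and Relazioni.txt holds "<tconst>\t<nconst>"
# (one actor-film edge per line), which is the format the C++ code expects to read back.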
df_attori.to_csv('../data/data_actor_graph/Attori.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['nconst', 'primaryName'], header=False, index=False)
df_film.to_csv('../data/data_actor_graph/FilmFiltrati.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'primaryTitle'], header=False, index=False)
df_relazioni.to_csv('../data/data_actor_graph/Relazioni.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'nconst'], header=False, index=False)

@ -0,0 +1,82 @@
#!/usr/bin/env python3
import gzip
import requests
import pandas as pd
import numpy as np
import os
import csv
#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
def colored(r, g, b, text):
return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text)
def download_url(url):
print("Downloading:", url)
file_name_start_pos = url.rfind("/") + 1
file_name = url[file_name_start_pos:]
if os.path.isfile(file_name):
print(colored(0,170,0,"Already downloaded: skipping"))
return
r = requests.get(url, stream=True)
r.raise_for_status()
with open(file_name, 'wb') as f:
for chunk in r.iter_content(chunk_size=4096):
f.write(chunk)
return url
urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",
"https://datasets.imdbws.com/title.principals.tsv.gz",
"https://datasets.imdbws.com/title.basics.tsv.gz",
"https://datasets.imdbws.com/title.ratings.tsv.gz"]
for url in urls:
download_url(url)
os.makedirs("../data/data_movie_graph", exist_ok=True) # Generate (recursively) folders, ignores the comand if they already exists
#------------------------------FILTERING------------------------------#
print("Filtering actors...")
df_attori = pd.read_csv(
'name.basics.tsv.gz', sep='\t', compression='gzip',
usecols=['nconst', 'primaryName', 'primaryProfession'],
dtype={'primaryName': 'U', 'primaryProfession': 'U'},
converters={'nconst': lambda x: int(x.lstrip("nm0"))})
df_attori.query('primaryProfession.str.contains("actor") or primaryProfession.str.contains("actress")', inplace=True)
print("Filtering movies...")
df_film = pd.read_csv(
'title.basics.tsv.gz', sep='\t', compression='gzip',
usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'], # Consider only these columns
dtype={'primaryTitle': 'U', 'titleType': 'U'}, # Both are read as Unicode strings
converters={'tconst': lambda x: int(x.lstrip("t0")), 'isAdult': lambda x: x != "0"}) # All title IDs start with "tt"; strip the prefix to keep an integer ID, and parse the adult flag as a boolean
df_ratings = pd.read_csv(
'title.ratings.tsv.gz', sep='\t', compression='gzip',
usecols=['tconst', 'numVotes'],
dtype={'numVotes': 'u8'}, # Unsigned integer
converters={'tconst': lambda x: int(x.lstrip("t0"))})
df_film = pd.merge(df_film, df_ratings, "left", on="tconst")
del df_ratings
df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',
inplace=True)
VOTES_MEAN = df_film['numVotes'].mean()
df_film.query('numVotes > @VOTES_MEAN', inplace=True)
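# Note: the left merge above leaves numVotes as NaN for titles with no ratings entry, and NaN fails the
# "numVotes > @VOTES_MEAN" comparison, so unrated titles are dropped here as well. Vote counts are typically
# very skewed, so keeping only above-average titles acts as a fairly aggressive popularity cut-off.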
filtered_tconsts = df_film["tconst"].to_list()
print("Filtering relations...")
df_relazioni = pd.read_csv(
'title.principals.tsv.gz', sep='\t', compression='gzip',
usecols=['tconst', 'nconst','category'], # Consider only these columns
dtype={'category': 'U'}, # Read as a Unicode string
converters={'nconst': lambda x: int(x.lstrip("nm0")), 'tconst': lambda x: int(x.lstrip("t0"))}) # Strip the ID prefixes as above
df_relazioni.query('(category == "actor" or category == "actress") and tconst in @filtered_tconsts', inplace=True)
# Write the filtered files
df_attori.to_csv('../data/data_movie_graph/Attori.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['nconst', 'primaryName'], header=False, index=False)
df_film.to_csv('../data/data_movie_graph/FilmFiltrati.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'primaryTitle'], header=False, index=False)
df_relazioni.to_csv('../data/data_movie_graph/Relazioni.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'nconst'], header=False, index=False)

@ -0,0 +1,388 @@
// g++ -Wall -pedantic -std=c++17 -Ofast -pthread kenobi.cpp -o kenobi
#include <iostream>
#include <iomanip>
#include <vector>
#include <map>
#include <string>
#include <queue>
#include <list>
#include <thread>
#include <mutex>
#include <stack>
#include <set>
#include <fstream> // getline
#include <algorithm> // find
#include <math.h> // ceil
#include <sys/time.h>
using namespace std;
struct Film {
string name;
vector<int> actor_indicies;
};
struct Actor {
string name;
vector<int> film_indices;
};
map<int, Actor> A; // Dictionary {actor_id (key): Actor (value)}
map<int, Film> F; // Dictionary {film_id (value): Film (value)}
int MAX_ACTOR_ID = -1; // Here DataRead() stores the largest actor_id loaded from Attori.txt
const int N_THREADS = 12; // Number of threads to use for some functions
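// The data form a bipartite actor-film incidence: A[i].film_indices lists the films of actor i and
// F[j].actor_indicies lists the cast of film j. The BFS routines below traverse the actor
// collaboration graph implicitly (two actors are adjacent iff they share at least one film) by
// expanding actor -> films -> co-actors at every step, so distances count collaboration hops.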
void DataRead()
{
ifstream actors("../data/data_actor_graph/Attori.txt"); // read the file
ifstream movies("../data/data_actor_graph/FilmFiltrati.txt"); // read the file
string s,t;
const string space /* the final frontier */ = "\t";
for (int i = 1; getline(actors,s); i++)
{
if (s.empty()) // skip empty lines, which can occasionally occur
continue;
try {
Actor TmpObj; // Temporary object for the actor class
int id = stoi(s.substr(0, s.find(space)));
TmpObj.name = s.substr(s.find(space)+1);
A[id] = TmpObj; // map::operator[] inserts a default-constructed entry if the key is missing
if (id > MAX_ACTOR_ID)
MAX_ACTOR_ID = id;
} catch (...) {
cout << "Could not read the line " << i << " of Actors file" << endl;
}
}
for (int i = 1; getline(movies,t); i++)
{
if (t.empty())
continue;
try{
Film TmpObj;
int id = stoi(t.substr(0, t.find(space)));
TmpObj.name = t.substr(t.find(space)+1);
F[id] = TmpObj;
} catch (...) {
cout << "Could not read the line " << i << " of Film file" << endl;
}
}
}
void BuildGraph()
{
ifstream relations("../data/data_actor_graph/Relazioni.txt"); // read the file
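// Each line of Relazioni.txt is "<tconst>\t<nconst>", as written by the Python filtering script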
string s;
const string space = "\t";
for (int i=1; getline(relations,s); i++){ // Iterate over the relations file
if (s.empty())
continue;
try {
int id_film = stoi(s.substr(0, s.find(space))); // Index of the movie
int id_attore = stoi(s.substr(s.find(space)+1)); // Index of the actor
if (A.count(id_attore) && F.count(id_film)) { // Do not consider the filtered ones
A[id_attore].film_indices.push_back(id_film);
F[id_film].actor_indicies.push_back(id_attore);
}
} catch (...) {
cout << "Could not read the line " << i << " of Releations file" << endl;
}
}
}
void PrintGraph(size_t max_n_actors = 3)
{
const size_t n = min(max_n_actors, A.size()); // There could be fewer actors than max_n_actors!
size_t i = 0;
for (const auto& [id_attore, attore] : A) {
cout << id_attore << " (" << attore.name << ")";
if (!attore.film_indices.empty()) {
cout << ":\n";
for (int id_film : attore.film_indices) {
cout << "\t- " << id_film << " (" << F[id_film].name << ")\n";
for (int id_attore_adj : F[id_film].actor_indicies)
if (id_attore_adj != id_attore)
cout << "\t\t* " << id_attore_adj << " (" << A[id_attore_adj].name << ")\n";
}
}
cout << endl;
i++; // Count how many actors have been printed
if (i >= n) // Stop once we reach n
break;
}
}
// Find a movie by title. Returns -1 if there is no match
int FindFilm(string title)
{
for (const auto& [id, film] : F)
if (film.name == title)
return id;
return -1;
}
// Find an actor by name. Returns -1 if there is no match
int FindActor(string name)
{
for (const auto& [id, actor] : A)
if (actor.name == name)
return id;
return -1;
}
vector<pair<int, double>> closeness(const size_t k) {
vector<pair<int, double>> top_actors; // Each pair is (actor_index, farness).
top_actors.reserve(k+1); // We need exactly k items, no more and no less.
vector<thread> threads;
mutex top_actors_mutex; // The threads write to top_actors, so another thread reading top_actors at the same time may find it in an invalid state (if the read happens while the other thread is still writing)
threads.reserve(N_THREADS);
for (int i = 0; i < N_THREADS; i++) {
// Launching the threads
threads.push_back(thread([&top_actors,&top_actors_mutex,&k](int start) {
vector<bool> enqueued(MAX_ACTOR_ID, false); // Vector to see which vertices with put in the queue during the BSF
// We loop over each vertex
for (int actor_id = start; actor_id <= MAX_ACTOR_ID; actor_id += N_THREADS) {
if (!A.count(actor_id)) // The actor must exist, otherwise A[actor_id] would attempt to write A, and this may produce a race condition if multiple threads do it at the same time
continue;
// We just compute the farness of our vertex using a BFS
queue<pair<int,int>> q; // FIFO of pairs (actor_index, distance from our vertex).
for (size_t i = 0; i < enqueued.size(); i++)
enqueued[i] = false;
int r = 0; // |R|, where R is the set of vertices reachable from our vertex
long long int sum_distances = 0; // Sum of the distances to other nodes
int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS
q.push(make_pair(actor_id, 0)); // This vertex, which is at distance 0
enqueued[actor_id] = true;
bool skip = false;
while (!q.empty()) {
auto [bfs_actor_id, distance] = q.front(); // Take the element at the front of the queue
q.pop();
// Try to set a lower bound on the farness
if (distance > prev_distance) {
top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
if (top_actors.size() == k) { // We are in the first item of the next exploration level
// We assume r = A.size(), the maximum possible value
double farness_lower_bound = 1.0 / ((double)A.size() - 1) * (sum_distances + q.size() * distance);
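// This is a valid lower bound: the true farness uses a reachable-set size r <= A.size() (so its
// normalization factor is at least 1/(A.size()-1)) and a distance sum of at least
// sum_distances + q.size()*distance, because every vertex still in the queue (and every
// undiscovered one) lies at distance >= the current BFS level. If even this optimistic value
// cannot beat the current k-th best farness, finishing the BFS cannot change the top-k.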
if (top_actors[k-1].second <= farness_lower_bound) { // Stop the BFS
skip = true;
top_actors_mutex.unlock(); // Release the ownership
break;
}
}
top_actors_mutex.unlock(); // Release the ownership
}
// We compute the farness of our vertex actor_id
r++;
sum_distances += distance;
// We loop on each actor on each film that bfs_actor_id played in, and add them to the queue
for (int bfs_film_id : A[bfs_actor_id].film_indices) {
for (int adj_actor_id : F[bfs_film_id].actor_indicies) {
if (!enqueued[adj_actor_id]) {
// The adjacent vertices have distance +1 with respect to the current vertex
q.push(make_pair(adj_actor_id, distance+1));
enqueued[adj_actor_id] = true;
}
}
}
}
if (skip) {
cout << actor_id << " " << A[actor_id].name << " SKIPPED" << endl;
continue;
}
// BFS is over, we compute the farness
double farness;
if (r <= 1) // Avoid computing something/0
farness = numeric_limits<double>::infinity();
else
farness = (double)(A.size()-1) / pow((double)r-1, 2) * (double)sum_distances;
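// Farness here is normalized for possibly disconnected graphs: closeness = 1/farness =
// (r-1)/sum_distances * (r-1)/(A.size()-1), i.e. the classic closeness within the reachable
// component, weighted by the fraction of the graph that is actually reachable (a Lin-style
// correction), so vertices in tiny components do not get inflated scores.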
top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
// Insert the actor in top_actors, before the first element with farness >= ours (i.e. sorted insertion)
auto index = find_if(top_actors.begin(), top_actors.end(),
[&farness](const pair<int, double>& p) { return p.second > farness; });
top_actors.insert(index, make_pair(actor_id, farness));
if (top_actors.size() > k)
top_actors.pop_back();
top_actors_mutex.unlock(); // Release the ownership (we are done with top_actors)
cout << actor_id << " " << A[actor_id].name << "\n\tCC: " << 1.0/farness << endl;
}
}, i));
}
for (auto& thread : threads)
// Waiting for all threads to finish
thread.join();
ofstream output_file("visualization/top_actors_c.txt");
for (const auto& [actor_id, farness] : top_actors) {
output_file << actor_id << "\t" << 1.0/farness << endl;
}
return top_actors;
}
vector<pair<int, double>> harmonic(const size_t k) { //
vector<pair<int, double>> top_actors; // Each pair is (actor_index, harmonic centrality).
top_actors.reserve(k+1); // We need exactly k items, no more and no less.
vector<thread> threads;
mutex top_actors_mutex; // To prevent simultaneous accesses to top_actors
threads.reserve(N_THREADS);
for (int i = 0; i < N_THREADS; i++) {
threads.push_back(thread([&top_actors,&top_actors_mutex,&k](int start) {
vector<bool> enqueued(MAX_ACTOR_ID, false); // Vector to see which vertices with put in the queue during the BSF
// We loop over each vertex
for (int actor_id = start; actor_id <= MAX_ACTOR_ID; actor_id += N_THREADS) {
if (!A.count(actor_id)) // The actor must exist, otherwise A[actor_id] would attempt to write A, and this may produce a race condition if multiple threads do it at the same time
continue;
// if |Top| ≥ k and L[v] > Farn[Top[k]] then return Top; => We can not exploit the lower bound of our vertex to stop the loop, as we are not updating lower bounds L.
// We just compute the farness of our vertex using a BFS
queue<pair<int,int>> q; // FIFO of pairs (actor_index, distance from our vertex).
for (size_t i = 0; i < enqueued.size(); i++)
enqueued[i] = false;
int r = 0; // |R|, where R is the set of vertices reachable from our vertex
double sum_reverse_distances = 0; // Sum of the reciprocal distances to the other nodes
int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS
q.push(make_pair(actor_id, 0));
enqueued[actor_id] = true;
bool skip = false;
while (!q.empty()) {
auto [bfs_actor_id, distance] = q.front();
q.pop();
// Try to set an upper bound on the centrality
if (distance > prev_distance) {
top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
if (top_actors.size() == k) { // We are in the first item of the next exploration level
double harmonic_centrality_upper_bound = sum_reverse_distances + q.size() / (double)distance + (A.size() - r - q.size()) / (double)(distance + 1);
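// This is a valid upper bound: the q.size() vertices still in the queue are at distance >= distance
// (each contributes at most 1/distance), and every vertex not yet discovered is at distance >=
// distance+1 (each contributes at most 1/(distance+1)). If even this optimistic value stays below
// the current k-th best centrality, finishing the BFS cannot change the top-k.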
if (top_actors[k-1].second >= harmonic_centrality_upper_bound) { // Stop the BFS
skip = true;
top_actors_mutex.unlock(); // Release the ownership
break;
}
}
top_actors_mutex.unlock(); // Release the ownership
}
// We accumulate the harmonic centrality of our vertex actor_id
r++;
if (distance != 0)
sum_reverse_distances += 1.0/distance;
// We loop on the adjacencies of bfs_actor_id and add them to the queue
for (int bfs_film_id : A[bfs_actor_id].film_indices) {
for (int adj_actor_id : F[bfs_film_id].actor_indicies) {
if (!enqueued[adj_actor_id]) {
// The adjacent vertices have distance +1 with respect to the current vertex
q.push(make_pair(adj_actor_id, distance+1));
enqueued[adj_actor_id] = true;
}
}
}
}
if (skip) {
cout << actor_id << " " << A[actor_id].name << " SKIPPED" << endl;
continue;
}
// BFS is over, we compute the centrality
double harmonic_centrality = sum_reverse_distances;
if (!isfinite(harmonic_centrality))
continue;
top_actors_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
// Insert the actor in top_actors, before the first element with a smaller harmonic centrality (i.e. sorted insertion, descending)
auto index = find_if(top_actors.begin(), top_actors.end(),
[&harmonic_centrality](const pair<int, double>& p) { return p.second < harmonic_centrality; });
top_actors.insert(index, make_pair(actor_id, harmonic_centrality));
if (top_actors.size() > k)
top_actors.pop_back();
cout << actor_id << " " << A[actor_id].name << "\n\tHC: " << harmonic_centrality << endl;
top_actors_mutex.unlock(); // Release the ownership
}
}, i));
}
for (auto& thread : threads)
thread.join();
ofstream output_file("visualization/top_actors_h.txt");
for (const auto& [actor_id, harmonic] : top_actors) {
output_file << actor_id << "\t" << harmonic << endl;
}
return top_actors;
}
int main()
{
srand(time(NULL));
DataRead();
BuildGraph();
cout << "Numero film: " << F.size() << endl;
cout << "Numero attori: " << A.size() << endl;
PrintGraph();
// ------------------------------------------------------------- //
// FIND FILM FUNCTION
// cout << "Cerca film: ";
// string titolo;
// getline(cin, titolo);
// int id_film = FindFilm(titolo);
// cout << id_film << "(" << F[id_film].name << ")";
// if (!F[id_film].actor_indicies.empty()) {
// cout << ":";
// for (int id_attore : F[id_film].actor_indicies)
// cout << " " << id_attore << "(" << A[id_attore].name << ")";
// }
// cout << endl;
// // FIND ACTOR FUNCTION
// cout << "Cerca attore: ";
// string attore;
// getline(cin, attore);
// int id_attore = FindActor(attore);
// cout << id_attore << "(" << A[id_attore].name << ")";
// if (!A[id_attore].film_indices.empty()) {
// cout << ":";
// for (int id_attore : A[id_attore].film_indices)
// cout << " " << id_attore << "(" << F[id_film].name << ")"; // Non worka ancora
// }
// cout << endl;
// ------------------------------------------------------------- //
cout << "Grafo, grafo delle mie brame... chi è il più centrale del reame?\n" <<endl;
const size_t k = 100;
auto top_by_closeness = closeness(k);
auto top_by_harmonic = harmonic(k);
printf("\n%36s %36s\n", "CLOSENESS CENTRALITY", "HARMONIC CENTRALITY");
for (size_t i = 0; i < k; i++) {
const auto& [closeness_actor_id, farness] = top_by_closeness[i];
const auto& [centrality_actor_id, centrality] = top_by_harmonic[i];
printf("%25s : %8lg %25s : %8lg\n",
A[closeness_actor_id].name.c_str(), 1.0/farness,
A[centrality_actor_id].name.c_str(), centrality);
}
// for (const auto& [actor_id, farness] : top_by_closeness) {
// cout << A[actor_id].name << "\n\tCloseness Centrality: " << 1.0/farness << endl;
// }
// for (const auto& [actor_id, centrality] : top_by_harmonic) {
// cout << A[actor_id].name << "\n\tHarmonic Centrality: " << centrality << endl;
// }
}

@ -0,0 +1,387 @@
// g++ -Wall -pedantic -std=c++17 -Ofast -pthread movie_graph.cpp -o movie_graph
#include <iostream>
#include <iomanip>
#include <vector>
#include <map>
#include <string>
#include <queue>
#include <list>
#include <thread>
#include <mutex>
#include <stack>
#include <set>
#include <fstream> // getline
#include <algorithm> // find
#include <math.h> // ceil
#include <sys/time.h>
using namespace std;
struct Film {
string name;
vector<int> actor_indicies;
};
struct Actor {
string name;
vector<int> film_indices;
};
map<int, Actor> A; // Dictionary {actor_id (key): Actor (value)}
map<int, Film> F; // Dictionary {film_id (key): Film (value)}
int MAX_MOVIE_ID = -1; // Here DataRead() stores the largest film_id loaded from FilmFiltrati.txt
const int N_THREADS = 12; // Number of threads to use for some functions
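// Same bipartite storage as in kenobi.cpp, but the BFS routines below traverse the dual projection:
// two movies are adjacent iff they share at least one credited actor (movie -> cast -> other movies).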
void DataRead()
{
ifstream actors("../data/data_movie_graph/Attori.txt"); // read the file
ifstream movies("../data/data_movie_graph/FilmFiltrati.txt"); // read the file
string s,t;
const string space /* the final frontier */ = "\t";
for (int i = 1; getline(actors,s); i++)
{
if (s.empty()) // skip empty lines, which can occasionally occur
continue;
try {
Actor TmpObj; // Temporary object for the actor class
int id = stoi(s.substr(0, s.find(space)));
TmpObj.name = s.substr(s.find(space)+1);
A[id] = TmpObj; // map::operator[] inserts a default-constructed entry if the key is missing
} catch (...) {
cout << "Could not read the line " << i << " of Actors file" << endl;
}
}
for (int i = 1; getline(movies,t); i++)
{
if (t.empty())
continue;
try{
Film TmpObj;
int id = stoi(t.substr(0, t.find(space)));
TmpObj.name = t.substr(t.find(space)+1);
F[id] = TmpObj;
if (id > MAX_MOVIE_ID)
MAX_MOVIE_ID = id;
} catch (...) {
cout << "Could not read the line " << i << " of Film file" << endl;
}
}
}
void BuildGraph()
{
ifstream relations("../data/data_movie_graph/Relazioni.txt"); // read the file
string s;
const string space = "\t";
for (int i=1; getline(relations,s); i++){ // Iterate over the relations file
if (s.empty())
continue;
try {
int id_film = stoi(s.substr(0, s.find(space))); // Index of the movie
int id_attore = stoi(s.substr(s.find(space)+1)); // Index of the actor
if (A.count(id_attore) && F.count(id_film)) { // Do not consider the filtered ones
A[id_attore].film_indices.push_back(id_film);
F[id_film].actor_indicies.push_back(id_attore);
}
} catch (...) {
cout << "Could not read the line " << i << " of Releations file" << endl;
}
}
}
void PrintGraph(size_t max_n_movie = 200)
{
const size_t n = min(max_n_movie, F.size()); // There could be fewer films than max_n_movie!
size_t i = 0;
for (const auto& [id_film, film] : F) {
cout << id_film << " (" << film.name << ")";
if (!film.actor_indicies.empty()) {
cout << ":\n";
for (int id_attore : film.actor_indicies) {
cout << "\t- " << id_attore << " (" << A[id_attore].name << ")\n";
}
}
cout << endl;
i++; // Count how many films have been printed
if (i >= n) // Stop once we reach n
break;
}
}
// Find a movie by title. Returns -1 if there is no match
int FindFilm(string title)
{
for (const auto& [id, film] : F)
if (film.name == title)
return id;
return -1;
}
// Find an actor by name. Returns -1 if there is no match
int FindActor(string name)
{
for (const auto& [id, actor] : A)
if (actor.name == name)
return id;
return -1;
}
vector<pair<int, double>> closeness(const size_t k) {
vector<pair<int, double>> top_movies; // Each pair is (movie_index, farness).
top_movies.reserve(k+1); // We need exactly k items, no more and no less.
vector<thread> threads;
mutex top_movies_mutex; // The threads write to top_movies, so another thread reading top_movies at the same time may find it in an invalid state (if the read happens while the other thread is still writing)
threads.reserve(N_THREADS);
for (int i = 0; i < N_THREADS; i++) {
// Launching the threads
threads.push_back(thread([&top_movies,&top_movies_mutex,&k](int start) {
vector<bool> enqueued(MAX_MOVIE_ID, false); // Vector to see which vertices with put in the queue during the BSF
// We loop over each vertex
for (int film_id = start; film_id <= MAX_MOVIE_ID; film_id += N_THREADS) {
if (!F.count(film_id)) // The movie must exist, otherwise F[film_id] would attempt to write F, and this may produce a race condition if multiple threads do it at the same time
continue;
// We just compute the farness of our vertex using a BFS
queue<pair<int,int>> q; // FIFO of pairs (film_index, distance from our vertex).
for (size_t i = 0; i < enqueued.size(); i++)
enqueued[i] = false;
int r = 0; // |R|, where R is the set of vertices reachable from our vertex
long long int sum_distances = 0; // Sum of the distances to other nodes
int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS
q.push(make_pair(film_id, 0)); // This vertex, which is at distance 0
enqueued[film_id] = true;
bool skip = false;
while (!q.empty()) {
auto [bfs_film_id, distance] = q.front(); // Take the element at the front of the queue
q.pop();
// Try to set a lower bound on the farness
if (distance > prev_distance) {
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
if (top_movies.size() == k) { // We are in the first item of the next exploration level
// We assume r = F.size(), the maximum possible value
double farness_lower_bound = 1.0 / ((double)F.size() - 1) * (sum_distances + q.size() * distance);
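// As in kenobi.cpp, this is a valid lower bound on the true farness: r is at most F.size() and all
// queued or undiscovered vertices lie at distance >= the current level, so it can only underestimate.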
//cout << "LB: \x1b[36m" << farness_lower_bound << "\x1b[0m" << endl;
if (top_movies[k-1].second <= farness_lower_bound) { // Stop the BFS
skip = true;
top_movies_mutex.unlock(); // Release the ownership
break;
}
}
top_movies_mutex.unlock(); // Release the ownership
}
// We compute the farness of our vertex film_id
r++;
sum_distances += distance;
// We loop over every film of every actor credited in bfs_film_id, and add those films to the queue
for (int bfs_actor_id : F[bfs_film_id].actor_indicies) {
for (int adj_film_id : A[bfs_actor_id].film_indices) {
if (!enqueued[adj_film_id]) {
// The adjacent vertices have distance +1 with respect to the current vertex
q.push(make_pair(adj_film_id, distance+1));
enqueued[adj_film_id] = true;
}
}
}
}
if (skip) {
cout << film_id << " " << F[film_id].name << " SKIPPED" << endl;
continue;
}
// BFS is over, we compute the farness
double farness;
if (r <= 1) // Avoid computing something/0
farness = numeric_limits<double>::infinity();
else
farness = (double)(F.size()-1) / pow((double)r-1, 2) * (double)sum_distances;
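// Same normalization as in kenobi.cpp, with F.size() as the number of vertices:
// closeness = 1/farness = (r-1)/sum_distances * (r-1)/(F.size()-1).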
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
// Insert the movie in top_movies, before the first element with farness >= ours (i.e. sorted insertion)
auto index = find_if(top_movies.begin(), top_movies.end(),
[&farness](const pair<int, double>& p) { return p.second > farness; });
top_movies.insert(index, make_pair(film_id, farness));
if (top_movies.size() > k)
top_movies.pop_back();
top_movies_mutex.unlock(); // Release the ownership (we are done with top_movies)
cout << film_id << " " << F[film_id].name << "\n\tCC: " << 1.0/farness << endl;
}
}, i));
}
for (auto& thread : threads)
// Waiting for all threads to finish
thread.join();
ofstream output_file("../visualization/movie_graph/data/top_movies_c.txt");
for (const auto& [film_id, farness] : top_movies) {
output_file << film_id << "\t" << 1.0/farness << endl;
}
return top_movies;
}
vector<pair<int, double>> harmonic(const size_t k) { //
vector<pair<int, double>> top_movies; // Each pair is (actor_index, harmonic centrality).
top_movies.reserve(k+1); // We need exactly k items, no more and no less.
vector<thread> threads;
mutex top_movies_mutex; // To prevent simultaneous accesses to top_movies
threads.reserve(N_THREADS);
for (int i = 0; i < N_THREADS; i++) {
threads.push_back(thread([&top_movies,&top_movies_mutex,&k](int start) {
vector<bool> enqueued(MAX_MOVIE_ID, false); // Vector to see which vertices with put in the queue during the BSF
// We loop over each vertex
for (int film_id = start; film_id <= MAX_MOVIE_ID; film_id += N_THREADS) {
if (!F.count(film_id)) // The movie must exist, otherwise F[film_id] would attempt to write F, and this may produce a race condition if multiple threads do it at the same time
continue;
// if |Top| ≥ k and L[v] > Farn[Top[k]] then return Top; => We can not exploit the lower bound of our vertex to stop the loop, as we are not updating lower bounds L.
// We just compute the farness of our vertex using a BFS
queue<pair<int,int>> q; // FIFO of pairs (actor_index, distance from our vertex).
for (size_t i = 0; i < enqueued.size(); i++)
enqueued[i] = false;
int r = 0; // |R|, where R is the set of vertices reachable from our vertex
double sum_reverse_distances = 0; // Sum of the reciprocal distances to the other nodes
int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS
q.push(make_pair(film_id, 0));
enqueued[film_id] = true;
bool skip = false;
while (!q.empty()) {
auto [bfs_film_id, distance] = q.front();
q.pop();
// Try to set an upper bound on the centrality
if (distance > prev_distance) {
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
if (top_movies.size() == k) { // We are in the first item of the next exploration level
double harmonic_centrality_upper_bound = sum_reverse_distances + q.size() / (double)distance + (F.size() - r - q.size()) / (double)(distance + 1);
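// Same reasoning as in kenobi.cpp: queued vertices contribute at most 1/distance each and
// undiscovered ones at most 1/(distance+1), so this value can only overestimate the true centrality.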
if (top_movies[k-1].second >= harmonic_centrality_upper_bound) { // Stop the BFS
skip = true;
top_movies_mutex.unlock(); // Release the ownership
break;
}
}
top_movies_mutex.unlock(); // Release the ownership
}
// We accumulate the harmonic centrality of our vertex film_id
r++;
if (distance != 0)
sum_reverse_distances += 1.0/distance;
// We loop on the adjacencies of bfs_film_id and add them to the queue
for (int bfs_actor_id : F[bfs_film_id].actor_indicies) {
for (int adj_film_id : A[bfs_actor_id].film_indices) {
if (!enqueued[adj_film_id]) {
// The adjacent vertices have distance +1 with respect to the current vertex
q.push(make_pair(adj_film_id, distance+1));
enqueued[adj_film_id] = true;
}
}
}
}
if (skip) {
cout << film_id << " " << F[film_id].name << " SKIPPED" << endl;
continue;
}
// BFS is over, we compute the centrality
double harmonic_centrality = sum_reverse_distances;
if (!isfinite(harmonic_centrality))
continue;
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
// Insert the movie in top_movies, before the first element with a smaller harmonic centrality (i.e. sorted insertion, descending)
auto index = find_if(top_movies.begin(), top_movies.end(),
[&harmonic_centrality](const pair<int, double>& p) { return p.second < harmonic_centrality; });
top_movies.insert(index, make_pair(film_id, harmonic_centrality));
if (top_movies.size() > k)
top_movies.pop_back();
cout << film_id << " " << F[film_id].name << "\n\tHC: " << harmonic_centrality << endl;
top_movies_mutex.unlock(); // Release the ownership
}
}, i));
}
for (auto& thread : threads)
thread.join();
ofstream output_file("../visualization/movie_graph/data/top_movies_h.txt");
for (const auto& [film_id, harmonic] : top_movies) {
output_file << film_id << "\t" << harmonic << endl;
}
return top_movies;
}
int main()
{
srand(time(NULL));
DataRead();
BuildGraph();
cout << "Numero film: " << F.size() << endl;
cout << "Numero attori: " << A.size() << endl;
PrintGraph();
// ------------------------------------------------------------- //
// FIND FILM FUNCTION
// cout << "Cerca film: ";
// string titolo;
// getline(cin, titolo);
// int id_film = FindFilm(titolo);
// cout << id_film << "(" << F[id_film].name << ")";
// if (!F[id_film].actor_indicies.empty()) {
// cout << ":";
// for (int id_attore : F[id_film].actor_indicies)
// cout << " " << id_attore << "(" << A[id_attore].name << ")";
// }
// cout << endl;
// // FIND ACTOR FUNCTION
// cout << "Cerca attore: ";
// string attore;
// getline(cin, attore);
// int id_attore = FindActor(attore);
// cout << id_attore << "(" << A[id_attore].name << ")";
// if (!A[id_attore].film_indices.empty()) {
// cout << ":";
// for (int id_attore : A[id_attore].film_indices)
// cout << " " << id_attore << "(" << F[id_film].name << ")"; // Non worka ancora
// }
// cout << endl;
// ------------------------------------------------------------- //
cout << "Grafo, grafo delle mie brame... chi è il più centrale del reame?\n" <<endl;
const size_t k = 100;
auto top_by_closeness = closeness(k);
auto top_by_harmonic = harmonic(k);
printf("\n%36s %36s\n", "CLOSENESS CENTRALITY", "HARMONIC CENTRALITY");
for (size_t i = 0; i < k; i++) {
const auto& [closeness_film_id, farness] = top_by_closeness[i];
const auto& [centrality_film_id, centrality] = top_by_harmonic[i];
printf("%25s : %8lg %25s : %8lg\n",
F[closeness_film_id].name.c_str(), 1.0/farness,
F[centrality_film_id].name.c_str(), centrality);
}
// for (const auto& [actor_id, farness] : top_by_closeness) {
// cout << A[actor_id].name << "\n\tCloseness Centrality: " << 1.0/farness << endl;
// }
// for (const auto& [actor_id, centrality] : top_by_harmonic) {
// cout << A[actor_id].name << "\n\tHarmonic Centrality: " << centrality << endl;
// }
}