You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

388 lines
17 KiB
C++

// g++ -Wall -pedantic -std=c++17 -Ofast -pthread movie_graph.cpp -o movie_graph
#include <math.h> // ceil
#include <sys/time.h>

#include <algorithm> // find, find_if, upper_bound, fill, min
#include <cstdio> // printf
#include <cstdlib> // srand
#include <ctime> // time
#include <fstream> // getline
#include <iomanip>
#include <iostream>
#include <limits> // numeric_limits
#include <list>
#include <map>
#include <mutex>
#include <queue>
#include <set>
#include <stack>
#include <string>
#include <thread>
#include <utility> // pair, move
#include <vector>
using namespace std;
// A film vertex of the graph: its title plus the ids of the actors linked to it.
struct Film {
    std::string name{};                 // Text that follows the id on the film's line of the films file
    std::vector<int> actor_indicies{};  // Actor ids (keys of A), appended by BuildGraph()
};
// An actor of the graph: the name plus the ids of the films linked to the actor.
struct Actor {
    std::string name{};               // Text that follows the id on the actor's line of the actors file
    std::vector<int> film_indices{};  // Film ids (keys of F), appended by BuildGraph()
};
// Global graph storage: filled by DataRead() and BuildGraph(), then read by the other functions.
map<int, Actor> A; // Dictionary {actor_id (key): Actor (value)}
map<int, Film> F; // Dictionary {film_id (key): Film (value)}
int MAX_MOVIE_ID = -1; // DataRead() stores here the largest film_id it loaded from the films file; stays -1 when no film is loaded
const int N_THREADS = 12; // Number of worker threads launched by closeness() and by harmonic()
void DataRead()
{
ifstream actors("../data/data_movie_graph/Attori.txt"); // read the file
ifstream movies("../data/data_movie_graph/FilmFiltrati.txt"); // read the file
string s,t;
const string space /* the final frontier */ = "\t";
for (int i = 1; getline(actors,s); i++)
{
if (s.empty()) // jumps empty lines, sometimes can happen
continue;
try {
Actor TmpObj; // Temporary object for the actor class
int id = stoi(s.substr(0, s.find(space)));
TmpObj.name = s.substr(s.find(space)+1);
A[id] = TmpObj; // Matlab/Python notation, works since C++17
} catch (...) {
cout << "Could not read the line " << i << " of Actors file" << endl;
}
}
for (int i = 1; getline(movies,t); i++)
{
if (t.empty())
continue;
try{
Film TmpObj;
int id = stoi(t.substr(0, t.find(space)));
TmpObj.name = t.substr(t.find(space)+1);
F[id] = TmpObj;
if (id > MAX_MOVIE_ID)
MAX_MOVIE_ID = id;
} catch (...) {
cout << "Could not read the line " << i << " of Film file" << endl;
}
}
}
// Reads the "<film_id>\t<actor_id>" relations file and links films and actors:
// for every pair whose film is in F and whose actor is in A, the film id is appended
// to the actor's film_indices and the actor id to the film's actor_indicies.
// Empty lines are skipped; malformed lines are reported on stdout and skipped.
void BuildGraph()
{
    // NOTE(review): DataRead() opens its inputs under "../data/data_movie_graph/", while this
    // path is relative to a different directory — confirm that this is intended.
    const char* const relations_path = "data/Relazioni.txt";
    ifstream relations(relations_path);
    if (!relations)
        cerr << "Could not open " << relations_path << endl;

    string line;
    for (int i = 1; getline(relations, line); i++) { // Walk through the relations, one per line
        if (line.empty())
            continue;

        const size_t tab = line.find('\t');
        int id_film = 0;  // Id of the film
        int id_actor = 0; // Id of the actor
        bool parsed = (tab != string::npos); // Without a separator both ids would be parsed from the same text
        if (parsed) {
            try {
                id_film = stoi(line.substr(0, tab));
                id_actor = stoi(line.substr(tab + 1));
            } catch (...) { // stoi throws when the text is not a number or does not fit in an int
                parsed = false;
            }
        }
        if (!parsed) {
            cout << "Could not read the line " << i << " of Releations file" << endl;
            continue;
        }

        // Relations that mention a film or an actor that was not loaded are ignored
        const auto actor_it = A.find(id_actor);
        const auto film_it = F.find(id_film);
        if (actor_it == A.end() || film_it == F.end())
            continue;
        actor_it->second.film_indices.push_back(id_film);
        film_it->second.actor_indicies.push_back(id_actor);
    }
}
// Prints the first films of F (in increasing id order) to stdout, each followed
// by the list of its actors, if it has any.
// max_n_movie: maximum number of films to print; 0 prints nothing.
void PrintGraph(size_t max_n_movie = 200)
{
    size_t printed = 0; // How many films have been printed so far
    for (const auto& [id_film, film] : F) {
        if (printed >= max_n_movie) // Checked before printing, so that a limit of 0 really prints nothing
            break;
        cout << id_film << " (" << film.name << ")";
        if (!film.actor_indicies.empty()) {
            cout << ":\n";
            for (const int id_actor : film.actor_indicies) {
                // find() instead of A[...]: printing must never insert entries into A
                const auto actor_it = A.find(id_actor);
                cout << "\t- " << id_actor << " ("
                     << (actor_it != A.end() ? actor_it->second.name : string()) << ")\n";
            }
        }
        cout << endl;
        ++printed;
    }
}
// Find a movie by the title. Gives -1 if there is no match
int FindFilm(string title)
{
for (const auto& [id, film] : F)
if (film.name == title)
return id;
return -1;
}
// Find an actor by the name. Gives -1 if there is no match
int FindActor(string name)
{
for (const auto& [id, actor] : A)
if (actor.name == name)
return id;
return -1;
}
vector<pair<int, double>> closeness(const size_t k) {
vector<pair<int, double>> top_movies; // Each pair is (movie_index, farness).
top_movies.reserve(k+1); // We need exactly k items, no more and no less.
vector<thread> threads;
mutex top_movies_mutex; // The threads write to top_movies, so another thread reading top_movies at the same time may find it in an invalid state (if the read happens while the other thread is still writing)
threads.reserve(N_THREADS);
for (int i = 0; i < N_THREADS; i++) {
// Launching the threads
threads.push_back(thread([&top_movies,&top_movies_mutex,&k](int start) {
vector<bool> enqueued(MAX_MOVIE_ID, false); // Vector to see which vertices with put in the queue during the BSF
// We loop over each vertex
for (int film_id = start; film_id <= MAX_MOVIE_ID; film_id += N_THREADS) {
if (!F.count(film_id)) // The movie must exist, otherwise F[film_id] would attempt to write F, and this may produce a race condition if multiple threads do it at the same time
continue;
// We just compute the farness of our vertex using a BFS
queue<pair<int,int>> q; // FIFO of pairs (film_index, distance from our vertex).
for (size_t i = 0; i < enqueued.size(); i++)
enqueued[i] = false;
int r = 0; // |R|, where R is the set of vertices reachable from our vertex
long long int sum_distances = 0; // Sum of the distances to other nodes
int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS
q.push(make_pair(film_id, 0)); // This vertex, which is at distance 0
enqueued[film_id] = true;
bool skip = false;
while (!q.empty()) {
auto [bfs_film_id, distance] = q.front(); // Prendo l'elemento in cima alla coda
q.pop();
// Try to set a lower bound on the farness
if (distance > prev_distance) {
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
if (top_movies.size() == k) { // We are in the first item of the next exploration level
// We assume r = A.size(), the maximum possible value
double farness_lower_bound = 1.0 / ((double)F.size() - 1) * (sum_distances + q.size() * distance);
//cout << "LB: \x1b[36m" << farness_lower_bound << "\x1b[0m" << endl;
if (top_movies[k-1].second <= farness_lower_bound) { // Stop the BFS
skip = true;
top_movies_mutex.unlock(); // Release the ownership
break;
}
}
top_movies_mutex.unlock(); // Release the ownership
}
// We compute the farness of our vertex actor_id
r++;
sum_distances += distance;
// We loop on each actor on each film that bfs_actor_id played in, and add them to the queue
for (int bfs_actor_id : F[bfs_film_id].actor_indicies) {
for (int adj_film_id : A[bfs_actor_id].film_indices) {
if (!enqueued[adj_film_id]) {
// The adjacent vertices have distance +1 with respect to the current vertex
q.push(make_pair(adj_film_id, distance+1));
enqueued[adj_film_id] = true;
}
}
}
}
if (skip) {
cout << film_id << " " << F[film_id].name << " SKIPPED" << endl;
continue;
}
// BFS is over, we compute the farness
double farness;
if (r <= 1) // Avoid computing something/0
farness = numeric_limits<double>::infinity();
else
farness = (double)(F.size()-1) / pow((double)r-1, 2) * (double)sum_distances;
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
// Insert the actor in top_movies, before the first element with farness >= than our actor's (i.e. sorted insertion)
auto index = find_if(top_movies.begin(), top_movies.end(),
[&farness](const pair<int, double>& p) { return p.second > farness; });
top_movies.insert(index, make_pair(film_id, farness));
if (top_movies.size() > k)
top_movies.pop_back();
top_movies_mutex.unlock(); // Release the ownerhsip (we are done with top_movies)
cout << film_id << " " << F[film_id].name << "\n\t\x1b[36m"CC: " << 1.0/farness "<< endl;
// top_actors_lock gets destroyed after this line, releasing the mutex
}
}, i));
}
for (auto& thread : threads)
// Waiting for all threads to finish
thread.join();
ofstream output_file("../visualization/movie_graph/data/top_movies_c.txt");
for (const auto& [film_id, farness] : top_movies) {
output_file << film_id << "\t" << 1.0/farness << endl;
}
return top_movies;
}
vector<pair<int, double>> harmonic(const size_t k) { //
vector<pair<int, double>> top_movies; // Each pair is (actor_index, harmonic centrality).
top_movies.reserve(k+1); // We need exactly k items, no more and no less.
vector<thread> threads;
mutex top_movies_mutex; // To prevent simultaneous accesses to top_movies
threads.reserve(N_THREADS);
for (int i = 0; i < N_THREADS; i++) {
threads.push_back(thread([&top_movies,&top_movies_mutex,&k](int start) {
vector<bool> enqueued(MAX_MOVIE_ID, false); // Vector to see which vertices with put in the queue during the BSF
// We loop over each vertex
for (int film_id = start; film_id <= MAX_MOVIE_ID; film_id += N_THREADS) {
if (!F.count(film_id)) // The actor must exist, otherwise A[actor_id] would attempt to write A, and this may produce a race condition if multiple threads do it at the same time
continue;
// if |Top| ≥ k and L[v] > Farn[Top[k]] then return Top; => We can not exploit the lower bound of our vertex to stop the loop, as we are not updating lower bounds L.
// We just compute the farness of our vertex using a BFS
queue<pair<int,int>> q; // FIFO of pairs (actor_index, distance from our vertex).
for (size_t i = 0; i < enqueued.size(); i++)
enqueued[i] = false;
int r = 0; // |R|, where R is the set of vertices reachable from our vertex
double sum_reverse_distances = 0; // Sum of the distances to other nodes
int prev_distance = 0; // Previous distance, to see when we get to a deeper level of the BFS
q.push(make_pair(film_id, 0));
enqueued[film_id] = true;
bool skip = false;
while (!q.empty()) {
auto [bfs_film_id, distance] = q.front();
q.pop();
// Try to set an upper bound on the centrality
if (distance > prev_distance) {
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
if (top_movies.size() == k) { // We are in the first item of the next exploration level
double harmonic_centrality_upper_bound = sum_reverse_distances + q.size() / (double)distance + (F.size() - r - q.size()) / (double)(distance + 1);
if (top_movies[k-1].second >= harmonic_centrality_upper_bound) { // Stop the BFS
skip = true;
top_movies_mutex.unlock(); // Release the ownership
break;
}
}
top_movies_mutex.unlock(); // Release the ownership
}
// We compute the farness of our vertex actor_id
r++;
if (distance != 0)
sum_reverse_distances += 1.0/distance;
// We loop on the adjacencies of bfs_actor_id and add them to the queue
for (int bfs_actor_id : F[bfs_film_id].actor_indicies) {
for (int adj_film_id : A[bfs_actor_id].film_indices) {
if (!enqueued[adj_film_id]) {
// The adjacent vertices have distance +1 with respect to the current vertex
q.push(make_pair(adj_film_id, distance+1));
enqueued[adj_film_id] = true;
}
}
}
}
if (skip) {
cout << film_id << " " << F[film_id].name << " SKIPPED" << endl;
continue;
}
// BFS is over, we compute the centrality
double harmonic_centrality = sum_reverse_distances;
if (!isfinite(harmonic_centrality))
continue;
top_movies_mutex.lock(); // Acquire ownership of the mutex, wait if another thread already owns it
// Insert the actor in top_movies, before the first element with farness >= than our actor's (i.e. sorted insertion)
auto index = find_if(top_movies.begin(), top_movies.end(),
[&harmonic_centrality](const pair<int, double>& p) { return p.second < harmonic_centrality; });
top_movies.insert(index, make_pair(film_id, harmonic_centrality));
if (top_movies.size() > k)
top_movies.pop_back();
cout << film_id << " " << F[film_id].name << "\n\tHC: " << harmonic_centrality << endl;
top_movies_mutex.unlock(); // Release the ownership
}
}, i));
}
for (auto& thread : threads)
thread.join();
ofstream output_file("../visualization/movie_graph/data/top_movies_h.txt");
for (const auto& [film_id, harmonic] : top_movies) {
output_file << film_id << "\t" << harmonic << endl;
}
return top_movies;
}
int main()
{
srand(time(NULL));
DataRead();
BuildGraph();
cout << "Numero film: " << F.size() << endl;
cout << "Numero attori: " << A.size() << endl;
PrintGraph();
// ------------------------------------------------------------- //
// FUNZIONE CERCA FILM
// cout << "Cerca film: ";
// string titolo;
// getline(cin, titolo);
// int id_film = FindFilm(titolo);
// cout << id_film << "(" << F[id_film].name << ")";
// if (!F[id_film].actor_indicies.empty()) {
// cout << ":";
// for (int id_attore : F[id_film].actor_indicies)
// cout << " " << id_attore << "(" << A[id_attore].name << ")";
// }
// cout << endl;
// // FUNZIONE CERCA ATTORE
// cout << "Cerca attore: ";
// string attore;
// getline(cin, attore);
// int id_attore = FindActor(attore);
// cout << id_attore << "(" << A[id_attore].name << ")";
// if (!A[id_attore].film_indices.empty()) {
// cout << ":";
// for (int id_attore : A[id_attore].film_indices)
// cout << " " << id_attore << "(" << F[id_film].name << ")"; // Non worka ancora
// }
// cout << endl;
// ------------------------------------------------------------- //
cout << "Grafo, grafo delle mie brame... chi è il più centrale del reame?\n" <<endl;
const size_t k = 100;
auto top_by_closeness = closeness(k);
auto top_by_harmonic = harmonic(k);
printf("\n%36s %36s\n", "CLOSENESS CENTRALITY", "HARMONIC CENTRALITY");
for (size_t i = 0; i < k; i++) {
const auto& [closeness_film_id, farness] = top_by_closeness[i];
const auto& [centrality_film_id, centrality] = top_by_harmonic[i];
printf("%25s : %8lg %25s : %8lg\n",
F[closeness_film_id].name.c_str(), 1.0/farness,
F[centrality_film_id].name.c_str(), centrality);
}
// for (const auto& [actor_id, farness] : top_by_closeness) {
// cout << A[actor_id].name << "\n\tCloseness Centrality: " << 1.0/farness << endl;
// }
// for (const auto& [actor_id, centrality] : top_by_harmonic) {
// cout << A[actor_id].name << "\n\tHarmonic Centrality: " << centrality << endl;
// }
}