arg parse for movie filter

main
Luca Lombardo 3 years ago
parent 332e365fca
commit c3cc085163

@ -1,4 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse
import gzip import gzip
import requests import requests
import pandas as pd import pandas as pd
@ -7,6 +8,12 @@ import os
import csv import csv
#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------# #-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
parser = argparse.ArgumentParser()
parser.add_argument("--votes", type=int, required=True)
args = parser.parse_args()
def colored(r, g, b, text): def colored(r, g, b, text):
return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text) return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text)
@ -33,7 +40,7 @@ urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",
for url in urls: for url in urls:
download_url(url) download_url(url)
os.makedirs("../data/data_movie_graph", exist_ok=True) # Generate (recursively) folders, ignores the comand if they already exists os.makedirs("../data/data_movie_graph_args", exist_ok=True) # Generate (recursively) folders, ignores the comand if they already exists
#------------------------------FILTERING------------------------------# #------------------------------FILTERING------------------------------#
@ -61,7 +68,7 @@ df_film = pd.merge(df_film, df_ratings, "left", on="tconst")
del df_ratings del df_ratings
df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]', df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',
inplace=True) inplace=True)
VOTES = df_film['numVotes'].mean() VOTES = args.votes
df_film.query('numVotes > @VOTES', inplace=True) df_film.query('numVotes > @VOTES', inplace=True)
filtered_tconsts = df_film["tconst"].to_list() filtered_tconsts = df_film["tconst"].to_list()

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
cd "$(dirname "$(realpath "$0")")" cd "$(dirname "$(realpath "$0")")"
for minmovies in 10 20 30 40 for minmovies in 5 10 20 30 40 50 60 70
do do
echo "##### STARTING FILTERING FOR MIN_MOVIES=$minmovies #####" echo "##### STARTING FILTERING FOR MIN_MOVIES=$minmovies #####"
cd ../../filters cd ../../filters

@ -33,6 +33,8 @@ int MAX_MOVIE_ID = -1; // Here DataRead() puts the larges actor_id loaded from A
const int N_THREADS = 12; // Number of threads to use for some functions const int N_THREADS = 12; // Number of threads to use for some functions
string outputFn;
void DataRead() void DataRead()
{ {
ifstream actors("../data/data_movie_graph/Attori.txt"); // read the file ifstream actors("../data/data_movie_graph/Attori.txt"); // read the file
@ -223,7 +225,8 @@ vector<pair<int, double>> closeness(const size_t k) {
// Waiting for all threads to finish // Waiting for all threads to finish
thread.join(); thread.join();
ofstream output_file("../visualization/movie_graph/data/top_movies_c.txt");
ofstream output_file(outputFn + "_c.txt");
for (const auto& [film_id, farness] : top_movies) { for (const auto& [film_id, farness] : top_movies) {
output_file << film_id << "\t" << F[film_id].name << "\t" << 1.0/farness << endl; output_file << film_id << "\t" << F[film_id].name << "\t" << 1.0/farness << endl;
} }
@ -314,18 +317,24 @@ vector<pair<int, double>> harmonic(const size_t k) { //
for (auto& thread : threads) for (auto& thread : threads)
thread.join(); thread.join();
ofstream output_file("../visualization/movie_graph/data/top_movies_h.txt"); ofstream output_file(outputFn + "_h.txt");
for (const auto& [film_id, harmonic] : top_movies) { for (const auto& [film_id, harmonic] : top_movies) {
output_file << film_id << "\t" << F[film_id].name << "\t" << harmonic << endl; output_file << film_id << "\t" << F[film_id].name << "\t" << harmonic << endl;
} }
return top_movies; return top_movies;
} }
int main() int main(int argc, char* argv[])
{ {
if (argc != 2) {
cout << "Usage: " << argv[0] << " OUTPUT_FILE_NAME" << endl;
exit(1);
}
outputFn = argv[1];
srand(time(NULL)); srand(time(NULL));
DataRead(); DataRead();

14
tex/analysis.tex vendored

@ -9,18 +9,18 @@ The first one will tell us how much more efficient the algorithm is in terms of
\nd The platform for the tests is \emph{a laptop}, so the measurements cannot be considered precise due to factors such as thermal throttling. The CPU is an Intel(R) Core™ i7-8750H (6 cores, 12 threads), equipped with 16GB of DDR4 @2666 MHz RAM. \nd The platform for the tests is \emph{a laptop}, so the measurements cannot be considered precise due to factors such as thermal throttling. The CPU is an Intel(R) Core™ i7-8750H (6 cores, 12 threads), equipped with 16GB of DDR4 @2666 MHz RAM.
\subsection{Actors graph} \subsection{Actors graph}
Let's analyze the graph where each actor is a node and two nodes are linked if they played in a movie together. In this case, during the filtering, we created the variable \texttt{MINMOVIES}. This variable is the minimum number of movies that an actor/actress must have appeared in to be considered in the computation. Let's analyze the graph where each actor is a node and two nodes are linked if they played in a movie together. In this case, during the filtering, we created the variable \texttt{MIN\textunderscore ACTORS}. This variable is the minimum number of movies that an actor/actress must have appeared in to be considered in the computation.
Varying this variable obviously affects the algorithm in different ways. The higher this variable is, the fewer actors we take into consideration. So, with a smaller graph, we expect better results in terms of execution time. On the other hand, we can also expect less accurate results. What we are going to discuss is how much changing \texttt{MINMOVIES} affects these two factors. Varying this variable obviously affects the algorithm in different ways. The higher this variable is, the fewer actors we take into consideration. So, with a smaller graph, we expect better results in terms of execution time. On the other hand, we can also expect less accurate results. What we are going to discuss is how much changing \texttt{MIN\textunderscore ACTORS} affects these two factors.
\subsubsection{Time of execution} \subsubsection{Time of execution}
TO DO TO DO
\subsubsection{Discrepancy of the results} \subsubsection{Discrepancy of the results}
We want to analyze how truthful our results are while varying MINMOVIES. The methodology is simple: for each pair of results (lists) we take the intersection of the two. This will return the number of elements in common. Knowing the length of the lists, we can find the number of elements not in common. \s We want to analyze how truthful our results are while varying \texttt{MIN\textunderscore ACTORS}. The methodology is simple: for each pair of results (lists) we take the intersection of the two. This will return the number of elements in common. Knowing the length of the lists, we can find the number of elements not in common. \s
\nd A way to see these results is with a square matrix $n \times n, ~ A = (a_{ij})$, where $n$ is the number of different values that we gave to \texttt{MINMOVIES} during the testing. In this way the $(i,j)$ position is the percentage of discrepancy between the results with \texttt{MINMOVIES} set as $i$ and $j$. \s \nd A way to see these results is with a square matrix $n \times n, ~ A = (a_{ij})$, where $n$ is the number of different values that we gave to \texttt{MIN\textunderscore ACTORS} during the testing. In this way the $(i,j)$ position is the percentage of discrepancy between the results with \texttt{MIN\textunderscore ACTORS} set as $i$ and $j$. \s
\nd This analysis is implemented in python using the \texttt{pandas} and \texttt{numpy} libraries. \nd This analysis is implemented in python using the \texttt{pandas} and \texttt{numpy} libraries.
@ -32,4 +32,8 @@ We want to analyze how truthful our results are while varying MINMOVIES. The met
\includegraphics[width=13cm]{Figure_1.png} \includegraphics[width=13cm]{Figure_1.png}
\end{figure} \end{figure}
\nd As expected, the matrix is symmetrical and the elements on the diagonal are all equal to zero. We can see clearly that with a lower value of \texttt{MINMOVIES} the results are more precise. The discrepancy with \texttt{MINMOVIES=10} is 14\% while being 39\% when \texttt{MINMOVIES=70}. \nd As expected, the matrix is symmetrical and the elements on the diagonal are all equal to zero. We can see clearly that with a lower value of \texttt{MIN\textunderscore ACTORS} the results are more precise. The discrepancy with \texttt{MIN\textunderscore ACTORS=10} is 14\% while being 39\% when \texttt{MIN\textunderscore ACTORS=70}. \s
\nd This is what we obtain comparing the top-$k$ results when $k=100$. It would be interesting to see how much the discrepancy changes with different values of $k$. However, choosing a lower value for $k$ would not be useful for this type of analysis: since we are looking at the elements not in common between two lists, with short lists we would get results biased by statistical fluctuations. \s
\textsc{To do: tests with k=500 and k=1000}

Binary file not shown.
Loading…
Cancel
Save