From f26f632f73b43f5df5cdf9ec2c43b809e83df33a Mon Sep 17 00:00:00 2001 From: Luca Lombardo Date: Wed, 16 Mar 2022 16:47:47 +0100 Subject: [PATCH] moved --- .../movie_graph/movie_graph_filter.py | 82 ------------------- 1 file changed, 82 deletions(-) delete mode 100755 visualization/movie_graph/movie_graph_filter.py diff --git a/visualization/movie_graph/movie_graph_filter.py b/visualization/movie_graph/movie_graph_filter.py deleted file mode 100755 index 9895642..0000000 --- a/visualization/movie_graph/movie_graph_filter.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python3 -import gzip -import requests -import pandas as pd -import numpy as np -import os -import csv - -#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------# -def colored(r, g, b, text): - return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text) - -def download_url(url): - print("Downloading:", url) - file_name_start_pos = url.rfind("/") + 1 - file_name = url[file_name_start_pos:] - if os.path.isfile(file_name): - print(colored(0,170,0,"Already downloaded: skipping")) - return - - r = requests.get(url, stream=True) - r.raise_for_status() - with open(file_name, 'wb') as f: - for chunk in r.iter_content(chunk_size=4096): - f.write(chunk) - return url - -urls = ["https://datasets.imdbws.com/name.basics.tsv.gz", - "https://datasets.imdbws.com/title.principals.tsv.gz", - "https://datasets.imdbws.com/title.basics.tsv.gz", - "https://datasets.imdbws.com/title.ratings.tsv.gz"] - -for url in urls: - download_url(url) - -os.makedirs("data", exist_ok=True) # Generate (recursively) folders, ignores the comand if they already exists - -#------------------------------FILTERING------------------------------# - -print("Filtering actors...") -df_attori = pd.read_csv( - 'name.basics.tsv.gz', sep='\t', compression='gzip', - usecols=['nconst', 'primaryName', 'primaryProfession'], - dtype={'primaryName': 'U', 'primaryProfession': 'U'}, - converters={'nconst': lambda x: int(x.lstrip("nm0"))}) -df_attori.query('primaryProfession.str.contains("actor") or primaryProfession.str.contains("actress")', inplace=True) - - -print("Filtering movies...") -df_film = pd.read_csv( - 'title.basics.tsv.gz', sep='\t', compression='gzip', - usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'], # Considering only this columns - dtype={'primaryTitle': 'U', 'titleType': 'U'}, # Both are unsigned integers - converters={'tconst': lambda x: int(x.lstrip("t0")), 'isAdult': lambda x: x != "0"}) # All movies starts with t0, we are just cleaning the output. Then remove all adult movies -df_ratings = pd.read_csv( - 'title.ratings.tsv.gz', sep='\t', compression='gzip', - usecols=['tconst', 'numVotes'], - dtype={'numVotes': 'u8'}, # Unsigned integer - converters={'tconst': lambda x: int(x.lstrip("t0"))}) -df_film = pd.merge(df_film, df_ratings, "left", on="tconst") -del df_ratings -df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]', - inplace=True) -VOTES_MEAN = int(200000) -df_film.query('numVotes > @VOTES_MEAN', inplace=True) -filtered_tconsts = df_film["tconst"].to_list() - -print("Filtering relations...") -df_relazioni = pd.read_csv( - 'title.principals.tsv.gz', sep='\t', compression='gzip', - usecols=['tconst', 'nconst','category'], # Considering only this columns - dtype={'category': 'U'}, # Unsigned integer - converters={'nconst': lambda x: int(x.lstrip("nm0")), 'tconst': lambda x: int(x.lstrip("t0"))}) # Cleaning -df_relazioni.query('(category == "actor" or category == "actress") and tconst in @filtered_tconsts', inplace=True) - - -# Write the filtered files -df_attori.to_csv('data/Attori.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['nconst', 'primaryName'], header=False, index=False) - -df_film.to_csv('data/FilmFiltrati.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'primaryTitle'], header=False, index=False) - -df_relazioni.to_csv('data/Relazioni.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'nconst'], header=False, index=False)