"""Download the IMDb datasets and filter them into plain-text files.

Provenance: this module was recovered from a git patch (commit
28637280d259ba2dff48a8f7a5e263026f19971c, author Luca Lombardo,
Mon, 31 Jan 2022 15:55:07 +0100, subject "Incomplete filter, to do")
that created ``filtro.py``.  The same commit also created a ``.gitignore``
with the entries ``data/``, ``grafo``, ``.vscode/`` and ``kenobi``.

Running the script downloads three archives into the current directory
and then writes ``FilmFiltrati.txt`` and ``Attori.txt`` next to them.
"""

from multiprocessing.pool import ThreadPool
import gzip  # currently unused: pandas decompresses the archives itself

import pandas as pd
import requests
# import os  (only needed by the commented-out cancella() at the bottom)


def download_url(url, chunk_size=1 << 20, timeout=60):
    """Download *url* into the current directory and return *url*.

    The local file name is the part of the URL after its last "/".

    Args:
        url: Address of the file to fetch.
        chunk_size: Number of bytes requested from the response per write.
        timeout: Value passed to ``requests.get(timeout=...)`` so that a
            stalled server cannot block the worker thread forever.

    Returns:
        The *url* argument, whether or not the server answered with an OK
        status, so the caller can report which job has finished.
    """
    print("downloading: ", url)
    file_name = url[url.rfind("/") + 1:]

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code == requests.codes.ok:
            with open(file_name, 'wb') as f:
                # Iterating the response object directly yields tiny
                # chunks; ask for large ones explicitly.
                for data in r.iter_content(chunk_size=chunk_size):
                    f.write(data)
        else:
            # Report the failure instead of silently skipping the file.
            print("download failed: ", url, "status", r.status_code)
    return url


urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",
        "https://datasets.imdbws.com/title.principals.tsv.gz",
        "https://datasets.imdbws.com/title.basics.tsv.gz"]


def titlebasics():
    """Write the non-adult titles to ``FilmFiltrati.txt``.

    Reads ``title.basics.tsv.gz`` from the current directory, drops every
    row whose ``isAdult`` field is 1 and writes the row index, ``tconst``
    and ``primaryTitle`` separated by spaces, without a header line.
    """
    df = pd.read_csv('title.basics.tsv.gz', sep='\t',
                     usecols=['tconst', 'primaryTitle', 'isAdult'],
                     # Read the flag as text so the comparison below does
                     # not depend on the dtype pandas would infer.
                     dtype={'isAdult': str},
                     compression='gzip')
    df = df[df['isAdult'] != '1']
    df.to_csv('FilmFiltrati.txt', sep=' ',
              columns=['tconst', 'primaryTitle'], header=False)


def namebasics():
    """Write the actors and actresses to ``Attori.txt``.

    Reads ``name.basics.tsv.gz`` from the current directory, keeps the
    people whose ``primaryProfession`` lists "actor" or "actress" and
    writes the row index, ``nconst`` and ``primaryName`` separated by
    spaces, without a header line.
    """
    df = pd.read_csv('name.basics.tsv.gz', sep='\t',
                     usecols=['nconst', 'primaryName', 'primaryProfession'],
                     dtype={'primaryProfession': str},
                     compression='gzip')
    # primaryProfession is a comma-separated list (e.g.
    # "actor,producer"), so look for a whole entry rather than comparing
    # the full field; rows with a missing profession are dropped.
    acts = df['primaryProfession'].str.contains(
        r'(?:^|,)(?:actor|actress)(?:,|$)', na=False)
    df = df[acts]
    df.to_csv('Attori.txt', sep=' ',
              columns=['nconst', 'primaryName'], header=False)


def titleprincipals():
    """Filter ``title.principals.tsv.gz`` down to acting credits.

    Unfinished: the rows are filtered, but the output file has not been
    chosen yet, so this function is not called by ``main``.
    """
    df = pd.read_csv('title.principals.tsv.gz', sep='\t',
                     usecols=['nconst', 'category'], compression='gzip')
    df.query('category == "actor" or category == "actress"', inplace=True)
    df.to_csv('')  # TODO: the empty path is a placeholder, still to do


def main():
    """Download the three archives, then build the filtered files."""
    # Run 3 threads; each call takes the next element of the urls list.
    with ThreadPool(3) as pool:
        for finished in pool.imap_unordered(download_url, urls):
            print(finished)

    titlebasics()
    namebasics()
    # titleprincipals()  # TODO: enable once it has an output file


# TODO: optional cleanup of the downloaded archives, kept from the
# original as a disabled sketch:
# def cancella():
#     os.system('rm *.gz')


if __name__ == "__main__":
    main()