# imdb-graph/filtro.py

#!/usr/bin/env python3
import requests
import pandas as pd
import numpy as np
import os
import csv

MIN_MOVIES = 42  # Only keep relations for actors credited in at least this many titles (the filter uses >=)

#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#

def download_url(url, timeout=60):
  """Download `url` into the current directory unless the file is already there.

  The local file name is the part of the URL after the last "/".

  Args:
    url: Address of the file to fetch.
    timeout: Seconds to wait for the server (connect / read) before giving up.

  Returns:
    The URL when a download took place, None when the file already existed.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    requests.RequestException: on connection problems or timeouts.
  """
  print("Downloading:", url)
  file_name = url[url.rfind("/") + 1:]
  if os.path.isfile(file_name):
    print("Already downloaded: skipping")
    return None

  # Stream into a temporary ".part" file and rename it only once the transfer is
  # complete: an interrupted download must never be mistaken for a finished one
  # by the "already downloaded" check on the next run.
  partial_name = file_name + ".part"
  # The context manager releases the connection even if writing fails midway.
  with requests.get(url, stream=True, timeout=timeout) as r:
    r.raise_for_status()
    with open(partial_name, 'wb') as f:
      for chunk in r.iter_content(chunk_size=4096):
        f.write(chunk)
  os.replace(partial_name, file_name)
  return url

# IMDb dumps needed by the filter: people, title/person credits, titles
urls = [
  "https://datasets.imdbws.com/name.basics.tsv.gz",
  "https://datasets.imdbws.com/title.principals.tsv.gz",
  "https://datasets.imdbws.com/title.basics.tsv.gz",
]

# Fetch every dump that is not already present in the working directory
for url in urls:
  download_url(url)

# Output folder for the filtered files; nothing happens if it already exists
os.makedirs("data", exist_ok=True)

#------------------------------FILTERING------------------------------#

print("Filtering actors...")
df_attori = pd.read_csv(
  'name.basics.tsv.gz', sep='\t', compression='gzip',
  usecols=['nconst', 'primaryName', 'primaryProfession'],  # Only these columns are needed
  dtype={'primaryName': str, 'primaryProfession': str},  # Both are text columns
  # The IMDb dumps are plain tab-separated text with no quoting convention: a double
  # quote inside a name is ordinary text and must not be parsed as a field delimiter
  quoting=csv.QUOTE_NONE,
  # Keep every value verbatim, otherwise names such as "None" or "NA" would be read
  # as missing values
  na_filter=False,
  # Person ids look like "nm0000123": drop the two-letter prefix, keep the number
  converters={'nconst': lambda x: int(x[2:])})
# Many people have more than one profession (e.g. "actor,director"): keep everyone
# whose profession string contains "actor" or "actress"
df_attori.query('primaryProfession.str.contains("actor") or primaryProfession.str.contains("actress")', inplace=True)

print("Filtering films...")
df_film = pd.read_csv(
  'title.basics.tsv.gz', sep='\t', compression='gzip',
  usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'],  # Only these columns are needed
  dtype={'primaryTitle': str, 'titleType': str},  # Both are text columns
  # The IMDb dumps are plain tab-separated text with no quoting convention: a title
  # that starts with a double quote must not make the parser swallow the following
  # tabs and lines as part of one "quoted" field
  quoting=csv.QUOTE_NONE,
  # Keep every value verbatim, otherwise titles such as "None" or "NULL" would be
  # read as missing values
  na_filter=False,
  # Title ids look like "tt0000123": drop the two-letter prefix, keep the number.
  # isAdult becomes a boolean: anything other than "0" counts as adult
  converters={'tconst': lambda x: int(x[2:]), 'isAdult': lambda x: x != "0"})
# IMDb has many title types we do not care about: keep only the non-adult titles
# whose type is in this whitelist
df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',
              inplace=True)
# Ids of the surviving titles, used to filter the relations
filtered_tconsts = df_film["tconst"].to_list()

print("Filtering relations...")
df_relazioni = pd.read_csv(
  'title.principals.tsv.gz', sep='\t', compression='gzip',
  usecols=['tconst', 'nconst', 'category'],  # Only these columns are needed
  dtype={'category': str},  # Text column
  # The IMDb dumps are plain tab-separated text with no quoting convention: double
  # quotes in the columns we skip must not be parsed as field delimiters
  quoting=csv.QUOTE_NONE,
  na_filter=False,  # Keep every value verbatim, no missing-value detection needed
  # Ids look like "nm0000123" / "tt0000123": drop the two-letter prefix, keep the number
  converters={'nconst': lambda x: int(x[2:]), 'tconst': lambda x: int(x[2:])})
# Keep only acting credits that belong to one of the whitelisted titles
df_relazioni.query('(category == "actor" or category == "actress") and tconst in @filtered_tconsts', inplace=True)
# Unique actor ids (nconsts) and how many rows each one has (counts), i.e. the number
# of remaining credits per actor
nconsts, counts = np.unique(df_relazioni["nconst"].to_numpy(), return_counts=True)
# Keep only the actors with at least MIN_MOVIES credits
filtered_nconsts = nconsts[counts>=MIN_MOVIES]
df_relazioni.query("nconst in @filtered_nconsts", inplace=True)

# Keep only the actors and the films that still appear in at least one relation
print("Re-filtering actors...")
nconsts_with_relations = df_relazioni["nconst"].unique()
df_attori = df_attori[df_attori["nconst"].isin(nconsts_with_relations)]
print("Re-filtering films...")
tconsts_with_relations = df_relazioni["tconst"].unique()
df_film = df_film[df_film["tconst"].isin(tconsts_with_relations)]

# Write the filtered tables as tab-separated text: no header, no index, no quoting
# (special characters are escaped with a backslash instead)
tsv_options = dict(sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', header=False, index=False)
df_attori.to_csv('data/Attori.txt', columns=['nconst', 'primaryName'], **tsv_options)
df_film.to_csv('data/FilmFiltrati.txt', columns=['tconst', 'primaryTitle'], **tsv_options)
df_relazioni.to_csv('data/Relazioni.txt', columns=['tconst', 'nconst'], **tsv_options)

# Takes about 1 min 30 s with MIN_MOVIES = 42 ----> kenobi with k=100 took 3m 48s
# Takes about 3 min with MIN_MOVIES = 20 ----> kenobi with k=100 took 19m 34s
Fully working, clean outputs. Still work to do 3 years ago			`#!/usr/bin/env python3`
Incomplete filter, to do 3 years ago			`import requests`
			`import pandas as pd`
Consider only actors with many films 3 years ago			`import numpy as np`
Fully working, clean outputs. Still work to do 3 years ago			`import os`
			`import csv`

Refined docs 3 years ago			`MIN_MOVIES = 42 # Only keep relations for actors that have made more than this many movies`
Consider only actors with many films 3 years ago
detaild comments added for clarification 3 years ago			`#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#`

Incomplete filter, to do 3 years ago			`def download_url(url):`
Fully working, clean outputs. Still work to do 3 years ago			`print("Downloading:", url)`
Incomplete filter, to do 3 years ago			`file_name_start_pos = url.rfind("/") + 1`
			`file_name = url[file_name_start_pos:]`
Fully working, clean outputs. Still work to do 3 years ago			`if os.path.isfile(file_name):`
			`print("Already downloaded: skipping")`
			`return`

Incomplete filter, to do 3 years ago			`r = requests.get(url, stream=True)`
Fully working, clean outputs. Still work to do 3 years ago			`r.raise_for_status()`
			`with open(file_name, 'wb') as f:`
			`for chunk in r.iter_content(chunk_size=4096):`
			`f.write(chunk)`
Incomplete filter, to do 3 years ago			`return url`

Fully working, clean outputs. Still work to do 3 years ago			`urls = ["https://datasets.imdbws.com/name.basics.tsv.gz",`
Incomplete filter, to do 3 years ago			`"https://datasets.imdbws.com/title.principals.tsv.gz",`
			`"https://datasets.imdbws.com/title.basics.tsv.gz"]`

Fully working, clean outputs. Still work to do 3 years ago			`for url in urls:`
			`download_url(url)`

detaild comments added for clarification 3 years ago			`os.makedirs("data", exist_ok=True) # Generate (recursively) folders, ignores the comand if they already exists`
Fully working, clean outputs. Still work to do 3 years ago
now every thing is done and complete. TODO: better comments, documentation 3 years ago			`#------------------------------FILTERING------------------------------#`

Fully working, clean outputs. Still work to do 3 years ago			`print("Filtering actors...")`
			`df_attori = pd.read_csv(`
			`'name.basics.tsv.gz', sep='\t', compression='gzip',`
detaild comments added for clarification 3 years ago			`usecols=['nconst', 'primaryName', 'primaryProfession'], # Considering only this columns`
			`dtype={'primaryName': 'U', 'primaryProfession': 'U'}, # Both are unsigned integers`
			`converters={'nconst': lambda x: int(x.lstrip("nm0"))}) # All actors starts with nm0, we are just cleaning the output`
Fully working, clean outputs. Still work to do 3 years ago			`df_attori.query('primaryProfession.str.contains("actor") or primaryProfession.str.contains("actress")', inplace=True)`
detaild comments added for clarification 3 years ago			`# A lot of actors/actresses do more than one job (director etc..), with this comand I take all the names that have the string "actor" or "actress" in their profession. In this way both someone who is classified as "actor" or as "actor, director" are taken into consideration`
Fully working, clean outputs. Still work to do 3 years ago
			`print("Filtering films...")`
			`df_film = pd.read_csv(`
			`'title.basics.tsv.gz', sep='\t', compression='gzip',`
detaild comments added for clarification 3 years ago			`usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'], # Considering only this columns`
			`dtype={'primaryTitle': 'U', 'titleType': 'U'}, # Both are unsigned integers`
			`converters={'tconst': lambda x: int(x.lstrip("t0")), 'isAdult': lambda x: x != "0"}) # # All movies starts with t0, we are just cleaning the output. Then remove all adult movies`
Fully working, clean outputs. Still work to do 3 years ago			`df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',`
detaild comments added for clarification 3 years ago			`inplace=True) # There are a lot of junk categories considered in IMDb, we are considering all the non Adult movies in this whitelist`
Fully working, clean outputs. Still work to do 3 years ago			`filtered_tconsts = df_film["tconst"].to_list()`

			`print("Filtering relations...")`
			`df_relazioni = pd.read_csv(`
			`'title.principals.tsv.gz', sep='\t', compression='gzip',`
detaild comments added for clarification 3 years ago			`usecols=['tconst', 'nconst','category'], # Considering only this columns`
			`dtype={'category': 'U'}, # Unsigned integer`
			`converters={'nconst': lambda x: int(x.lstrip("nm0")), 'tconst': lambda x: int(x.lstrip("t0"))}) # Cleaning`
Fully working, clean outputs. Still work to do 3 years ago			`df_relazioni.query('(category == "actor" or category == "actress") and tconst in @filtered_tconsts', inplace=True)`
Consider only actors with many films 3 years ago			`# Returns an array of unique actor ids (nconsts) and an array of how many times they appear (counts) => the number of movies they appear in`
			`nconsts, counts = np.unique(df_relazioni["nconst"].to_numpy(), return_counts=True)`
			`filtered_nconsts = nconsts[counts>=MIN_MOVIES]`
			`df_relazioni.query("nconst in @filtered_nconsts", inplace=True)`

			`# Now select only films and actors that have at lest a relation`
			`print("Re-filtering actors...")`
			`nconsts_with_relations = df_relazioni["nconst"].unique()`
			`df_attori.query("nconst in @nconsts_with_relations", inplace=True)`
			`print("Re-filtering films...")`
			`tconsts_with_relations = df_relazioni["tconst"].unique()`
			`df_film.query("tconst in @tconsts_with_relations", inplace=True)`

			`# Write the filtered files`
			`df_attori.to_csv('data/Attori.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['nconst', 'primaryName'], header=False, index=False)`
			`df_film.to_csv('data/FilmFiltrati.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'primaryTitle'], header=False, index=False)`
Fully working, clean outputs. Still work to do 3 years ago			`df_relazioni.to_csv('data/Relazioni.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'nconst'], header=False, index=False)`
Consider only actors with many films 3 years ago
documentation 3 years ago			`# Takes about 1 min 30 s with MIN_MOVIES = 42 ----> kenobi with k=100 took 3m 48s`
			`# Takes about 3 min with MIN_MOVIES = 20 ----> kenobi with k=100 took 19m 34s`