The movie graph can now be visualized
parent
8545589f61
commit
8eca30ecdf
@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from itertools import combinations
|
||||||
|
from math import comb
|
||||||
|
from matplotlib.pyplot import title
|
||||||
|
import networkx as nx
|
||||||
|
from pyvis.network import Network
|
||||||
|
|
||||||
|
net = Network(height='100%', width='100%', directed=False, bgcolor='#1e1f29', font_color='white')

# Read the ranked actors: each non-blank line is "<actor_id> <farness>".
# The two lists are kept for backward compatibility; the dict is what lets
# each node be sized by its OWN farness (previously the value left over from
# the last line of this file was reused for every actor).
actors_to_keep = []
farness_to_keep = []
farness_by_actor = {}  # {actor_id: farness}
with open('data/top_actors_c.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        actor_id, farness = line.split(maxsplit=1)
        actor_id = int(actor_id)
        farness = float(farness)
        actors_to_keep.append(actor_id)
        farness_to_keep.append(farness)
        farness_by_actor[actor_id] = farness

# Add one node per ranked actor found in the actor list
# ("<actor_id> <actor_name>" per line); everyone else is skipped.
with open('data/Attori.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        actor_id, actor_name = line.split(maxsplit=1)
        actor_id = int(actor_id)
        farness = farness_by_actor.get(actor_id)
        if farness is None:
            continue
        # strip() drops the newline that split() leaves at the end of the name.
        # Node size shrinks as farness grows: 5 ** (1 / (2 * farness)).
        net.add_node(actor_id, label=actor_name.strip(), size=pow(5, 1.0 / (farness * 2)))
|
||||||
|
|
||||||
|
# Snapshot of the node ids added so far. No node is added while the relations
# are read, so the snapshot stays valid.
# NOTE(review): pyvis appears to keep node_ids as a list; testing against a
# set makes the per-line membership check O(1) either way.
kept_actor_ids = set(net.node_ids)

movies = {}  # {movie_id: [actor_id, ...]}
with open('data/Relazioni.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        movie_id, actor_id = line.split(maxsplit=1)
        actor_id = int(actor_id)
        movie_id = int(movie_id)
        if actor_id not in kept_actor_ids:
            continue
        movies.setdefault(movie_id, []).append(actor_id)

edges = set()  # set of unique tuples (actor_id, actor_id), smaller id first
for cast in movies.values():
    # set() removes an actor listed more than once for the same movie, which
    # would otherwise produce a self-loop (a, a); sorting keeps every pair in
    # one canonical order so (a, b) and (b, a) collapse to a single edge.
    for actor_id_1, actor_id_2 in combinations(sorted(set(cast)), 2):
        edges.add((actor_id_1, actor_id_2))

for actor_id_1, actor_id_2 in edges:
    net.add_edge(actor_id_1, actor_id_2)
|
||||||
|
|
||||||
|
# net.hrepulsion(node_distance=500, central_gravity=0.3, spring_length=500, spring_strength=0.05, damping=0.2)
|
||||||
|
# net.repulsion(node_distance=500, central_gravity=0.3, spring_length=200, spring_strength=0.05, damping=0.2)
|
||||||
|
# net.show_buttons()
|
||||||
|
|
||||||
|
# Display options handed to pyvis: border width of a selected node, edges that
# inherit their colour and are drawn straight, and the "repulsion" physics
# solver with its parameters.
closeness_options = """
var options = {
  "nodes": {
    "borderWidthSelected": 3
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "physics": {
    "repulsion": {
      "centralGravity": 8.95,
      "springLength": 500,
      "springConstant": 0.015,
      "nodeDistance": 600,
      "damping": 0.67
    },
    "minVelocity": 0.75,
    "solver": "repulsion"
  }
}
"""
net.set_options(closeness_options)

# Render the network to an HTML page.
net.show('html-files/closeness-graph.html')
|
@ -0,0 +1,75 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from itertools import combinations
|
||||||
|
from math import comb
|
||||||
|
from matplotlib.pyplot import title
|
||||||
|
import networkx as nx
|
||||||
|
from pyvis.network import Network
|
||||||
|
|
||||||
|
net = Network(height='100%', width='100%', directed=False, bgcolor='#1e1f29', font_color='white')

# Read the ranked actors: each non-blank line is "<actor_id> <harmonic>".
# The two lists are kept for backward compatibility; the dict is what lets
# each node be sized by its OWN harmonic value (previously the value left over
# from the last line of this file was reused for every actor).
actors_to_keep = []
harmonic_to_keep = []
harmonic_by_actor = {}  # {actor_id: harmonic}
with open('data/top_actors_h.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        actor_id, harmonic = line.split(maxsplit=1)
        actor_id = int(actor_id)
        harmonic = float(harmonic)
        actors_to_keep.append(actor_id)
        harmonic_to_keep.append(harmonic)
        harmonic_by_actor[actor_id] = harmonic

# Add one node per ranked actor found in the actor list
# ("<actor_id> <actor_name>" per line); everyone else is skipped.
with open('data/Attori.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        actor_id, actor_name = line.split(maxsplit=1)
        actor_id = int(actor_id)
        harmonic = harmonic_by_actor.get(actor_id)
        if harmonic is None:
            continue
        # strip() drops the newline that split() leaves at the end of the name.
        # Node size grows linearly with the harmonic value.
        net.add_node(actor_id, label=actor_name.strip(), size=harmonic / 350)
|
||||||
|
|
||||||
|
# Snapshot of the node ids added so far. No node is added while the relations
# are read, so the snapshot stays valid.
# NOTE(review): pyvis appears to keep node_ids as a list; testing against a
# set makes the per-line membership check O(1) either way.
kept_actor_ids = set(net.node_ids)

movies = {}  # {movie_id: [actor_id, ...]}
with open('data/Relazioni.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        movie_id, actor_id = line.split(maxsplit=1)
        actor_id = int(actor_id)
        movie_id = int(movie_id)
        if actor_id not in kept_actor_ids:
            continue
        movies.setdefault(movie_id, []).append(actor_id)

edges = set()  # set of unique tuples (actor_id, actor_id), smaller id first
for cast in movies.values():
    # set() removes an actor listed more than once for the same movie, which
    # would otherwise produce a self-loop (a, a); sorting keeps every pair in
    # one canonical order so (a, b) and (b, a) collapse to a single edge.
    for actor_id_1, actor_id_2 in combinations(sorted(set(cast)), 2):
        edges.add((actor_id_1, actor_id_2))

for actor_id_1, actor_id_2 in edges:
    net.add_edge(actor_id_1, actor_id_2)
|
||||||
|
|
||||||
|
# net.hrepulsion(node_distance=500, central_gravity=0.3, spring_length=500, spring_strength=0.05, damping=0.2)
|
||||||
|
# net.repulsion(node_distance=500, central_gravity=0.3, spring_length=200, spring_strength=0.05, damping=0.2)
|
||||||
|
# net.show_buttons()
|
||||||
|
|
||||||
|
# Display options handed to pyvis: edges that inherit their colour and are
# drawn straight, and the "repulsion" physics solver with its parameters.
harmonic_options = """
var options = {
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "physics": {
    "repulsion": {
      "springLength": 1205,
      "nodeDistance": 1190
    },
    "maxVelocity": 23,
    "minVelocity": 0.75,
    "solver": "repulsion"
  }
}
"""
net.set_options(harmonic_options)

# Render the network to an HTML page.
net.show('html-files/harmonic-graph.html')
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,82 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import gzip
|
||||||
|
import requests
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import csv
|
||||||
|
|
||||||
|
#-----------------DOWNLOAD .GZ FILES FROM IMDB DATABASE-----------------#
|
||||||
|
def colored(r, g, b, text):
    """Wrap *text* in 24-bit ANSI foreground colour escapes.

    The result is the escape selecting colour (r, g, b), then the text
    followed by one space, then an escape that switches the foreground
    colour to white (255, 255, 255).
    """
    start = f"\033[38;2;{r};{g};{b}m"
    back_to_white = "\033[38;2;255;255;255m"
    return f"{start}{text} {back_to_white}"
|
||||||
|
|
||||||
|
def download_url(url):
    """Download *url* into the current directory unless it is already there.

    The local file name is whatever follows the last "/" of the URL.

    Args:
        url: Address of the file to fetch.

    Returns:
        The URL when the file was downloaded, ``None`` when a file with that
        name already existed and the download was skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    print("Downloading:", url)
    file_name = url[url.rfind("/") + 1:]
    if os.path.isfile(file_name):
        print(colored(0, 170, 0, "Already downloaded: skipping"))
        return None

    # Stream into a temporary name and rename only once the body is complete:
    # an interrupted download must not leave a truncated file that the
    # "already downloaded" check above would accept on the next run.
    partial_name = file_name + ".part"
    try:
        # The timeout keeps a stalled connection from hanging the script
        # forever; the context manager releases the connection.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=4096):
                    f.write(chunk)
    except BaseException:
        # Also covers Ctrl-C; clean up, then let the error propagate.
        if os.path.exists(partial_name):
            os.remove(partial_name)
        raise
    os.replace(partial_name, file_name)
    return url
|
||||||
|
|
||||||
|
# IMDb dataset dumps to fetch.
urls = [
    "https://datasets.imdbws.com/name.basics.tsv.gz",
    "https://datasets.imdbws.com/title.principals.tsv.gz",
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
]

for url in urls:
    download_url(url)

# Create the output folder (and any missing parents); exist_ok turns the call
# into a no-op when the folder already exists.
os.makedirs("data", exist_ok=True)
|
||||||
|
|
||||||
|
#------------------------------FILTERING------------------------------#
|
||||||
|
|
||||||
|
def _imdb_id(raw):
    """Return the numeric part of an IMDb id such as "nm0000123" or "tt0000456"."""
    # Slice off the two-letter prefix rather than using lstrip(): lstrip()
    # takes a *set* of characters, not a prefix, so it only worked here
    # because int() does not care about leading zeros.
    return int(raw[2:])


print("Filtering actors...")
df_attori = pd.read_csv(
    'name.basics.tsv.gz', sep='\t', compression='gzip',
    usecols=['nconst', 'primaryName', 'primaryProfession'],
    dtype={'primaryName': 'U', 'primaryProfession': 'U'},  # 'U' = text (unicode string)
    converters={'nconst': _imdb_id})
# Keep people whose profession list mentions "actor" or "actress". na=False
# drops rows with a missing profession instead of leaving NaN in the mask.
is_performer = df_attori['primaryProfession'].str.contains('actor|actress', na=False)
df_attori = df_attori[is_performer]

print("Filtering movies...")
df_film = pd.read_csv(
    'title.basics.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'primaryTitle', 'isAdult', 'titleType'],  # only these columns are needed
    dtype={'primaryTitle': 'U', 'titleType': 'U'},  # 'U' = text (unicode string)
    # isAdult becomes a bool: True for anything other than "0".
    converters={'tconst': _imdb_id, 'isAdult': lambda x: x != "0"})
df_ratings = pd.read_csv(
    'title.ratings.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'numVotes'],
    dtype={'numVotes': 'u8'},  # unsigned 64-bit integer
    converters={'tconst': _imdb_id})
# Left join: titles without a rating row get a missing numVotes and are
# removed by the vote threshold below.
df_film = pd.merge(df_film, df_ratings, how="left", on="tconst")
del df_ratings
# Drop adult titles and keep only the listed title types.
df_film.query('not isAdult and titleType in ["movie", "tvSeries", "tvMovie", "tvMiniSeries"]',
              inplace=True)
# Minimum number of votes a title must exceed to be kept (the name is
# historical: this is a fixed threshold, not a computed mean).
VOTES_MEAN = 200000
df_film.query('numVotes > @VOTES_MEAN', inplace=True)
filtered_tconsts = df_film["tconst"].to_list()

print("Filtering relations...")
df_relazioni = pd.read_csv(
    'title.principals.tsv.gz', sep='\t', compression='gzip',
    usecols=['tconst', 'nconst', 'category'],  # only these columns are needed
    dtype={'category': 'U'},  # 'U' = text (unicode string)
    converters={'nconst': _imdb_id, 'tconst': _imdb_id})
# Keep acting credits that belong to one of the surviving titles.
df_relazioni.query('(category == "actor" or category == "actress") and tconst in @filtered_tconsts', inplace=True)

# Write the filtered files as headerless, tab-separated "<id>\t<text>" lines.
df_attori.to_csv('data/Attori.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['nconst', 'primaryName'], header=False, index=False)

df_film.to_csv('data/FilmFiltrati.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'primaryTitle'], header=False, index=False)

df_relazioni.to_csv('data/Relazioni.txt', sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\', columns=['tconst', 'nconst'], header=False, index=False)
|
@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from itertools import combinations
|
||||||
|
from math import comb
|
||||||
|
import networkx as nx
|
||||||
|
from pyvis.network import Network
|
||||||
|
|
||||||
|
net = Network(height='100%', width='100%', directed=False, bgcolor='#1e1f29', font_color='white')

# One node per filtered movie; each non-blank line is "<movie_id> <title>".
with open('data/FilmFiltrati.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        movie_id, movie_name = line.split(maxsplit=1)
        # strip() drops the newline that split() leaves at the end of the
        # title, so the node label does not end with a line break.
        net.add_node(int(movie_id), label=movie_name.strip())
|
||||||
|
|
||||||
|
# Snapshot of the node ids added so far. No node is added while the relations
# are read, so the snapshot stays valid.
# NOTE(review): pyvis appears to keep node_ids as a list; testing against a
# set makes the per-line membership check O(1) either way.
kept_movie_ids = set(net.node_ids)

actors = {}  # {actor_id: [movie_id, ...]}
with open('data/Relazioni.txt') as ifs:
    for line in ifs:
        if not line.strip():
            continue
        movie_id, actor_id = line.split(maxsplit=1)
        actor_id = int(actor_id)
        movie_id = int(movie_id)
        if movie_id not in kept_movie_ids:
            continue
        actors.setdefault(actor_id, []).append(movie_id)

edges = set()  # set of unique tuples (movie_id, movie_id), smaller id first
# The loop variable has its own name so the `actors` dict is not rebound
# while it is being iterated.
for movie_ids in actors.values():
    # set() removes a movie listed more than once for the same actor, which
    # would otherwise produce a self-loop (m, m); sorting keeps every pair in
    # one canonical order so (a, b) and (b, a) collapse to a single edge.
    for movie_id_1, movie_id_2 in combinations(sorted(set(movie_ids)), 2):
        edges.add((movie_id_1, movie_id_2))

for movie_id_1, movie_id_2 in edges:
    net.add_edge(movie_id_1, movie_id_2)
|
||||||
|
|
||||||
|
# net.hrepulsion(node_distance=500, central_gravity=0.3, spring_length=500, spring_strength=0.05, damping=0.2)
|
||||||
|
# net.repulsion(node_distance=500, central_gravity=0.3, spring_length=200, spring_strength=0.05, damping=0.2)
|
||||||
|
# net.show_buttons()
|
||||||
|
|
||||||
|
# Display options handed to pyvis: node border radius, edges that inherit
# their colour with a larger font and straight lines, and the
# "forceAtlas2Based" physics solver with its parameters.
# (The literal used to open with four quote characters, which put a stray
# '"' at the start of the options text.)
net.set_options("""
var options = {
  "nodes": {
    "shapeProperties": {
      "borderRadius": 11
    }
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "font": {
      "size": 32
    },
    "smooth": false
  },
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -443,
      "centralGravity": 0.005,
      "springLength": 255,
      "springConstant": 0.07,
      "damping": 0.91,
      "avoidOverlap": 0.06
    },
    "maxVelocity": 57,
    "minVelocity": 0.75,
    "solver": "forceAtlas2Based"
  }
}
""")

# Render the network to an HTML page.
net.show('html-files/imdb-movie-graph.html')
|
Loading…
Reference in New Issue