"cells": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import wget\n",
"import zipfile\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"import plotly.graph_objects as go\n",
"from utils import *\n",
"from collections import Counter\n",
"from tqdm import tqdm\n",
"import time\n",
"# ignore warnings\n",
"import warnings\n",
"# read every saved graph back from disk: the four check-in graphs first, then the four friendship graphs\n",
"(\n",
"    G_brighkite_checkins,\n",
"    G_gowalla_checkins,\n",
"    G_foursquareEU_checkins,\n",
"    G_foursquareIT_checkins,\n",
") = (\n",
"    nx.read_gpickle(os.path.join('data', folder, filename))\n",
"    for folder, filename in (\n",
"        ('brightkite', 'brightkite_checkins_graph.gpickle'),\n",
"        ('gowalla', 'gowalla_checkins_graph.gpickle'),\n",
"        ('foursquare', 'foursquareEU_checkins_graph.gpickle'),\n",
"        ('foursquare', 'foursquareIT_checkins_graph.gpickle'),\n",
"    )\n",
")\n",
"(\n",
"    G_brighkite_friends,\n",
"    G_gowalla_friends,\n",
"    G_foursquareEU_friends,\n",
"    G_foursquareIT_friends,\n",
") = (\n",
"    nx.read_gpickle(os.path.join('data', folder, filename))\n",
"    for folder, filename in (\n",
"        ('brightkite', 'brightkite_friendships_graph.gpickle'),\n",
"        ('gowalla', 'gowalla_friendships_graph.gpickle'),\n",
"        ('foursquare', 'foursquareEU_friendships_graph.gpickle'),\n",
"        ('foursquare', 'foursquareIT_friendships_graph.gpickle'),\n",
"    )\n",
")\n",
"# # open the dataframe object\n",
"# analysis_results = pd.read_pickle('analysis_results.pkl')\n",
"# analysis_results"
"The first step is simple: create a random reference graph for each of the original graphs."
"# empty results table: one column per statistic computed for each graph\n",
"analysis_results = pd.DataFrame(\n",
"    columns=[\n",
"        'Graph',\n",
"        'Number of Nodes',\n",
"        'Number of Edges',\n",
"        'Average Degree',\n",
"        'Average Clustering Coefficient',\n",
"        'log N',\n",
"        'Average Shortest Path Length',\n",
"        'betweenness centrality',\n",
"    ],\n",
"    index=None,\n",
")\n",
"# the check-in graphs and the friendship graphs are listed separately, then concatenated\n",
"checkins_graphs = [\n",
"    G_brighkite_checkins,\n",
"    G_gowalla_checkins,\n",
"    G_foursquareEU_checkins,\n",
"    G_foursquareIT_checkins,\n",
"]\n",
"friendships_graph = [\n",
"    G_brighkite_friends,\n",
"    G_gowalla_friends,\n",
"    G_foursquareIT_friends,\n",
"    G_foursquareEU_friends,\n",
"]\n",
"graphs_all = [*checkins_graphs, *friendships_graph]"
"# Original graphs: compute one row of statistics per graph.\n",
"# NOTE(review): the label and degree expressions were missing from this cell; they are\n",
"# reconstructed as `graph.name` and `graph.degree()` - confirm every graph carries a name.\n",
"original_rows = []\n",
"for graph in graphs_all:\n",
"    n_nodes = graph.number_of_nodes()\n",
"    # average degree\n",
"    print(\"Computing average degree for graph: \", graph.name)\n",
"    avg_deg = np.mean([d for n, d in graph.degree()])\n",
"    # average clustering coefficient\n",
"    print(\"Computing average clustering coefficient for graph: \", graph.name)\n",
"    avg_clustering = nx.average_clustering(graph)\n",
"    # average shortest path length (helper from utils)\n",
"    print(\"Computing average shortest path length for graph: \", graph.name)\n",
"    average_shortest_path_length = average_shortest_path(graph)\n",
"    # mean betweenness centrality over all nodes (helper from utils, second argument 6 as before)\n",
"    print(\"Computing betweenness centrality for graph: \", graph.name)\n",
"    mean_betweenness = np.mean(list(betweenness_centrality_parallel(graph, 6).values()))\n",
"    # collect the row; rows are no longer matched back by name, so graphs that share a\n",
"    # name cannot overwrite each other's values\n",
"    original_rows.append({\n",
"        'Graph': graph.name,\n",
"        'Number of Nodes': n_nodes,\n",
"        'Number of Edges': graph.number_of_edges(),\n",
"        'Average Degree': avg_deg,\n",
"        'Average Clustering Coefficient': avg_clustering,\n",
"        'log N': np.log(n_nodes),\n",
"        'Average Shortest Path Length': average_shortest_path_length,\n",
"        'betweenness centrality': mean_betweenness,\n",
"    })\n",
"    print()\n",
"# build the frame once: DataFrame.append was removed in pandas 2.0 and copies the frame on every call;\n",
"# re-running this cell now replaces the rows instead of duplicating them\n",
"analysis_results = pd.DataFrame(original_rows, columns=analysis_results.columns)"
"# Random reference graphs: build one per original graph with create_random_graphs (utils)\n",
"# and run the same analysis as for the original graphs.\n",
"# NOTE(review): the label and degree expressions were missing from this cell; they are\n",
"# reconstructed as `G.name` and `G.degree()` - confirm create_random_graphs sets a name.\n",
"erdos_rows = []\n",
"for graph in graphs_all:\n",
"    # NOTE(review): \"erods\" is passed through unchanged; confirm the spelling create_random_graphs expects\n",
"    G = create_random_graphs(graph, \"erods\")\n",
"    n_nodes = G.number_of_nodes()\n",
"    # each row is collected directly, so graphs sharing a name cannot overwrite each other\n",
"    erdos_rows.append({\n",
"        'Graph': G.name,\n",
"        'Number of Nodes': n_nodes,\n",
"        'Number of Edges': G.number_of_edges(),\n",
"        'Average Degree': np.mean([d for n, d in G.degree()]),\n",
"        'Average Clustering Coefficient': nx.average_clustering(G),\n",
"        'log N': np.log(n_nodes),\n",
"        'Average Shortest Path Length': average_shortest_path(G),\n",
"        'betweenness centrality': np.mean(list(betweenness_centrality_parallel(G, 6).values())),\n",
"    })\n",
"    # save memory\n",
"    del G\n",
"# build the frame once: DataFrame.append was removed in pandas 2.0 and copies the frame on every call\n",
"analysis_results_erods = pd.DataFrame(erdos_rows, columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'])\n",
"\tNumber of edges in the original graph: 3663807\n",
# (Error traceback removed)
"# do the same with the watts strogatz graphs\n",
"# NOTE(review): the label and degree expressions were missing from this cell; they are\n",
"# reconstructed as `G.name` and `G.degree()` - confirm create_random_graphs sets a name.\n",
"ws_rows = []\n",
"for graph in graphs_all:\n",
"    G = create_random_graphs(graph, 'watts_strogatz', save=False)\n",
"    n_nodes = G.number_of_nodes()\n",
"    # each row is collected directly, so graphs sharing a name cannot overwrite each other\n",
"    ws_rows.append({\n",
"        'Graph': G.name,\n",
"        'Number of Nodes': n_nodes,\n",
"        'Number of Edges': G.number_of_edges(),\n",
"        'Average Degree': np.mean([d for n, d in G.degree()]),\n",
"        'Average Clustering Coefficient': nx.average_clustering(G),\n",
"        'log N': np.log(n_nodes),\n",
"        'Average Shortest Path Length': average_shortest_path(G),\n",
"        'betweenness centrality': np.mean(list(betweenness_centrality_parallel(G, 6).values())),\n",
"    })\n",
"    # save memory\n",
"    del G\n",
"# build the frame once: DataFrame.append was removed in pandas 2.0 and copies the frame on every call\n",
"analysis_results_ws = pd.DataFrame(ws_rows, columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'])\n",
"import matplotlib.pyplot as plt"
"# small Watts-Strogatz graph (1000 nodes, k=4, rewiring probability 0.1) used to try out the edge swap\n",
"G = nx.watts_strogatz_graph(1000, 4, 0.1)\n",
"adj = nx.to_scipy_sparse_array(G)\n",
"# print info about the graph\n",
"print(\"Number of nodes: \", G.number_of_nodes())\n",
"print(\"Number of edges: \", G.number_of_edges())\n",
"# the degree iterable was missing from this line; G.degree() yields the (node, degree) pairs it unpacks\n",
"print(\"Average degree: \", np.mean([d for n, d in G.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G))"
"import scipy.sparse as sp\n",
"# randomly swap edges, but keep the degree of each node the same (i.e. the degree sequence is preserved)\n",
"def random_swap_edges(adj, nswap=1, max_tries=100):\n",
"    \"\"\"Randomise an undirected graph with degree-preserving double-edge swaps.\n",
"\n",
"    Two edges (a, b) and (c, d) are drawn at random and rewired to (a, d) and\n",
"    (c, b). A draw is rejected when the two edges share an endpoint or when\n",
"    either new edge already exists, so no self-loops or parallel edges are\n",
"    created and every node keeps its degree.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    adj : sparse matrix or array\n",
"        Square, symmetric adjacency matrix of an undirected graph. It is not\n",
"        modified; the swaps are applied to a copy.\n",
"    nswap : int, default 1\n",
"        Number of swaps to perform.\n",
"    max_tries : int, default 100\n",
"        Maximum number of random draws spent on each requested swap. If no\n",
"        valid pair is found within that budget, the swap is skipped.\n",
"\n",
"    Returns\n",
"    -------\n",
"    scipy.sparse.csr_matrix\n",
"        Symmetric adjacency matrix after the swaps. Rewired edges get weight 1.\n",
"\n",
"    Raises\n",
"    ------\n",
"    AssertionError\n",
"        If `adj` is not square.\n",
"    \"\"\"\n",
"    # LIL gives cheap item assignment, and the conversion creates a fresh matrix,\n",
"    # so the caller's `adj` is never mutated\n",
"    work = sp.lil_matrix(adj)\n",
"    n_rows, n_cols = work.shape\n",
"    assert n_rows == n_cols  # make sure the adjacency matrix is square\n",
"    # the strict upper triangle lists every undirected edge once and leaves out self-loops\n",
"    rows, cols, _ = sp.find(sp.triu(work, k=1))\n",
"    edges = np.column_stack((rows, cols))\n",
"    n_edges = edges.shape[0]\n",
"    if n_edges < 2:\n",
"        # a swap needs two distinct edges\n",
"        return work.tocsr()\n",
"    for _ in range(nswap):\n",
"        for _ in range(max_tries):\n",
"            first, second = np.random.choice(n_edges, size=2, replace=False)\n",
"            # unpack to scalars so the later writes to `edges` cannot alias these values\n",
"            a, b = edges[first]\n",
"            c, d = edges[second]\n",
"            # a shared endpoint would turn one of the new edges into a self-loop\n",
"            if a == c or a == d or b == c or b == d:\n",
"                continue\n",
"            # the new edges must not exist already\n",
"            if work[a, d] != 0 or work[c, b] != 0:\n",
"                continue\n",
"            # remove the old edges and add the new ones in both directions to keep symmetry\n",
"            work[a, b] = 0\n",
"            work[b, a] = 0\n",
"            work[c, d] = 0\n",
"            work[d, c] = 0\n",
"            work[a, d] = 1\n",
"            work[d, a] = 1\n",
"            work[c, b] = 1\n",
"            work[b, c] = 1\n",
"            # keep the edge list in sync so later draws see the rewired edges\n",
"            edges[first] = (a, d)\n",
"            edges[second] = (c, b)\n",
"            break\n",
"    return work.tocsr()\n",
"adj_swapped = random_swap_edges(adj, nswap=1)"
"# create a new graph from the swapped adjacency matrix\n",
"# from_scipy_sparse_array is the counterpart of the to_scipy_sparse_array call that produced `adj`;\n",
"# from_scipy_sparse_matrix was removed in networkx 3.0\n",
"G_swapped = nx.from_scipy_sparse_array(adj_swapped)\n",
"# print info about the swapped graph\n",
"print(\"Number of nodes: \", G_swapped.number_of_nodes())\n",
"print(\"Number of edges: \", G_swapped.number_of_edges())\n",
"# the degree iterable was missing from this line; G_swapped.degree() yields the (node, degree) pairs it unpacks\n",
"print(\"Average degree: \", np.mean([d for n, d in G_swapped.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G_swapped))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G_swapped))"
