{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import wget\n", "import zipfile\n", "import numpy as np\n", "import pandas as pd\n", "import networkx as nx\n", "import plotly.graph_objects as go\n", "from utils import *\n", "from collections import Counter\n", "from tqdm import tqdm\n", "import time\n", "\n", "# ignore warnings\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# import the graphs from the saved files\n", "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n", "G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n", "G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n", "G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n", "\n", "G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n", "G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n", "G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n", "G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n", "\n", "# # open the dataframe object\n", "# analysis_results = pd.read_pickle('analysis_results.pkl')\n", "# analysis_results" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "The first thing that we want to do is very simple, create a random reference for each graph" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n", "\n", "checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n", "friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n", "\n", "graphs_all = checkins_graphs + friendships_graph" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Original Graphs\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for graph in graphs_all:\n", " # add basic graph statistics\n", " analysis_results = analysis_results.append(\n", " {'Graph': graph.name, \n", " 'Number of Nodes': graph.number_of_nodes(), \n", " 'log N': np.log(graph.number_of_nodes()),\n", " 'Number of Edges': graph.number_of_edges()}, \n", " ignore_index=True)\n", "\n", " # add average degree\n", " print(\"Computing average degree for graph: \", graph.name)\n", " avg_deg = np.mean([d for n, d in graph.degree()])\n", " analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Degree'] = avg_deg\n", "\n", " # add average clustering coefficient\n", " print(\"Computing average clustering coefficient for graph: \", graph.name)\n", " avg_clustering = nx.average_clustering(graph)\n", " analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'] = avg_clustering\n", "\n", " # add average shortest path length\n", " print(\"Computing average shortest path length for graph: \", graph.name)\n", " average_shortest_path_length = average_shortest_path(graph)\n", " analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'] = average_shortest_path_length\n", "\n", " # add betweenness centrality\n", " print(\"Computing betweenness centrality for graph: \", graph.name)\n", " betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6).values()))\n", " analysis_results.loc[analysis_results['Graph'] == graph.name, 'betweenness centrality'] = betweenness_centrality\n", " print()\n", "\n", "\n", "analysis_results\n", "analysis_results.to_pickle('analysis_results.pkl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Random shit" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'NoneType' object has no attribute 'name'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 6\u001b[0m G \u001b[38;5;241m=\u001b[39m create_random_graphs(graph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merods\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# add the basic information to the dataframe\u001b[39;00m\n\u001b[1;32m 9\u001b[0m analysis_results_erods \u001b[38;5;241m=\u001b[39m analysis_results_erods\u001b[38;5;241m.\u001b[39mappend({\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mG\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Nodes\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_nodes(),\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Edges\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_edges(),\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlog N\u001b[39m\u001b[38;5;124m'\u001b[39m: np\u001b[38;5;241m.\u001b[39mlog(G\u001b[38;5;241m.\u001b[39mnumber_of_nodes())\n\u001b[1;32m 14\u001b[0m }, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# compute the average degree and add it to the dataframe\u001b[39;00m\n\u001b[1;32m 17\u001b[0m avg_deg \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmean([d \u001b[38;5;28;01mfor\u001b[39;00m n, d \u001b[38;5;129;01min\u001b[39;00m G\u001b[38;5;241m.\u001b[39mdegree()])\n", "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'name'" ] } ], "source": [ "analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n", "\n", "# read all the graphs gpickle files in the data/random/erdos folder. Then run the same analysis as before for this graphs\n", "\n", "for graph in graphs_all:\n", " G = create_random_graphs(graph, \"erods\")\n", "\n", " # add the basic information to the dataframe\n", " analysis_results_erods = analysis_results_erods.append({\n", " 'Graph': G.name,\n", " 'Number of Nodes': G.number_of_nodes(),\n", " 'Number of Edges': G.number_of_edges(),\n", " 'log N': np.log(G.number_of_nodes())\n", " }, ignore_index=True)\n", "\n", " # compute the average degree and add it to the dataframe\n", " avg_deg = np.mean([d for n, d in G.degree()])\n", " analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Degree'] = avg_deg\n", "\n", " # compute the average clustering coefficient and add it to the dataframe\n", " avg_clustering = nx.average_clustering(G)\n", " analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n", "\n", " # compute the average shortest path length and add it to the dataframe\n", " average_shortest_path_length = average_shortest_path(G)\n", " analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n", "\n", " # compute the betweenness centrality and add it to the dataframe\n", " betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n", " analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n", "\n", " # save memory\n", " del G\n", "\n", "analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n", "analysis_results_erods\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\tNumber of edges in the original graph: 3663807\n", "\tNumber of edges in the random graph: 3660219\n" ] }, { "ename": "UnboundLocalError", "evalue": "local variable 'G_copy' referenced before assignment", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[7], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Clustering Coefficient\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m avg_clustering\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# compute the average shortest path length and add it to the dataframe\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m average_shortest_path_length \u001b[38;5;241m=\u001b[39m \u001b[43maverage_shortest_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Shortest Path Length\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m average_shortest_path_length\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# compute the betweenness centrality and add it to the dataframe\u001b[39;00m\n", "File \u001b[0;32m~/github/small-worlds/utils.py:497\u001b[0m, in \u001b[0;36maverage_shortest_path\u001b[0;34m(G, k)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m\\t\u001b[39;00m\u001b[39mNumber of edges after removing \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m% o\u001b[39;00m\u001b[39mf nodes: \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m.\u001b[39mformat((k)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m, G_copy\u001b[39m.\u001b[39mnumber_of_edges()))\n\u001b[1;32m 496\u001b[0m tmp \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m--> 497\u001b[0m connected_components \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(nx\u001b[39m.\u001b[39mconnected_components(G_copy))\n\u001b[1;32m 498\u001b[0m \u001b[39m# remove all the connected components with less than 10 nodes\u001b[39;00m\n\u001b[1;32m 499\u001b[0m connected_components \u001b[39m=\u001b[39m [c \u001b[39mfor\u001b[39;00m c \u001b[39min\u001b[39;00m connected_components \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(c) \u001b[39m>\u001b[39m \u001b[39m10\u001b[39m]\n", "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'G_copy' referenced before assignment" ] } ], "source": [ "# do the same with the watts strogatz graphs\n", "\n", "analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n", "\n", "for graph in graphs_all:\n", " G = create_random_graphs(graph, 'watts_strogatz', save=False)\n", "\n", " # add the basic information to the dataframe\n", " analysis_results_ws = analysis_results_ws.append({\n", " 'Graph': G.name,\n", " 'Number of Nodes': G.number_of_nodes(),\n", " 'Number of Edges': G.number_of_edges(),\n", " 'log N': np.log(G.number_of_nodes())\n", " }, ignore_index=True)\n", "\n", " # compute the average degree and add it to the dataframe\n", " avg_deg = np.mean([d for n, d in G.degree()])\n", " analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Degree'] = avg_deg\n", "\n", " # compute the average clustering coefficient and add it to the dataframe\n", " avg_clustering = nx.average_clustering(G)\n", " analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n", "\n", " # compute the average shortest path length and add it to the dataframe\n", " average_shortest_path_length = average_shortest_path(G)\n", " analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n", "\n", " # compute the betweenness centrality and add it to the dataframe\n", " betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n", " analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n", "\n", " # save memory\n", " del G\n", "\n", "analysis_results_ws.to_pickle('analysis_results_ws.pkl')\n", "analysis_results_ws" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G = nx.watts_strogatz_graph(1000, 4, 0.1)\n", "adj = nx.to_scipy_sparse_array(G)\n", "# print info about the graph and the matrix\n", "print(\"Number of nodes: \", G.number_of_nodes())\n", "print(\"Number of edges: \", G.number_of_edges())\n", "print(\"Average degree: \", np.mean([d for n, d in G.degree()]))\n", "print(\"Average clustering coefficient: \", nx.average_clustering(G))\n", "print(\"Average shortest path length: \", nx.average_shortest_path_length(G))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scipy.sparse as sp\n", "\n", "# randomly swap edges, but keep the degree of each node the same (i.e. the degree sequence is preserved)\n", "def random_swap_edges(adj, nswap=1, max_tries=100):\n", " # use numpy and scipy to speed up the process\n", " adj = sp.csr_matrix(adj)\n", " n, m = adj.shape \n", " assert n == m # make sure the adjacency matrix is square\n", " adj_triu = sp.triu(adj) # only consider the upper triangular part of the adjacency matrix\n", " adj_tuple = sp.find(adj_triu) # get the indices and values of the non-zero elements\n", " adj_edges = np.array(list(zip(adj_tuple[0], adj_tuple[1]))) # get the edges\n", " adj_data = adj_tuple[2] # get the edge weights\n", " nnz = adj_edges.shape[0] # number of non-zero elements\n", " assert nnz == adj_data.shape[0] # make sure the number of edges and edge weights are the same\n", " for _ in range(nswap): # repeat nswap times\n", " # choose random edges to swap\n", " edge_idx = np.random.choice(nnz, size=2, replace=False) # choose two random edges\n", " edge1 = adj_edges[edge_idx[0]] # get the first edge\n", " edge2 = adj_edges[edge_idx[1]] # get the second edge\n", " # make sure the edges are not self-loops and not already connected\n", " if edge1[0] == edge2[0] or edge1[0] == edge2[1] or edge1[1] == edge2[0] or edge1[1] == edge2[1] or adj[edge1[0], edge2[1]] or adj[edge2[0], edge1[1]]: \n", " continue # if the edges are self-loops or already connected, try again\n", " # swap the edges\n", " adj[edge1[0], edge1[1]] = 0 \n", " adj[edge2[0], edge2[1]] = 0 \n", " adj[edge1[0], edge2[1]] = 1\n", " adj[edge2[0], edge1[1]] = 1\n", " # update adj_edges and adj_data\n", " adj_edges[edge_idx[0]] = [edge1[0], edge2[1]]\n", " adj_edges[edge_idx[1]] = [edge2[0], edge1[1]]\n", " adj_data[edge_idx[0]] = 1\n", " adj_data[edge_idx[1]] = 1\n", " return adj\n", "\n", "adj_swapped = random_swap_edges(adj, nswap=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create a new graph from the swapped adjacency matrix\n", "G_swapped = nx.from_scipy_sparse_matrix(adj_swapped)\n", "# print info about the graph and the matrix\n", "print(\"Number of nodes: \", G_swapped.number_of_nodes())\n", "print(\"Number of edges: \", G_swapped.number_of_edges())\n", "print(\"Average degree: \", np.mean([d for n, d in G_swapped.degree()]))\n", "print(\"Average clustering coefficient: \", nx.average_clustering(G_swapped))\n", "print(\"Average shortest path length: \", nx.average_shortest_path_length(G_swapped))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.8 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } } }, "nbformat": 4, "nbformat_minor": 2 }