{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"import wget\n",
"import zipfile\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"import plotly.graph_objects as go\n",
"from utils import *\n",
"from collections import Counter\n",
"from tqdm import tqdm\n",
"import time\n",
"\n",
"# ignore warnings\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# import the graphs from the saved files\n",
"G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
"G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n",
"G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n",
"G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n",
"\n",
"G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n",
"G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n",
"G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n",
"G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n",
"\n",
"# # open the dataframe object\n",
"# analysis_results = pd.read_pickle('analysis_results.pkl')\n",
"# analysis_results"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The first thing that we want to do is very simple, create a random reference for each graph"
]
},
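{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`create_random_graphs` is defined in `utils.py`. As a rough mental model (the actual utils implementation may differ in seeding, naming and saving), an Erdős-Rényi reference matching a graph's size looks like the sketch below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of an Erdős-Rényi reference with matched node count and density;\n",
"# the real create_random_graphs in utils.py may differ.\n",
"def erdos_reference_sketch(G):\n",
"    n = G.number_of_nodes()\n",
"    m = G.number_of_edges()\n",
"    p = 2 * m / (n * (n - 1))  # edge probability that matches the observed density\n",
"    R = nx.fast_gnp_random_graph(n, p)\n",
"    R.name = G.name + ' (erdos)'\n",
"    return R"
]
},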
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n",
"friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n",
"\n",
"graphs_all = checkins_graphs + friendships_graph"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Original Graphs\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for graph in graphs_all:\n",
" # add basic graph statistics\n",
" analysis_results = analysis_results.append(\n",
" {'Graph': graph.name, \n",
" 'Number of Nodes': graph.number_of_nodes(), \n",
" 'log N': np.log(graph.number_of_nodes()),\n",
" 'Number of Edges': graph.number_of_edges()}, \n",
" ignore_index=True)\n",
"\n",
" # add average degree\n",
" print(\"Computing average degree for graph: \", graph.name)\n",
" avg_deg = np.mean([d for n, d in graph.degree()])\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Degree'] = avg_deg\n",
"\n",
" # add average clustering coefficient\n",
" print(\"Computing average clustering coefficient for graph: \", graph.name)\n",
" avg_clustering = nx.average_clustering(graph)\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # add average shortest path length\n",
" print(\"Computing average shortest path length for graph: \", graph.name)\n",
" average_shortest_path_length = average_shortest_path(graph)\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # add betweenness centrality\n",
" print(\"Computing betweenness centrality for graph: \", graph.name)\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6).values()))\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'betweenness centrality'] = betweenness_centrality\n",
" print()\n",
"\n",
"\n",
"analysis_results\n",
"analysis_results.to_pickle('analysis_results.pkl')"
]
},
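{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`average_shortest_path` and `betweenness_centrality_parallel` come from `utils.py`. For reference, the cell below sketches how a parallel betweenness helper of this kind is typically built, following the networkx parallel-betweenness example: split the source nodes into chunks, compute per-chunk contributions in a process pool, and sum them. The utils version may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from multiprocessing import Pool\n",
"import itertools\n",
"\n",
"def _chunks(iterable, size):\n",
"    # yield successive tuples of at most `size` items\n",
"    it = iter(iterable)\n",
"    while True:\n",
"        chunk = tuple(itertools.islice(it, size))\n",
"        if not chunk:\n",
"            return\n",
"        yield chunk\n",
"\n",
"# Sketch of a parallel betweenness helper; the real utils.betweenness_centrality_parallel may differ.\n",
"def betweenness_centrality_parallel_sketch(G, processes=4):\n",
"    with Pool(processes=processes) as pool:\n",
"        node_chunks = list(_chunks(G.nodes(), max(1, G.order() // (processes * 4))))\n",
"        num_chunks = len(node_chunks)\n",
"        # each worker computes betweenness restricted to its chunk of source nodes\n",
"        partial = pool.starmap(\n",
"            nx.betweenness_centrality_subset,\n",
"            zip([G] * num_chunks, node_chunks, [list(G)] * num_chunks, [True] * num_chunks),\n",
"        )\n",
"    # sum the per-chunk contributions into a single centrality dict\n",
"    bc = partial[0]\n",
"    for d in partial[1:]:\n",
"        for node in d:\n",
"            bc[node] += d[node]\n",
"    return bc"
]
},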
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random shit"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'name'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 6\u001b[0m G \u001b[38;5;241m=\u001b[39m create_random_graphs(graph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merods\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# add the basic information to the dataframe\u001b[39;00m\n\u001b[1;32m 9\u001b[0m analysis_results_erods \u001b[38;5;241m=\u001b[39m analysis_results_erods\u001b[38;5;241m.\u001b[39mappend({\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mG\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Nodes\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_nodes(),\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Edges\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_edges(),\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlog N\u001b[39m\u001b[38;5;124m'\u001b[39m: np\u001b[38;5;241m.\u001b[39mlog(G\u001b[38;5;241m.\u001b[39mnumber_of_nodes())\n\u001b[1;32m 14\u001b[0m }, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# compute the average degree and add it to the dataframe\u001b[39;00m\n\u001b[1;32m 17\u001b[0m avg_deg \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmean([d \u001b[38;5;28;01mfor\u001b[39;00m n, d \u001b[38;5;129;01min\u001b[39;00m G\u001b[38;5;241m.\u001b[39mdegree()])\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'name'"
]
}
],
"source": [
"analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"# read all the graphs gpickle files in the data/random/erdos folder. Then run the same analysis as before for this graphs\n",
"\n",
"for graph in graphs_all:\n",
" G = create_random_graphs(graph, \"erods\")\n",
"\n",
" # add the basic information to the dataframe\n",
" analysis_results_erods = analysis_results_erods.append({\n",
" 'Graph': G.name,\n",
" 'Number of Nodes': G.number_of_nodes(),\n",
" 'Number of Edges': G.number_of_edges(),\n",
" 'log N': np.log(G.number_of_nodes())\n",
" }, ignore_index=True)\n",
"\n",
" # compute the average degree and add it to the dataframe\n",
" avg_deg = np.mean([d for n, d in G.degree()])\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
"\n",
" # compute the average clustering coefficient and add it to the dataframe\n",
" avg_clustering = nx.average_clustering(G)\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # compute the average shortest path length and add it to the dataframe\n",
" average_shortest_path_length = average_shortest_path(G)\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # compute the betweenness centrality and add it to the dataframe\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
"\n",
" # save memory\n",
" del G\n",
"\n",
"analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n",
"analysis_results_erods\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\tNumber of edges in the original graph: 3663807\n",
"\tNumber of edges in the random graph: 3660219\n"
]
},
{
"ename": "UnboundLocalError",
"evalue": "local variable 'G_copy' referenced before assignment",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Clustering Coefficient\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m avg_clustering\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# compute the average shortest path length and add it to the dataframe\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m average_shortest_path_length \u001b[38;5;241m=\u001b[39m \u001b[43maverage_shortest_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Shortest Path Length\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m average_shortest_path_length\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# compute the betweenness centrality and add it to the dataframe\u001b[39;00m\n",
"File \u001b[0;32m~/github/small-worlds/utils.py:497\u001b[0m, in \u001b[0;36maverage_shortest_path\u001b[0;34m(G, k)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m\\t\u001b[39;00m\u001b[39mNumber of edges after removing \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m% o\u001b[39;00m\u001b[39mf nodes: \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m.\u001b[39mformat((k)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m, G_copy\u001b[39m.\u001b[39mnumber_of_edges()))\n\u001b[1;32m 496\u001b[0m tmp \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m--> 497\u001b[0m connected_components \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(nx\u001b[39m.\u001b[39mconnected_components(G_copy))\n\u001b[1;32m 498\u001b[0m \u001b[39m# remove all the connected components with less than 10 nodes\u001b[39;00m\n\u001b[1;32m 499\u001b[0m connected_components \u001b[39m=\u001b[39m [c \u001b[39mfor\u001b[39;00m c \u001b[39min\u001b[39;00m connected_components \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(c) \u001b[39m>\u001b[39m \u001b[39m10\u001b[39m]\n",
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'G_copy' referenced before assignment"
]
}
],
"source": [
"# do the same with the watts strogatz graphs\n",
"\n",
"analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"for graph in graphs_all:\n",
" G = create_random_graphs(graph, 'watts_strogatz', save=False)\n",
"\n",
" # add the basic information to the dataframe\n",
" analysis_results_ws = analysis_results_ws.append({\n",
" 'Graph': G.name,\n",
" 'Number of Nodes': G.number_of_nodes(),\n",
" 'Number of Edges': G.number_of_edges(),\n",
" 'log N': np.log(G.number_of_nodes())\n",
" }, ignore_index=True)\n",
"\n",
" # compute the average degree and add it to the dataframe\n",
" avg_deg = np.mean([d for n, d in G.degree()])\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
"\n",
" # compute the average clustering coefficient and add it to the dataframe\n",
" avg_clustering = nx.average_clustering(G)\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # compute the average shortest path length and add it to the dataframe\n",
" average_shortest_path_length = average_shortest_path(G)\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # compute the betweenness centrality and add it to the dataframe\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
"\n",
" # save memory\n",
" del G\n",
"\n",
"analysis_results_ws.to_pickle('analysis_results_ws.pkl')\n",
"analysis_results_ws"
]
},
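{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (assuming the three pickles above were written successfully), we can stack the three result tables and compare each real network against its two null models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# side-by-side comparison; assumes analysis_results*.pkl exist from the cells above\n",
"real = pd.read_pickle('analysis_results.pkl').assign(Model='real')\n",
"erdos = pd.read_pickle('analysis_results_erdos.pkl').assign(Model='erdos')\n",
"ws = pd.read_pickle('analysis_results_ws.pkl').assign(Model='watts_strogatz')\n",
"comparison = pd.concat([real, erdos, ws], ignore_index=True)\n",
"comparison.sort_values(['Graph', 'Model'])"
]
},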
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G = nx.watts_strogatz_graph(1000, 4, 0.1)\n",
"adj = nx.to_scipy_sparse_array(G)\n",
"# print info about the graph and the matrix\n",
"print(\"Number of nodes: \", G.number_of_nodes())\n",
"print(\"Number of edges: \", G.number_of_edges())\n",
"print(\"Average degree: \", np.mean([d for n, d in G.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import scipy.sparse as sp\n",
"\n",
"# randomly swap edges, but keep the degree of each node the same (i.e. the degree sequence is preserved)\n",
"def random_swap_edges(adj, nswap=1, max_tries=100):\n",
" # use numpy and scipy to speed up the process\n",
" adj = sp.csr_matrix(adj)\n",
" n, m = adj.shape \n",
" assert n == m # make sure the adjacency matrix is square\n",
" adj_triu = sp.triu(adj) # only consider the upper triangular part of the adjacency matrix\n",
" adj_tuple = sp.find(adj_triu) # get the indices and values of the non-zero elements\n",
" adj_edges = np.array(list(zip(adj_tuple[0], adj_tuple[1]))) # get the edges\n",
" adj_data = adj_tuple[2] # get the edge weights\n",
" nnz = adj_edges.shape[0] # number of non-zero elements\n",
" assert nnz == adj_data.shape[0] # make sure the number of edges and edge weights are the same\n",
" for _ in range(nswap): # repeat nswap times\n",
" # choose random edges to swap\n",
" edge_idx = np.random.choice(nnz, size=2, replace=False) # choose two random edges\n",
" edge1 = adj_edges[edge_idx[0]] # get the first edge\n",
" edge2 = adj_edges[edge_idx[1]] # get the second edge\n",
" # make sure the edges are not self-loops and not already connected\n",
" if edge1[0] == edge2[0] or edge1[0] == edge2[1] or edge1[1] == edge2[0] or edge1[1] == edge2[1] or adj[edge1[0], edge2[1]] or adj[edge2[0], edge1[1]]: \n",
" continue # if the edges are self-loops or already connected, try again\n",
" # swap the edges\n",
" adj[edge1[0], edge1[1]] = 0 \n",
" adj[edge2[0], edge2[1]] = 0 \n",
" adj[edge1[0], edge2[1]] = 1\n",
" adj[edge2[0], edge1[1]] = 1\n",
" # update adj_edges and adj_data\n",
" adj_edges[edge_idx[0]] = [edge1[0], edge2[1]]\n",
" adj_edges[edge_idx[1]] = [edge2[0], edge1[1]]\n",
" adj_data[edge_idx[0]] = 1\n",
" adj_data[edge_idx[1]] = 1\n",
" return adj\n",
"\n",
"adj_swapped = random_swap_edges(adj, nswap=1)"
]
},
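{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity check for the swap: the degree sequence (the row sums of the adjacency matrix) should be identical before and after rewiring."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# degree sequences before and after the swap (row sums of the adjacency matrices)\n",
"deg_before = np.asarray(adj.sum(axis=1)).ravel()\n",
"deg_after = np.asarray(adj_swapped.sum(axis=1)).ravel()\n",
"print(\"Degree sequence preserved: \", np.array_equal(deg_before, deg_after))"
]
},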
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create a new graph from the swapped adjacency matrix\n",
"G_swapped = nx.from_scipy_sparse_matrix(adj_swapped)\n",
"# print info about the graph and the matrix\n",
"print(\"Number of nodes: \", G_swapped.number_of_nodes())\n",
"print(\"Number of edges: \", G_swapped.number_of_edges())\n",
"print(\"Average degree: \", np.mean([d for n, d in G_swapped.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G_swapped))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G_swapped))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.8 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}