small-worlds/testing.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import wget\n",
    "import zipfile\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import plotly.graph_objects as go\n",
    "from utils import *\n",
    "from collections import Counter\n",
    "from tqdm import tqdm\n",
    "import time\n",
    "\n",
    "# ignore warnings\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Graph</th>\n",
       "      <th>Number of Nodes</th>\n",
       "      <th>Number of Edges</th>\n",
       "      <th>Average Degree</th>\n",
       "      <th>Average Clustering Coefficient</th>\n",
       "      <th>log N</th>\n",
       "      <th>Average Shortest Path Length</th>\n",
       "      <th>betweenness centrality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brightkite Checkins Graph</td>\n",
       "      <td>7191</td>\n",
       "      <td>3663807</td>\n",
       "      <td>1018.997914</td>\n",
       "      <td>0.702854</td>\n",
       "      <td>8.880586</td>\n",
       "      <td>2.411011</td>\n",
       "      <td>0.00022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Gowalla Checkins Graph</td>\n",
       "      <td>10702</td>\n",
       "      <td>303104</td>\n",
       "      <td>56.644366</td>\n",
       "      <td>0.505597</td>\n",
       "      <td>9.278186</td>\n",
       "      <td>5.222903</td>\n",
       "      <td>0.000301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Foursquare EU Checkins Graph</td>\n",
       "      <td>20282</td>\n",
       "      <td>7430376</td>\n",
       "      <td>732.706439</td>\n",
       "      <td>0.597097</td>\n",
       "      <td>9.917489</td>\n",
       "      <td>2.2843</td>\n",
       "      <td>0.000089</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Foursquare IT Checkins Graph</td>\n",
       "      <td>3730</td>\n",
       "      <td>629749</td>\n",
       "      <td>337.667024</td>\n",
       "      <td>0.683565</td>\n",
       "      <td>8.224164</td>\n",
       "      <td>2.185477</td>\n",
       "      <td>0.000428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Brightkite Friendship Graph</td>\n",
       "      <td>5928</td>\n",
       "      <td>34673</td>\n",
       "      <td>11.698043</td>\n",
       "      <td>0.219749</td>\n",
       "      <td>8.687442</td>\n",
       "      <td>5.052162</td>\n",
       "      <td>0.000448</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>(Filtered) Gowalla Friendship Graph</td>\n",
       "      <td>8396</td>\n",
       "      <td>29122</td>\n",
       "      <td>6.937113</td>\n",
       "      <td>0.217544</td>\n",
       "      <td>9.035511</td>\n",
       "      <td>4.558532</td>\n",
       "      <td>0.000357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Foursquare IT Friendship Graph</td>\n",
       "      <td>2073</td>\n",
       "      <td>6217</td>\n",
       "      <td>5.99807</td>\n",
       "      <td>0.148489</td>\n",
       "      <td>7.636752</td>\n",
       "      <td>19.530752</td>\n",
       "      <td>0.000879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Foursquare EU Friendship Graph</td>\n",
       "      <td>16491</td>\n",
       "      <td>59419</td>\n",
       "      <td>7.206234</td>\n",
       "      <td>0.167946</td>\n",
       "      <td>9.710570</td>\n",
       "      <td>23.713864</td>\n",
       "      <td>0.000272</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 Graph Number of Nodes Number of Edges  \\\n",
       "0            Brightkite Checkins Graph            7191         3663807   \n",
       "1               Gowalla Checkins Graph           10702          303104   \n",
       "2         Foursquare EU Checkins Graph           20282         7430376   \n",
       "3         Foursquare IT Checkins Graph            3730          629749   \n",
       "4          Brightkite Friendship Graph            5928           34673   \n",
       "5  (Filtered) Gowalla Friendship Graph            8396           29122   \n",
       "6       Foursquare IT Friendship Graph            2073            6217   \n",
       "7       Foursquare EU Friendship Graph           16491           59419   \n",
       "\n",
       "  Average Degree Average Clustering Coefficient     log N  \\\n",
       "0    1018.997914                       0.702854  8.880586   \n",
       "1      56.644366                       0.505597  9.278186   \n",
       "2     732.706439                       0.597097  9.917489   \n",
       "3     337.667024                       0.683565  8.224164   \n",
       "4      11.698043                       0.219749  8.687442   \n",
       "5       6.937113                       0.217544  9.035511   \n",
       "6        5.99807                       0.148489  7.636752   \n",
       "7       7.206234                       0.167946  9.710570   \n",
       "\n",
       "  Average Shortest Path Length betweenness centrality  \n",
       "0                     2.411011                0.00022  \n",
       "1                     5.222903               0.000301  \n",
       "2                       2.2843               0.000089  \n",
       "3                     2.185477               0.000428  \n",
       "4                     5.052162               0.000448  \n",
       "5                     4.558532               0.000357  \n",
       "6                    19.530752               0.000879  \n",
       "7                    23.713864               0.000272  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import the graphs from the saved files\n",
    "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
    "G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n",
    "G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n",
    "G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n",
    "\n",
    "G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n",
    "G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n",
    "G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n",
    "G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n",
    "\n",
    "# open the dataframe object\n",
    "analysis_results = pd.read_pickle('analysis_results.pkl')\n",
    "analysis_results"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The first thing we want to do is simple: create a random reference graph for each graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
    "\n",
    "# Group the loaded graphs by kind so that later cells can loop over them.\n",
    "checkins_graphs = [\n",
    "    G_brighkite_checkins,\n",
    "    G_gowalla_checkins,\n",
    "    G_foursquareEU_checkins,\n",
    "    G_foursquareIT_checkins,\n",
    "]\n",
    "# NOTE: this list puts Foursquare IT before Foursquare EU, unlike the check-in list.\n",
    "friendships_graph = [\n",
    "    G_brighkite_friends,\n",
    "    G_gowalla_friends,\n",
    "    G_foursquareIT_friends,\n",
    "    G_foursquareEU_friends,\n",
    "]\n",
    "\n",
    "# every graph, check-ins first\n",
    "graphs_all = [*checkins_graphs, *friendships_graph]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random reference graphs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this entire cell is disabled -- every line below is commented out.\n",
    "# If it is re-enabled: the 'Average Shortest Path Length' and 'betweenness centrality'\n",
    "# assignments filter on `G.name`, while the loop variable is `graph` and the earlier\n",
    "# assignments filter on `G_erd.name` / `G_ws.name`. Confirm which name is intended.\n",
    "\n",
    "# analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
    "\n",
    "# analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
    "\n",
    "# for graph in graphs_all:\n",
    "#     print(\"\\nCreating random graph for graph: \", graph.name)\n",
    "#     G_erd = create_random_graphs(graph, model='erdos', save=False)\n",
    "#     G_ws = create_random_graphs(graph, model='watts_strogatz', save=False)\n",
    "    \n",
    "#     # add the basic information to the dataframe\n",
    "#     analysis_results_erods = analysis_results_erods.append({\n",
    "#         'Graph': G_erd.name,\n",
    "#         'Number of Nodes': G_erd.number_of_nodes(),\n",
    "#         'Number of Edges': G_erd.number_of_edges(),\n",
    "#         'log N': np.log(G_erd.number_of_nodes())\n",
    "#     }, ignore_index=True)\n",
    "\n",
    "#     # add the basic information to the dataframe\n",
    "#     analysis_results_ws = analysis_results_ws.append({\n",
    "#         'Graph': G_ws.name,\n",
    "#         'Number of Nodes': G_ws.number_of_nodes(),\n",
    "#         'Number of Edges': G_ws.number_of_edges(),\n",
    "#         'log N': np.log(G_ws.number_of_nodes())\n",
    "#     }, ignore_index=True)\n",
    "\n",
    "#     # compute the average degree and add it to the dataframes\n",
    "#     avg_deg_erd = np.mean([d for n, d in G_erd.degree()])\n",
    "#     avg_deg_ws = np.mean([d for n, d in G_ws.degree()])\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G_erd.name, 'Average Degree'] = avg_deg_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G_ws.name, 'Average Degree'] = avg_deg_ws\n",
    "\n",
    "#     # compute the average clustering coefficient and add it to the dataframes\n",
    "#     avg_clustering_erd = average_clustering_coefficient(G_erd, k = 0.9)\n",
    "#     avg_clustering_ws = average_clustering_coefficient(G_ws, k = 0.9)\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G_erd.name, 'Average Clustering Coefficient'] = avg_clustering_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G_ws.name, 'Average Clustering Coefficient'] = avg_clustering_ws\n",
    "\n",
    "#     # compute the average shortest path length and add it to the dataframes\n",
    "#     average_shortest_path_length_erd = average_shortest_path(G_erd, k = 0.9)\n",
    "#     average_shortest_path_length_ws = average_shortest_path(G_ws, k = 0.9)\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length_ws\n",
    "\n",
    "#     # compute the betweenness centrality and add it to the dataframes\n",
    "#     betweenness_centrality_erd = np.mean(list(betweenness_centrality_parallel(G_erd, 4, k = 0.9).values()))\n",
    "#     betweenness_centrality_ws = np.mean(list(betweenness_centrality_parallel(G_ws, 4, k = 0.9).values()))\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality_ws\n",
    "\n",
    "#     # save memory\n",
    "#     del G_erd, G_ws\n",
    "\n",
    "# analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n",
    "# analysis_results_ws.to_pickle('analysis_results_ws.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Small Worldness\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have already computed the average clustering coefficient and the average shortest path length for our networks, so we can save a lot of time by skipping these computations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def omega(G, C_og, L_og, niter, nrand, seed=42):\n",
    "    \"\"\"Compute the small-world coefficient omega of G from precomputed metrics.\n",
    "\n",
    "    omega = Lr / L - C / Cl, where L and C are the average shortest path\n",
    "    length and average clustering coefficient of G (passed in as L_og and\n",
    "    C_og so they are not recomputed here), Lr is the mean average shortest\n",
    "    path length over the generated random reference graphs, and Cl is the\n",
    "    largest average clustering coefficient among C_og and the generated\n",
    "    lattice reference graphs.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    G : NetworkX graph\n",
    "        Graph handed to nx.random_reference and nx.lattice_reference.\n",
    "    C_og : float\n",
    "        Average clustering coefficient already computed for G.\n",
    "    L_og : float\n",
    "        Average shortest path length already computed for G.\n",
    "    niter : int\n",
    "        niter used for the lattice references; the random references use\n",
    "        twice this value.\n",
    "    nrand : int\n",
    "        Number of random/lattice reference pairs to generate (at least 1).\n",
    "    seed : int or None, default 42\n",
    "        Base seed. Reference pair i is generated with seed + i, so the pairs\n",
    "        differ from one another while the whole run stays reproducible.\n",
    "        None is passed through unchanged to the NetworkX generators.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        The value Lr / L_og - C_og / Cl.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If nrand is smaller than 1 (there would be nothing to average).\n",
    "    \"\"\"\n",
    "    if nrand < 1:\n",
    "        raise ValueError('nrand must be at least 1')\n",
    "\n",
    "    niter_lattice_reference = niter\n",
    "    niter_random_reference = niter * 2\n",
    "\n",
    "    random_path_lengths = []\n",
    "\n",
    "    # Start from the clustering of G itself; it is only replaced when a\n",
    "    # generated lattice reference turns out to be more clustered.\n",
    "    Cl = C_og\n",
    "\n",
    "    for i in range(nrand):\n",
    "        # Passing the same integer seed on every iteration would rebuild the\n",
    "        # same reference graph nrand times, so derive one seed per iteration.\n",
    "        iter_seed = None if seed is None else seed + i\n",
    "\n",
    "        # Random reference: contributes to the mean path length Lr\n",
    "        Gr = nx.random_reference(G, niter=niter_random_reference, seed=iter_seed)\n",
    "        random_path_lengths.append(nx.average_shortest_path_length(Gr))\n",
    "\n",
    "        # Lattice reference: keep the highest clustering coefficient seen so far\n",
    "        Gl = nx.lattice_reference(G, niter=niter_lattice_reference, seed=iter_seed)\n",
    "        Cl = max(Cl, nx.average_clustering(Gl))\n",
    "\n",
    "    Lr = np.mean(random_path_lengths)\n",
    "\n",
    "    # The original assigned this to a local variable and never returned it.\n",
    "    return (Lr / L_og) - (C_og / Cl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Brightkite Checkins Graph\n"
     ]
    }
   ],
   "source": [
    "# Reload the precomputed metrics so this cell does not depend on earlier edits to the frame.\n",
    "analysis_results = pd.read_pickle('analysis_results.pkl')\n",
    "\n",
    "# Collect one record per graph and build the dataframe once at the end\n",
    "# (DataFrame.append is deprecated and copies the whole frame on every call).\n",
    "omega_records = []\n",
    "\n",
    "for G in checkins_graphs:\n",
    "    print(G.name)\n",
    "    graph_row = analysis_results.loc[analysis_results['Graph'] == G.name]\n",
    "    C_og = graph_row['Average Clustering Coefficient'].values[0]\n",
    "    L_og = graph_row['Average Shortest Path Length'].values[0]\n",
    "\n",
    "    # Keep the result under its own name: rebinding `omega` would replace the\n",
    "    # function with its result and make the next iteration fail.\n",
    "    omega_value = omega(G, C_og, L_og, 2, 3)\n",
    "\n",
    "    omega_records.append({'Graph': G.name, 'omega': omega_value})\n",
    "\n",
    "omega_results = pd.DataFrame(omega_records, columns=['Graph', 'omega'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same computation for the friendship graphs; the rows are added to omega_results.\n",
    "# NOTE: re-running this cell adds the friendship rows again.\n",
    "friendship_omega_records = []\n",
    "\n",
    "# The list defined earlier in the notebook is named `friendships_graph` (no trailing s).\n",
    "for G in friendships_graph:\n",
    "    print(G.name)\n",
    "    graph_row = analysis_results.loc[analysis_results['Graph'] == G.name]\n",
    "    C_og = graph_row['Average Clustering Coefficient'].values[0]\n",
    "    L_og = graph_row['Average Shortest Path Length'].values[0]\n",
    "\n",
    "    # Keep the result under its own name: rebinding `omega` would replace the\n",
    "    # function with its result and make the next iteration fail.\n",
    "    omega_value = omega(G, C_og, L_og, 2, 3)\n",
    "\n",
    "    friendship_omega_records.append({'Graph': G.name, 'omega': omega_value})\n",
    "\n",
    "# pd.concat replaces the deprecated DataFrame.append and runs once, not once per row.\n",
    "omega_results = pd.concat(\n",
    "    [omega_results, pd.DataFrame(friendship_omega_records, columns=['Graph', 'omega'])],\n",
    "    ignore_index=True,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.8 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}