now we can create the edge list in python, started the analysis

main
Luca Lombardo 2 years ago
parent 4a044b5e8d
commit b218e00309

@@ -1,221 +0,0 @@
"""
NOTEs:
- This file is note meant to be run, it's just a collection of functions that are used in the other files. It's just a way to keep the code clean and organized.
- Why do I use os.path.join and not the "/"? Because it's more portable, it works on every OS, while "/" works only on Linux and Mac. If you want to use it on Windows, you have to change all the "/" with "\". With os.path.join you don't have to worry about it and, as always, f*** Microsoft.
"""
import os
import wget
import zipfile
import pandas as pd
import networkx as nx
from typing import Literal
from itertools import combinations
# ------------------------------------------------------------------------#
def download_datasets():
    urls = [
        ["https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz", "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"],
        ["https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz", "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"],
        ["http://www-public.it-sudparis.eu/~zhang_da/pub/dataset_tsmc2014.zip"]
    ]
    folders = ["brightkite", "gowalla", "foursquare"]
    # check if the data folder exists
    if not os.path.exists("data"):
        os.mkdir("data")
    # create the three sub-folders brightkite, gowalla and foursquare inside data, if they don't exist yet
    for folder in folders:
        if not os.path.exists(os.path.join("data", folder)):
            os.mkdir(os.path.join("data", folder))
    # download each list of URLs into its corresponding folder; if a file has already been downloaded, skip it
    for i in range(len(urls)):
        for url in urls[i]:
            if not os.path.exists(os.path.join("data", folders[i], url.split("/")[-1])):
                wget.download(url, os.path.join("data", folders[i]))
    # unzip the .gz files inside the brightkite and gowalla folders
    for file in os.listdir(os.path.join("data", "brightkite")):
        if file.endswith(".gz"):
            os.system("gunzip {}".format(os.path.join("data", "brightkite", file)))
    for file in os.listdir(os.path.join("data", "gowalla")):
        if file.endswith(".gz"):
            os.system("gunzip {}".format(os.path.join("data", "gowalla", file)))
    # extract the foursquare data, handling all the possible edge cases. More details below
    """
    The code below is ugly to read, but effective. Whatever messy state the files inside the foursquare folder are in (maybe after testing), it will fix them and bring them back to the layout the program expects.
    First it checks whether the foursquare folder contains a folder called dataset_tsmc2014. If it does, and the foursquare folder contains 3 files, everything is in order and the process is skipped. Otherwise, it moves all the files inside dataset_tsmc2014 to the foursquare folder and deletes dataset_tsmc2014 (we don't want a nested folder).
    If there is no dataset_tsmc2014 folder, it unzips the archive, moves all the .txt files from dataset_tsmc2014 to the foursquare folder, and then deletes both the dataset_tsmc2014 folder and the .zip file.
    """
    for file in os.listdir(os.path.join("data", "foursquare")):
        if file.endswith(".zip"):
            if os.path.exists(os.path.join("data", "foursquare", "dataset_tsmc2014")):
                if len(os.listdir(os.path.join("data", "foursquare"))) == 3:
                    pass
                else:
                    for f in os.listdir(os.path.join("data", "foursquare", "dataset_tsmc2014")):
                        os.rename(os.path.join("data", "foursquare", "dataset_tsmc2014", f), os.path.join("data", "foursquare", f))
                    os.rmdir(os.path.join("data", "foursquare", "dataset_tsmc2014"))
            else:
                with zipfile.ZipFile(os.path.join("data", "foursquare", file), 'r') as zip_ref:
                    zip_ref.extractall(os.path.join("data", "foursquare"))
                os.remove(os.path.join("data", "foursquare", file))
                for f in os.listdir(os.path.join("data", "foursquare", "dataset_tsmc2014")):
                    os.rename(os.path.join("data", "foursquare", "dataset_tsmc2014", f), os.path.join("data", "foursquare", f))
                os.rmdir(os.path.join("data", "foursquare", "dataset_tsmc2014"))
    # Now we want to clean our data. For both brightkite and gowalla, rename the _edges files to "brightkite_friends_edges.txt" and "gowalla_friends_edges.txt"
    for file in os.listdir(os.path.join("data", "brightkite")):
        if file.endswith("_edges.txt"):
            os.rename(os.path.join("data", "brightkite", file), os.path.join("data", "brightkite", "brightkite_friends_edges.txt"))
    for file in os.listdir(os.path.join("data", "gowalla")):
        if file.endswith("_edges.txt"):
            os.rename(os.path.join("data", "gowalla", file), os.path.join("data", "gowalla", "gowalla_friends_edges.txt"))
    # From the _totalCheckins.txt files we want to keep only the first and last columns, which are the user ID and the venue ID, and drop the header. We use pandas for that, then save the files as "brightkite_checkins.txt" and "gowalla_checkins.txt"
    for file in os.listdir(os.path.join("data", "brightkite")):
        if file.endswith("_totalCheckins.txt"):
            df = pd.read_csv(os.path.join("data", "brightkite", file), sep="\t", header=None)
            df = df[[0, 4]]  # user ID and venue ID (the SNAP file has 5 columns: user, time, latitude, longitude, venue)
            df.to_csv(os.path.join("data", "brightkite", "brightkite_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8")
            os.remove(os.path.join("data", "brightkite", file))
    for file in os.listdir(os.path.join("data", "gowalla")):
        if file.endswith("_totalCheckins.txt"):
            df = pd.read_csv(os.path.join("data", "gowalla", file), sep="\t", header=None)
            df = df[[0, 4]]  # user ID and venue ID
            df.to_csv(os.path.join("data", "gowalla", "gowalla_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8")
            os.remove(os.path.join("data", "gowalla", file))
    # For foursquare we keep only the first and second columns, which are the user ID and the venue ID, and drop the header. We do that for both the _NYC.txt and _TKY.txt files, then save them as "foursquare_checkins_NYC.txt" and "foursquare_checkins_TKY.txt"
    for file in os.listdir(os.path.join("data", "foursquare")):
        if file.endswith("_NYC.txt"):
            df = pd.read_csv(os.path.join("data", "foursquare", file), sep="\t", header=None, encoding="utf-8", encoding_errors="ignore")
            df = df[[0, 1]]
            df.to_csv(os.path.join("data", "foursquare", "foursquare_checkins_NYC.txt"), sep="\t", header=False, index=False)
            os.remove(os.path.join("data", "foursquare", file))
        if file.endswith("_TKY.txt"):
            df = pd.read_csv(os.path.join("data", "foursquare", file), sep="\t", header=None, encoding="utf-8", encoding_errors="ignore")
            df = df[[0, 1]]
            df.to_csv(os.path.join("data", "foursquare", "foursquare_checkins_TKY.txt"), sep="\t", header=False, index=False)
            os.remove(os.path.join("data", "foursquare", file))
# ------------------------------------------------------------------------#
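# A more portable alternative to shelling out to gunzip in download_datasets
# (a sketch, not used above): gzip and shutil are both in the standard library,
# so this also works on systems where the gunzip binary is not available.
import gzip
import shutil

def gunzip_file(gz_path: str) -> None:
    """Decompress a .gz file next to itself and remove the archive."""
    with gzip.open(gz_path, "rb") as src, open(gz_path[:-3], "wb") as dst:
        shutil.copyfileobj(src, dst)
    os.remove(gz_path)
# ------------------------------------------------------------------------#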
def create_checkins_graph_SLOW(dataset: Literal['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']) -> nx.Graph:
    """
    This function takes in input a tsv file where each line is a check-in, and returns an undirected networkx graph object.
    First we retrieve the unique user IDs: these are the nodes of our graph. Then we create a dictionary with the user IDs as keys and the lists of visited venue IDs as values. Two users are connected if they have visited the same venue at least once; the weight of the edge is the number of venues they have in common.
    """
    if dataset not in ['brightkite', 'gowalla',
                       'foursquareNYC', 'foursquareTKY']:
        raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquareNYC, foursquareTKY")
    # based on the dataset, we have to read the file in a different way.
    if dataset == "foursquareNYC":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
    elif dataset == "foursquareTKY":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
    else:
        file = os.path.join("data", dataset, "loc-{}_totalCheckins.txt".format(dataset))
        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "CheckIn", "latitude", "longitude", "VenueID"], encoding="utf-8", encoding_errors="ignore")
    # get the unique user IDs
    users = df["UserID"].unique()
    G = nx.Graph()
    G.add_nodes_from(users)
    print("Number of nodes added to the graph {}: {}".format(dataset, G.number_of_nodes()))
    users_venues = df.groupby("UserID")["VenueID"].apply(list).to_dict()
    # O(n^2) over all user pairs: fine for small datasets, extremely slow for large ones
    for user1, user2 in combinations(users, 2):
        intersection = set(users_venues[user1]) & set(users_venues[user2])
        if len(intersection) > 0:
            G.add_edge(user1, user2, weight=len(intersection))
    print("Number of edges added to the graph {}: {}".format(dataset, G.number_of_edges()))
    return G
# ------------------------------------------------------------------------#
def friendships_graph(dataset: Literal['brightkite', 'gowalla']) -> nx.Graph:
    """
    This function takes in input a tsv file with two columns, where each line is an edge, and returns an undirected networkx graph object. It uses pandas to read the file since it's faster than the standard python open() function. If we wanted to use the standard open() function instead, the following code would work as well:

    G = nx.Graph()
    with open(file, "r") as f:
        for line in f:
            node1, node2 = line.split("\t")
            G.add_edge(node1, node2)
    """
    if dataset not in ["brightkite", "gowalla"]:
        raise ValueError("The dataset must be brightkite or gowalla")
    file = os.path.join("data", dataset, "{}_friends_edges.txt".format(dataset))
    df = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
    return G
# ------------------------------------------------------------------------#
def checkins_graph_from_edges(dataset: Literal['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']) -> nx.Graph:
    """
    This function takes in input a tsv file with two columns, where each line is an edge, and returns an undirected networkx graph object. As in friendships_graph, we use pandas to read the file since it's faster than the standard python open() function.
    """
    if dataset not in ["brightkite", "gowalla", "foursquareNYC", "foursquareTKY"]:
        raise ValueError("The dataset must be brightkite, gowalla, foursquareNYC or foursquareTKY")
    # both foursquare edge lists live in the shared "foursquare" folder
    if dataset in ("foursquareNYC", "foursquareTKY"):
        file = os.path.join("data", "foursquare", "{}_checkins_edges.tsv".format(dataset))
    else:
        file = os.path.join("data", dataset, "{}_checkins_edges.tsv".format(dataset))
    df = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
    return G
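# ------------------------------------------------------------------------#
# Example usage from a notebook or another script (a sketch; as noted at the
# top, this module is not meant to be run directly):
#
#   from main import *
#   download_datasets()
#   G = friendships_graph("brightkite")
#   print(G.number_of_nodes(), G.number_of_edges())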

@@ -6,14 +6,24 @@
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"import wget\n",
"import zipfile\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"import plotly.graph_objects as go\n",
"from utils import *\n",
"from collections import Counter\n",
"from tqdm import tqdm\n",
"import time\n",
"\n",
"# ignore warnings\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
@@ -37,12 +47,14 @@
"We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n",
"\n",
"```\n",
"data\n",
"├── brightkite\n",
"│   ├── brightkite_checkins.txt\n",
"│   └── brightkite_friends_edges.txt\n",
"├── foursquare\n",
"│   ├── foursquare_checkins.txt\n",
"│   ├── foursquare_friends_edges.txt\n",
"│   └── raw_POIs.txt\n",
"└── gowalla\n",
"    ├── gowalla_checkins.txt\n",
"    └── gowalla_friends_edges.txt\n",
@@ -50,12 +62,17 @@
"\n",
"If any of the datasets is already downloaded, it will not be downloaded again. For further details about the function below, please refer to the `utils` module.\n",
"\n",
"> NOTE: the Stanford servers tend to be slow, so it may take a while to download the datasets (about 5 minutes for all of them).\n",
"\n",
"---\n",
"\n",
"### A deeper look at the datasets\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -71,10 +88,12 @@
"\n",
"## Brightkite\n",
"\n",
"[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. We will work with two different datasets. This is how they look after being filtered by the `download_dataset` function:\n",
"\n",
"- `data/brightkite/brightkite_friends_edges.txt`: the friendship network, a tsv file with 2 columns of user ids. This file is untouched by the function; it is already in the form of a graph edge list.\n",
"\n",
"- `data/brightkite/brightkite_checkins.txt`: the check-ins, a tsv file with 2 columns: user id and location. This is not in the form of a graph edge list; in the next section we will see how to convert it into a graph. Originally there were other columns, but we will not use them."
]
},
{
@@ -84,10 +103,11 @@
"source": [
"## Gowalla\n",
"\n",
"Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. As for Brightkite, we will work with two different datasets. This is how they look after being filtered by the `download_dataset` function:\n",
"\n",
"- `data/gowalla/gowalla_checkins.txt`: the check-ins, a tsv file with 2 columns: user id and location. This is not in the form of a graph edge list. Originally there were other columns, such as the time of the check-ins. During the filtering we used this information to keep only the check-ins from 2009, and then dropped it. This is why the number of check-ins is smaller than in the original dataset.\n",
"\n",
"- `data/gowalla/gowalla_friends_edges.txt`: the friendship network, a tsv file with 2 columns of user ids. This file is untouched by the function; it is already in the form of a graph edge list. In the next section, when we build the friendship network, we will only consider the users that have at least one check-in in 2009, to avoid biasing the analysis."
]
},
{
@@ -97,12 +117,15 @@
"source": [
"## Foursquare\n",
"\n",
"[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 22 months, from Apr. 2012 to Jan. 2014) global-scale check-in data collected from Foursquare, together with two snapshots of the user social network taken before and after the check-in data collection period (see the original paper for more details). We will work with three different datasets:\n",
"\n",
"- `data/foursquare/foursquare_checkins.txt`: a tsv file with 2 columns: user id and location. This is not in the form of a graph edge list. The file is left untouched by the function, but due to its size, in the next sections we will focus on the EU and IT sub-samples. The friendship edge list will be modified accordingly.\n",
"\n",
"- `data/foursquare/foursquare_friends_edges.txt`: the friendship network, a tsv file with 2 columns of user ids, already in the form of a graph edge list.\n",
"\n",
"- `data/foursquare/raw_POIs.txt`: the POIs, a tsv file with 2 columns: location and country ISO code. We are going to use this file to create the sub-samples of the dataset (see the sketch below).\n",
"\n",
"> **NOTE:** In this case I preferred not to take sub-samples based on time. The reason is that there may have been periods when the social network was not very popular in some countries, which would bias the analysis. Instead, I decided to take sub-samples based on country, which gives a more homogeneous dataset."
]
},
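{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sub-sampling code itself lives in `utils.py` and is not shown here. Assuming both files are tab-separated, with the venue id in the first column of `raw_POIs.txt` and the country code in the second, the idea is roughly:\n",
"\n",
"```python\n",
"pois = pd.read_csv(os.path.join('data', 'foursquare', 'raw_POIs.txt'),\n",
"                   sep='\\t', header=None, usecols=[0, 1], names=['venue_id', 'country'])\n",
"it_venues = set(pois.loc[pois['country'] == 'IT', 'venue_id'])\n",
"\n",
"# keep only the check-ins that happened at an Italian venue\n",
"checkins = pd.read_csv(os.path.join('data', 'foursquare', 'foursquare_checkins.txt'),\n",
"                       sep='\\t', header=None, names=['user_id', 'venue_id'])\n",
"checkins_IT = checkins[checkins['venue_id'].isin(it_venues)]\n",
"```"
]
},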
{
@@ -117,39 +140,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We are asked to construct the networks for the three datasets as an undirected graph $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and an edge indicates that two individuals visited the same location at least once.\n",
"\n",
"The check-ins files of the three datasets are not in the form of a graph edge list, so we need to manipulate them. Let's have a look at the number of lines of each file (note that gowalla is already filtered: only 2009 data are present)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def count_lines_and_unique_elements(file):\n",
"    df = pd.read_csv(file, sep='\\t', header=None)\n",
@@ -158,10 +158,9 @@
"\n",
"gowalla_path = os.path.join('data', 'gowalla', 'gowalla_checkins.txt')\n",
"brightkite_path = os.path.join('data', 'brightkite', 'brightkite_checkins.txt')\n",
"foursquare_path = os.path.join('data', 'foursquare', 'foursquare_checkins.txt')\n",
"\n",
"paths = [gowalla_path, brightkite_path, foursquare_path]\n",
"\n",
"for path in paths:\n",
"    print(path.split(os.sep)[-2])\n",
@@ -174,43 +173,91 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We would like to build a graph starting from an edge list. To do that, we are going to check, for each venue, all the users that visited it. Then, we will create an edge between each pair of users that visited the same venue (avoiding repetitions). This can easily be done in python, but it's going to be a bit slow (this is why we are considering sub-samples of the datasets). Let's see how to do it.\n",
"\n",
"```python\n",
"# let df be the dataframe [\"user_id\", \"venue_id\"] of the checkins\n",
"\n",
"venues_users = df.groupby(\"venue_id\")[\"user_id\"].apply(set)\n",
"\n",
"for users in venues_users:\n",
"    for user1, user2 in combinations(users, 2):\n",
"        G.add_edge(user1, user2)\n",
"```\n",
"\n",
"In the `utils.py` module we have a function that does exactly this, called `create_graph_from_checkins`. It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n",
"\n",
"- `brightkite`\n",
"- `gowalla`\n",
"- `foursquareEU`\n",
"- `foursquareIT`\n",
"\n",
"Let's see how it works:"
]
},
{
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"G_brighkite_checkins = checkins_graph_from_edges('brightkite')\n", "# It takes about 4 minutes to create the all the 4 graphs on a i7-8750H CPU\n",
"\n",
"G_brighkite_checkins = create_graph_from_checkins('brightkite')\n",
"G_brighkite_checkins.name = 'Brightkite Checkins Graph'\n", "G_brighkite_checkins.name = 'Brightkite Checkins Graph'\n",
"\n", "\n",
"G_gowalla_checkins = checkins_graph_from_edges('gowalla')\n", "G_gowalla_checkins = create_graph_from_checkins('gowalla')\n",
"G_gowalla_checkins.name = 'Gowalla Checkins Graph'\n", "G_gowalla_checkins.name = 'Gowalla Checkins Graph'\n",
"\n", "\n",
"G_foursquareNYC_checkins = checkins_graph_from_edges('foursquareNYC')\n", "G_foursquareEU_checkins = create_graph_from_checkins('foursquareEU')\n",
"G_foursquareNYC_checkins.name = 'Foursquare NYC Checkins Graph'\n", "G_foursquareEU_checkins.name = 'Foursquare EU Checkins Graph'\n",
"\n",
"G_foursquareIT_checkins = create_graph_from_checkins('foursquareIT')\n",
"G_foursquareIT_checkins.name = 'Foursquare IT Checkins Graph'"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Friendship network\n",
"\n",
"Now we want to create a graph where two users are connected if they are friends in the social network. We intend friendship in a \"facebook way\", not a \"twitter way\": the graph is undirected and the edges are unweighted. A user can't be friends with himself, and can't be friends with another user unless that user is also friends with him.\n",
"\n",
"Since we filtered the check-ins for foursquare and gowalla, we only consider the users that are also present in the check-ins graph. We can build this graph with the function `create_friendships_graph` in the `utils.py` module. It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n",
"\n",
"- `brightkite`\n",
"- `gowalla`\n",
"- `foursquareEU`\n",
"- `foursquareIT`\n",
"\n",
"> **NOTE:** This function is implemented without needing the check-ins graphs to be loaded in memory: it works directly on the edge list files. This choice was made because someone may want to analyze only the friendship network, and in that case there is no need to load the check-ins graph and waste memory. Furthermore, networkx is tremendously slow at loading a graph from an edge list file (since it's written in pure python), so this choice is also motivated by speed. A sketch of the filtering idea follows below.\n",
"\n",
"Let's see how it works:"
]
},
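{
"cell_type": "markdown",
"metadata": {},
"source": [
"The real implementation is in `utils.py` and not shown in this diff; assuming tab-separated edge lists, its core idea can be sketched as:\n",
"\n",
"```python\n",
"checkins = pd.read_csv('data/gowalla/gowalla_checkins.txt', sep='\\t',\n",
"                       header=None, names=['user_id', 'venue_id'])\n",
"active_users = set(checkins['user_id'])\n",
"\n",
"edges = pd.read_csv('data/gowalla/gowalla_friends_edges.txt', sep='\\t',\n",
"                    header=None, names=['node1', 'node2'])\n",
"edges = edges[edges['node1'].isin(active_users) & edges['node2'].isin(active_users)]\n",
"\n",
"G = nx.from_pandas_edgelist(edges, 'node1', 'node2', create_using=nx.Graph())\n",
"```"
]
},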
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G_brighkite_friends = create_friendships_graph('brightkite')\n",
"print(\"Computation done for Brightkite friendship graph\")\n",
"G_brighkite_friends.name = 'Brightkite Friendship Graph'\n",
"\n",
"G_gowalla_friends = create_friendships_graph('gowalla')\n",
"print(\"Computation done for (filtered) Gowalla friendship graph\")\n",
"G_gowalla_friends.name = '(Filtered) Gowalla Friendship Graph'\n",
"\n",
"G_foursquareIT_friends = create_friendships_graph('foursquareIT')\n",
"print(\"Computation done for Foursquare IT friendship graph\")\n",
"G_foursquareIT_friends.name = 'Foursquare IT Friendship Graph'\n",
"\n",
"G_foursquareEU_friends = create_friendships_graph('foursquareEU')\n",
"print(\"Computation done for Foursquare EU friendship graph\")\n",
"G_foursquareEU_friends.name = 'Foursquare EU Friendship Graph'\n"
]
},
{
@@ -223,34 +270,11 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for G in [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]:\n",
"    print(G.name)\n",
"    print('Number of nodes: ', G.number_of_nodes())\n",
"    print('Number of edges: ', G.number_of_edges())\n",
@@ -262,24 +286,71 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analysis of the structure of the networks\n",
"<!-- \n",
"Given a social network, which of its nodes are more central? This question has been asked many times in sociology, psychology and computer science, and a whole plethora of centrality measures (a.k.a. centrality indices, or rankings) have been proposed to account for the importance of the nodes of a network.\n",
"\n",
"These networks, typically generated directly or indirectly by human activity and interaction (and therefore hereafter dubbed \"social\"), appear in a large variety of contexts and often exhibit a surprisingly similar structure. One of the most important notions that researchers have been trying to capture in such networks is \"node centrality\": ideally, every node (often representing an individual) has some degree of influence or importance within the social domain under consideration, and one expects such importance to surface in the structure of the social network; centrality is a quantitative measure that aims at revealing the importance of a node.\n",
"\n",
"Among the types of centrality that have been considered in the literature, many have to do with distances between nodes. Take, for instance, a node in an undirected connected network: if the sum of distances to all other nodes is large, the node under consideration is peripheral; this is the starting point to define Bavelas's closeness centrality, which is the reciprocal of peripherality (i.e., the reciprocal of the sum of distances to all other nodes).\n",
"\n",
"The role played by shortest paths is justified by one of the most well-known features of complex networks, the so-called **small-world phenomenon**. A small-world network is a graph where the average distance between nodes is logarithmic in the size of the network, whereas the clustering coefficient is larger (that is, neighborhoods tend to be denser) than in a random Erdős-Rényi graph with the same size and average distance. The fact that social networks (whether electronically mediated or not) exhibit the small-world property has been known at least since Milgram's famous experiment and is arguably the most popular of all features of complex networks. For instance, the average distance of the Facebook graph was recently established to be just $4.74$.\n",
"\n",
"--- \n",
"\n",
"In 1998 Watts and Strogatz proposed a simple model for generating networks with the small-world property. The model starts from a regular lattice of $N$ nodes, where each node is connected to its $k$ nearest neighbors. Then each edge is rewired with probability $p$: if an edge is rewired, it is replaced by a random edge chosen with uniform probability. The resulting network is a small-world network with $N$ nodes, $k$ nearest neighbors, and average distance $\\log(N)/\\log(k)$. -->"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Degree distribution\n",
"\n",
"<!-- The Erdős-Rényi model has traditionally been the dominant subject of study in the field of random graphs. Recently, however, several studies of real-world networks have found that the ER model fails to reproduce many of their observed properties. One of the simplest properties of a network that can be measured directly is the degree distribution, or the fraction $P(k)$ of nodes having k connections (degree $k$). A well-known result for ER networks is that the degree distribution is Poissonian,\n",
"\n",
"\\begin{equation}\n",
" P(k) = \\frac{e^{z} z^k}{k!}\n",
"\\end{equation}\n",
"\n",
"Where $z = \\langle k \\rangle$. is the average degree. Direct measurements of the degree distribution for real networks show that the Poisson law does not apply. Rather, often these nets exhibit a scale-free degree distribution:\n",
"\n",
"\\begin{equation}\n",
" P(k) = ck^{-\\gamma} \\quad \\text{for} \\quad k = m, ... , K\n",
"\\end{equation}\n",
"\n",
"Where $c \\sim (\\gamma -1)m^{\\gamma - 1}$ is a normalization factor, and $m$ and $K$ are the lower and upper cutoffs for the degree of a node, respectively. The divergence of moments higher then $\\lceil \\gamma -1 \\rceil$ (as $K \\to \\infty$ when $N \\to \\infty$) is responsible for many of the anomalous properties attributed to scale-free networks. \n",
"\n", "\n",
"If we want to build the friendship network, fortunately for the gowalla and brightkite datasets we have the edge list, so we can just use the `read_edgelist` function from networkx. For the foursquare dataset, we don't have any information about the friendship of the users, so we will just create a graph with the checkins.\n", "All real-world networks are finite and therefore all their moments are finite. The actual value of the cutoff K plays an important role. It may be approximated by noting that the total probability of nodes with $k > K$ is of order $1/N$\n",
"\n", "\n",
"To build the friendship network of the first two datasets, we can use the `create_friends_graph` function from the `utils` module. It takes a dataset name as input and returns a networkx graph object. The implementation is pretty straightforward, we just use the `from_pandas_edgelist` function from networkx." "\\begin{equation}\n",
" \\int_K^\\infty P(k) dk \\sim \\frac{1}{N}\n",
"\\end{equation}\n",
"\n",
"This yields the result\n",
"\n",
"\\begin{equation}\n",
" K \\sim m N^{1/(\\gamma -1)}\n",
"\\end{equation}\n",
"\n",
"The degree distribution alone is not enough to characterize the network. There are many other quantities, such as the degree-degree correlation (between connected nodes), the spatial correlations, the clustering coefficient, the betweenness or central-ity distribution, and the self-similarity exponents.\n",
"\n",
"---\n",
"\n",
"Let's see if our networks are scale-free or not. We can use the `degree_distribution` function from the `utils` module to plot the degree distribution of a graph. It takes a networkx graph object as input and returns a plot of the degree distribution. We expect to see a power-law distribution and not a Poissonian one. -->"
] ]
}, },
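{
"cell_type": "markdown",
"metadata": {},
"source": [
"`degree_distribution` comes from the `utils` module and is not shown in this diff; a minimal log-log version might look like this (plotly and `Counter` are imported in the first cell):\n",
"\n",
"```python\n",
"def degree_distribution(G, log=True):\n",
"    counts = Counter(d for _, d in G.degree())\n",
"    ks = [k for k in sorted(counts) if k > 0]  # k = 0 breaks the log axis\n",
"    pk = [counts[k] / G.number_of_nodes() for k in ks]\n",
"    fig = go.Figure(go.Scatter(x=ks, y=pk, mode='markers'))\n",
"    if log:\n",
"        fig.update_xaxes(type='log')\n",
"        fig.update_yaxes(type='log')\n",
"    fig.update_layout(title=G.name, xaxis_title='k', yaxis_title='P(k)')\n",
"    fig.show()\n",
"```"
]
},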
{
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"G_brighkite_friends = friendships_graph('brightkite')\n", "checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n",
"G_brighkite_friends.name = 'Brightkite Friendship Graph'\n",
"\n", "\n",
"G_gowalla_friends = friendships_graph('gowalla')\n", "for graph in checkins_graphs:\n",
"G_gowalla_friends.name = 'Gowalla Friendship Graph'" " degree_distribution(graph, log=True)"
] ]
}, },
{
@@ -287,42 +358,308 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"--- \n",
"\n",
"Let's see how it changes for the friendship networks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n",
"\n",
"for graph in friendships_graph:\n",
"    degree_distribution(graph, log=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"We may be curious to see whether the whole friendship network has a different degree distribution than the filtered one. Let's see if there are any differences."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G1 = nx.read_edgelist('data/brightkite/brightkite_friends_edges.txt', nodetype=int)\n",
"G1.name = 'Brightkite Friendship Graph'\n",
"G2 = nx.read_edgelist('data/gowalla/gowalla_friends_edges.txt', nodetype=int)\n",
"G2.name = 'Gowalla Friendship Graph'\n",
"G3 = nx.read_edgelist('data/foursquare/foursquare_friends_edges.txt', nodetype=int)\n",
"G3.name = 'Foursquare Friendship Graph'\n",
"\n",
"degree_distribution(G1, log=True)\n",
"degree_distribution(G2, log=True)\n",
"degree_distribution(G3, log=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see, there are no notable differences, and this is not surprising: we were only removing some edge cases. Maybe in Siberia this was a very popular social network, but since it's a very harsh environment, being friends on the social network was not synonymous with visiting the same places together (where do you go in Siberia?)."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"--- \n",
"\n",
"Now we can compute the average degree for each check-ins graph and for each friendship graph."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create a dataframe with the average degree of each graph\n",
"# (built from a list of dicts: DataFrame.append is deprecated in recent pandas)\n",
"rows = []\n",
"\n",
"for graph in tqdm(checkins_graphs):\n",
"    rows.append({'Graph': graph.name, 'Average Degree': np.mean([d for _, d in graph.degree()])})\n",
"\n",
"for graph in tqdm(friendships_graph):\n",
"    rows.append({'Graph': graph.name, 'Average Degree': np.mean([d for _, d in graph.degree()])})\n",
"\n",
"average_degree = pd.DataFrame(rows)\n",
"print(average_degree)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clustering coefficient\n",
"<!-- \n",
"The clustering coefficient is usually related to a community represented by local structures. The usual definition of clustering (sometimes also referred to as transitivity) is related to the number of triangles in the network. The clustering is high if two nodes sharing a neighbor have a high probability of being connected to each other. There are two common definitions of clustering. The first is global,\n",
"\n",
"\\begin{equation}\n",
"    C = \\frac{3 \\times \\text{the number of triangles in the network}}{\\text{the number of connected triples of vertices}}\n",
"\\end{equation}\n",
"\n",
"where a \"connected triple\" means a single vertex with edges running to an unordered pair of other vertices.\n",
"\n",
"A second definition of clustering is based on the average of the clustering for single nodes. The clustering for a single node is the fraction of pairs of its linked neighbors out of the total number of pairs of its neighbors:\n",
"\n",
"\\begin{equation}\n",
"    C_i = \\frac{\\text{the number of triangles connected to vertex }i}{\\text{the number of triples centered on vertex } i}\n",
"\\end{equation}\n",
"\n",
"For vertices with degree $0$ or $1$, for which both numerator and denominator are zero, we use $C_i = 0$. Then the clustering coefficient for the whole network is the average\n",
"\n",
"\\begin{equation}\n",
"    C = \\frac{1}{n} \\sum_{i} C_i\n",
"\\end{equation}\n",
"\n",
"In both cases the clustering is in the range $0 \\leq C \\leq 1$.\n",
"\n",
"In random graph models such as the ER model and the configuration model, the clustering coefficient is low and decreases to $0$ as the system size increases. This is also the situation in many growing network models. However, in many real-world networks the clustering coefficient is rather high and remains constant for large network sizes. This observation led to the introduction of the small-world model, which offers a combination of a regular lattice with high clustering and a random graph.\n",
"\n",
"### Clustering in a small-world network\n",
"\n",
"The simplest way to treat clustering analytically in a small-world network is to use the link addition, rather than the rewiring, model. In the limit of large network size, $N \\to \\infty$, and for a fixed fraction of shortcuts $\\phi$, it is clear that the probability of forming a triangle vanishes as we approach $1/N$, so the contribution of the shortcuts to the clustering is negligible. Therefore, the clustering of a small-world network is determined by its underlying ordered lattice. For example, consider a ring where each node is connected to its $k$ closest neighbors from each side. A node's number of neighbors is therefore $2k$, and thus it has $2k(2k - 1)/2 = k(2k - 1)$ pairs of neighbors. Consider a node $i$. All of the $k$ nearest nodes on $i$'s left are connected to each other, and the same is true for the nodes on $i$'s right. This amounts to $2k(k - 1)/2 = k(k - 1)$ pairs. Now consider a node located $d$ places to the left of $i$. It is also connected to its $k$ nearest neighbors from each side; therefore, it will be connected to $k - d$ neighbors on $i$'s right side. The total number of connected neighbor pairs is\n",
"\n",
"\\begin{equation}\n",
"    k(k-1) + \\sum_{d=1}^k (k-d) = k(k-1) + \\frac{k(k-1)}{2} = \\frac{3}{2} k (k-1)\n",
"\\end{equation}\n",
"\n",
"and the clustering coefficient is:\n",
"\n",
"\\begin{equation}\n",
"    C = \\frac{\\frac{3}{2}k(k-1)}{k(2k-1)} = \\frac{3 (k-1)}{2(2k-1)}\n",
"\\end{equation}\n",
"\n",
"For every $k > 1$, this results in a constant larger than $0$, indicating that the clustering of a small-world network does not vanish for large networks. For large values of $k$, the clustering coefficient approaches $3/4$, that is, the clustering is very high. Note that for a regular two-dimensional grid, the clustering by definition is zero, since no triangles exist. However, it is clear that the grid has a neighborhood structure. -->\n",
"\n",
"--- \n",
"\n",
"<!-- Let's see if our networks reflect this property. First things first, we need to compute the clustering coefficient in the standard way.\n",
"\n",
"In the networkx library we can find an `average_clustering` function that computes the average clustering coefficient of a graph. However, this function is slow (as the docs themselves suggest), so it's better to use the `clustering` function, which returns a dictionary with the clustering coefficient for each node. We can then compute the average clustering coefficient by averaging the values of the dictionary. -->"
]
},
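{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of the approach just described: average the per-node dictionary returned by `nx.clustering` instead of calling `nx.average_clustering` directly.\n",
"\n",
"```python\n",
"def average_clustering_from_dict(G):\n",
"    # nx.clustering returns {node: C_i}; the network coefficient is their mean\n",
"    return float(np.mean(list(nx.clustering(G).values())))\n",
"```"
]
},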
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"graphs_all = checkins_graphs + friendships_graph"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is going to take a while (about 10 minutes): nx.average_clustering visits every\n",
"# node's neighborhood, and the shortest-path and betweenness computations are heavier still.\n",
"\n",
"rows = []\n",
"\n",
"for graph in friendships_graph:\n",
"    print(graph.name)\n",
"    rows.append(\n",
"        {'Graph': graph.name,\n",
"         'Number of Nodes': graph.number_of_nodes(),\n",
"         'Number of Edges': graph.number_of_edges(),\n",
"         'Average Clustering Coefficient': nx.average_clustering(graph),\n",
"         'log N': np.log(graph.number_of_nodes()),\n",
"         'Average Shortest Path Length': mean_shortest_path(graph),\n",
"         'betweenness centrality': np.mean(list(nx.betweenness_centrality(graph).values()))})\n",
"\n",
"clustering_results = pd.DataFrame(rows)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(clustering_results)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use our formula to compute the clustering coefficient in a small world network"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Average Path Length\n",
"\n",
"<!-- Since many networks are not embedded in real space, the geometrical distance between nodes is meaningless. The most important distance measure in such networks is the minimal number of hops (or chemical distance). That is, the distance between two nodes in the network is defined as the number of edges in the shortest path between them. If the edges are assumed to be weighted, the lowest total weight path, called the _optimal path_, may also be used. The usual mathematical definition of the diameter of the network is the length of the path between the farthest nodes in the network.\n",
"\n",
"--- \n",
"\n",
"We have seen how we can characterize the clustering in a small-world network. The second important property of small-world networks is their small diameter, i.e., the small distance between nodes in the network. The distance in the underlying lattice behaves as the linear length of the lattice, $L$. Since $N \\sim L^d$, where $d$ is the lattice dimension, it follows that the distance between nodes behaves as:\n",
"\n",
"\\begin{equation}\n",
"    l \\sim L \\sim N^{1/d}\n",
"\\end{equation}\n",
"\n",
"Therefore, the underlying lattice has a finite dimension, and the distances on it behave as a power law of the number of nodes, i.e., the distance between nodes is large. However, when adding even a small fraction of shortcuts to the network, this behavior changes dramatically.\n",
"\n",
"Let's try to deduce the behavior of the average distance between nodes. Consider a small-world network, with dimension $d$ and connecting distance $k$ (i.e., every node is connected to any other node whose distance from it in every linear dimension is at most $k$). Now, consider the nodes reachable from a source node in at most $r$ steps. When $r$ is small, these are just the $r$-th nearest neighbors of the source in the underlying lattice. We term the set of these neighbors a \"patch\"; its radius is $kr$, and the number of nodes it contains is approximately $n(r) = (2kr)^d$.\n",
"\n",
"We now want to find the distance $r$ for which such a patch will contain about one shortcut. This will allow us to consider this patch as if it was a single node in a randomly connected network. Assume that the probability for a single node to have a shortcut is $\\Phi$. To find the length at which approximately one shortcut is encountered, we need to solve for $r$ the equation $(2kr)^d \\Phi = 1$. The correlation length $\\xi$, defined as the distance (or linear size of a patch) at which a shortcut will be encountered with high probability, is therefore\n",
"\n",
"\\begin{equation}\n",
"    \\xi = \\frac{1}{k \\Phi^{1/d}}\n",
"\\end{equation}\n",
"\n",
"Note that we have omitted the factor 2, since we are interested in the order of magnitude. Let us denote by $V(r)$ the total number of nodes reachable from a node in at most $r$ steps, and by $a(r)$ the number of nodes added to a patch in the $r$-th step. That is, $a(r) = n(r) - n(r-1)$. Thus,\n",
"\n",
"\\begin{equation}\n",
"    a(r) \\sim \\frac{\\text{d} n(r)}{\\text{d} r} = 2kd(2kr)^{d-1}\n",
"\\end{equation}\n",
"\n",
"When a shortcut is encountered at the $r$-th step from a node, it leads to a new patch (it may actually lead to an already encountered patch, and two patches may also merge after some steps, but this occurs with negligible probability when $N \\to \\infty$, until most of the network is reachable). This new patch occurs after $r'$ steps, and therefore the number of nodes reachable from its origin is $V(r - r')$. Thus, we obtain the recursive relation\n",
"\n",
"\\begin{equation} \n",
"    V(r) = \\sum_{r'=0}^r a(r') [1 + \\xi^{-d}V(r-r')]\n",
"\\end{equation}\n",
"\n",
"where the first term stands for the size of the original patch, and the second term is derived from the probability of hitting a shortcut, which is approximately $\\xi^{-d}$ for every new node encountered. To simplify the solution of this recursion, it can be approximated by a differential equation. The sum can be approximated by an integral, and then the equation can be differentiated with respect to $r$. For simplicity, we will concentrate here on the solution for the one-dimensional case, with $k = 1$, where $a(r) = 2$. Thus, one obtains\n",
"\n",
"\\begin{equation}\n",
"    \\frac{\\text{d} V(r)}{\\text{d} r} = 2 [1 + V(r)/\\xi]\n",
"\\end{equation}\n",
"\n",
"the solution of which is\n",
"\n",
"\\begin{equation} \n",
"    V(r) = \\xi \\left(e^{2r/\\xi} -1\\right)\n",
"\\end{equation}\n",
"\n",
"For $r \\ll \\xi$, the exponent can be expanded in a power series, and one obtains $V(r) \\sim 2r = n(r)$, as expected, since usually no shortcut is encountered. For $r \\gg \\xi$, $V(r)$ grows exponentially in $r$. An approximation for the average distance between nodes can be obtained by equating the expression above for $V(r)$ to the total number of nodes, $V(r) = N$. This results in\n",
"\n",
"\\begin{equation} \n",
"    r \\sim \\frac{\\xi}{2} \\ln \\frac{N}{\\xi}\n",
"\\end{equation}\n",
"\n",
"As is apparent from this expression, the average distance in a small-world network behaves as the distance in a random graph, with patches of size $\\xi$ playing the role of the nodes of the random graph. -->"
]
},
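{
"cell_type": "markdown",
"metadata": {},
"source": [
"The analysis cells above use `mean_shortest_path` from the `utils` module, which is not shown in this diff. A simple version, restricted to the largest connected component (the average is undefined on a disconnected graph), might look like:\n",
"\n",
"```python\n",
"def mean_shortest_path(G):\n",
"    giant = G.subgraph(max(nx.connected_components(G), key=len))\n",
"    return nx.average_shortest_path_length(giant)\n",
"```"
]
},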
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Betweenness Centrality\n",
"\n",
"<!-- The importance of a node in a network depends on many factors. A website may be important due to its content, a router due to its capacity. Of course, all of these properties depend on the nature of the studied network, and may have very little to do with the graph structure of the network. We are particularly interested in the importance of a node (or a link) due to its topological function in the network. It is reasonable to assume that the topology of a network may dictate some intrinsic importance for different nodes. One measure of centrality is the degree of a node: the higher the degree, the more connected the node, and therefore the higher its centrality in the network. However, the degree is not the only factor determining a node's importance.\n",
"\n",
"One of the most accepted definitions of centrality is based on counting the paths going through a node. For each node $i$ in the network, the number of \"routing\" paths to all other nodes (i.e., paths through which data flow) going through $i$ is counted, and this number determines the centrality of $i$. The most common choice is to take only the shortest paths as the routing paths. This leads to the following definition: the *betweenness centrality* of a node $i$ equals the number of shortest paths between all pairs of nodes in the network going through it, i.e.,\n",
"\n",
"\\begin{equation} \n",
"    g(i) = \\sum_{\\{ j,k \\}} g_i (j,k)\n",
"\\end{equation}\n",
"\n",
"where the notation $\\{j, k\\}$ stands for summing over each pair once, ignoring the order, and $g_i(j, k)$ equals $1$ if the shortest path between nodes $j$ and $k$ passes through node $i$ and $0$ otherwise. In networks with no weights (i.e., where all edges have the same length) there might be more than one shortest path; in that case it is common to take $g_i(j, k) = C_i(j,k)/C(j,k)$, where $C(j,k)$ is the number of shortest paths between $j$ and $k$, and $C_i(j,k)$ is the number of those going through $i$. (Several variations of this scheme exist, focusing in particular on how to count distinct shortest paths when several of them share some edges. These differences tend to have a very small statistical influence in random complex networks, where the number of short loops is small; therefore, we will concentrate on the above definition. Another nuance is whether the source and destination are considered part of the shortest path.)\n",
"\n",
"The usefulness of the betweenness centrality in identifying bottlenecks and important nodes in the network has led to applications in identifying communities in biological and social networks. -->\n"
]
},
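{
"cell_type": "markdown",
"metadata": {},
"source": [
"Exact betweenness (Brandes' algorithm) costs roughly $O(nm)$, which is heavy for graphs of this size; networkx can approximate it by sampling `k` pivot nodes. A sketch:\n",
"\n",
"```python\n",
"def approx_betweenness(G, k=1000, seed=42):\n",
"    # sample at most k pivot nodes instead of using all n as sources\n",
"    return nx.betweenness_centrality(G, k=min(k, G.number_of_nodes()), seed=seed)\n",
"```"
]
},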
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# built from a list of dicts: DataFrame.append is deprecated in recent pandas\n",
"rows = []\n",
"\n",
"for graph in checkins_graphs:\n",
"    rows.append(\n",
"        {'Graph': graph.name,\n",
"         'Betweenness Centrality': np.mean(list(nx.betweenness_centrality(graph).values()))})\n",
"\n",
"betweenness_results = pd.DataFrame(rows)\n",
"betweenness_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def small_world_clustering(graph: nx.Graph):\n",
"    # apply the analytic small-world estimate C_i = 3(k-1)/(2(2k-1)) to each node's degree and average\n",
"    tmp = 0\n",
"    for node in tqdm(graph.nodes()):\n",
"        k = len(list(graph.neighbors(node)))\n",
"        if k >= 1:\n",
"            tmp += (3*(k-1))/(2*(2*k-1))\n",
"    return tmp/graph.number_of_nodes()\n",
"\n",
"# G_ws is assumed to be a Watts-Strogatz graph (it is not defined in this notebook; see the sketch in the analysis section above)\n",
"print(\"Clustering coefficient for the Watts-Strogatz graph: \", small_world_clustering(G_ws))\n",
"\n",
"print(\"Clustering coefficient for the Brightkite checkins graph: \", small_world_clustering(G_brighkite_checkins))"
]
}
],
