Added tex relation 'cause I'm masochist

2 years ago · fc5eb04fc4
parent 83feae8fb9
commit fc5eb04fc4
7 changed files with 651 additions and 78 deletions
--- a/.gitignore
+++ b/.gitignore
@ -129,5 +129,16 @@ dmypy.json
 .pyre/

 # Data folder
-
 data/
+
+# Latex stuff
+
+*.aux
+*.bbl
+*.blg
+*.fdb_latexmk
+*.fls
+*.log
+*.out
+*.synctex.gz
+*.toc
--- a/main.py
+++ b/main.py
@ -5,6 +5,17 @@ import wget
 import zipfile
 import networkx as nx
 import pandas as pd
+from typing import Literal
+
+
+"""
+NOTEs:
+
+- This file is not meant to be run; it's just a collection of functions that are used in the other files. It's a way to keep the code clean and organized.
+
+- Why do I use os.path.join and not the "/"? Because it's more portable, it works on every OS, while "/" works only on Linux and Mac. If you want to use it on Windows, you have to change all the "/" with "\". With os.path.join you don't have to worry about it and, as always, f*** Microsoft.
+"""
+

 def download_datasets():

@ -55,7 +66,7 @@ def download_datasets():
    Then, if there is no dataset_tsmc2014 folder, it unzips the file. Then move all the .txt files inside the dataset_tsmc2014 folder in the foursquare folder. Then delete the dataset_tsmc2014 folder and the .zip file.
    """

-    for file in os.listdir("data/foursquare"):
+    for file in os.listdir(os.path.join("data", "foursquare")):
        if file.endswith(".zip"):
            if os.path.exists(os.path.join("data", "foursquare", "dataset_tsmc2014")):
                if len(os.listdir(os.path.join("data", "foursquare"))) == 3:
@ -84,7 +95,7 @@ def download_datasets():
            if file.endswith(".gz"):
                os.system("gunzip {}".format(os.path.join("data", "gowalla", file)))

-def create_graph(dataset):
+def create_graph(dataset: Literal['brightkite', 'gowalla']) -> nx.Graph:

    """
    This function takes as input a tsv file with two columns; each line in the file is an edge. The function returns an undirected networkx graph object. It uses pandas to read the file since it's faster than the standard python open() function. If we don't want to use the standard python open() function, the following code works as well:
@ -97,36 +108,38 @@ def create_graph(dataset):

    """

-    if dataset == "brightkite":
-        file = os.path.join("data", "brightkite", "loc-brightkite_edges.txt")
-    elif dataset == "gowalla":
-        file = os.path.join("data", "gowalla", "loc-gowalla_edges.txt")
-    else:
+    if dataset not in ["brightkite", "gowalla"]:
        raise ValueError("The dataset must be brightkite or gowalla. If you want to use the foursquare dataset, use the create_foursquare_graph() function")

+    file = os.path.join("data", dataset, "loc-{}_edges.txt".format(dataset))

    df = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
    return G


-def create_foursquare_graph(dataset):
-    # we are given a .txt in tsv format, with 8 colums. Read the file with pandas, the first two colums are colles "UserID" and "VenueID", the other 6 are useless. Then create a graph with networkx for this function. The unique users ID are the nodes, two nodes are linked, if they have been in the same venue at least once. The weight of the edge is the number of times they have been in the same venue.
+def create_foursquare_graph(dataset: Literal['NYC', 'TKY'])-> nx.Graph:
+
+    """
+    This function takes as input a tsv file with 8 columns, where each line in the file is a check-in. The function returns an undirected networkx graph object.
+
+    Differently from the function create_graph used for the brightkite and gowalla dataset, we are not given a list of edges, so we can't use the function nx.from_pandas_edgelist. We have to create the graph manually.
+
+    Firstly, we retrieve the unique user IDs using the set() data structure: these are the nodes of our graph. Since we don't want to work with adjacency matrices due to their O(n^2) space complexity (even though we could store them in a compressed way thanks to their sparsity property), we use an adjacency list representation of the graph. We create a dictionary with the user IDs as keys and the sets of venue IDs as values. Two users are connected if they have visited the same venue at least once. The weight of the edge is the number of common venues.
+    """
+
+    if dataset not in ["NYC", "TKY"]:
+        raise ValueError("The dataset must be NYC or TKY")

-    if dataset == "NYC":
-        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
-    elif dataset == "TKY":
-        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
+    file = os.path.join("data", "foursquare", "dataset_TSMC2014_{}.txt".format(dataset))

-    df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "Timezone offset in minutes", "UTC time"])
+    df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "Timezone offset in minutes", "UTC time"], encoding="utf-8", encoding_errors="ignore")

-    # use the set() data structure to get the unique users ID
-    users = set(df["UserID"])
+    users = set(df["UserID"]) # get the unique users ID
    G = nx.Graph()
    G.add_nodes_from(users)

-    # create a dictionary with the users ID as keys and the venues ID as values
-    users_venues = {}
+    users_venues = {} # key: user ID, value: set of venues ID
    for user in users:
        users_venues[user] = set(df[df["UserID"] == user]["VenueID"])

--- a/ref.bib
+++ b/ref.bib
@ -0,0 +1,11 @@
+@article{yang2014modeling,
+	author={Yang, Dingqi and Zhang, Daqing and Zheng, Vincent W. and Yu, Zhiyong},
+	journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems},
+	title={Modeling User Activity Preference by Leveraging User Spatial Temporal Characteristics in LBSNs},
+	year={2015},
+	volume={45},
+	number={1},
+	pages={129--142},
+	ISSN={2168-2216},
+	publisher={IEEE}
+}
--- a/testing.ipynb
+++ b/testing.ipynb
@ -63,8 +63,7 @@
     "text": [
      "The brightkite dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
      "The gowalla dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
-      "Downloading foursquare dataset...\n",
-      "Download completed of foursquare dataset\n"
+      "The foursquare dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n"
     ]
    }
   ],
@ -87,11 +86,106 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>user</th>\n",
+       "      <th>check-in time</th>\n",
+       "      <th>latitude</th>\n",
+       "      <th>longitude</th>\n",
+       "      <th>location_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-17T01:48:53Z</td>\n",
+       "      <td>39.747652</td>\n",
+       "      <td>-104.992510</td>\n",
+       "      <td>88c46bf20db295831bd2d1718ad7e6f5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-16T06:02:04Z</td>\n",
+       "      <td>39.891383</td>\n",
+       "      <td>-105.070814</td>\n",
+       "      <td>7a0f88982aa015062b95e3b4843f9ca2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-16T03:48:54Z</td>\n",
+       "      <td>39.891077</td>\n",
+       "      <td>-105.068532</td>\n",
+       "      <td>dd7cd3d264c2d063832db506fba8bf79</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-14T18:25:51Z</td>\n",
+       "      <td>39.750469</td>\n",
+       "      <td>-104.999073</td>\n",
+       "      <td>9848afcc62e500a01cf6fbf24b797732f8963683</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-14T00:21:47Z</td>\n",
+       "      <td>39.752713</td>\n",
+       "      <td>-104.996337</td>\n",
+       "      <td>2ef143e12038c870038df53e0478cefc</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   user         check-in time   latitude   longitude  \\\n",
+       "0     0  2010-10-17T01:48:53Z  39.747652 -104.992510   \n",
+       "1     0  2010-10-16T06:02:04Z  39.891383 -105.070814   \n",
+       "2     0  2010-10-16T03:48:54Z  39.891077 -105.068532   \n",
+       "3     0  2010-10-14T18:25:51Z  39.750469 -104.999073   \n",
+       "4     0  2010-10-14T00:21:47Z  39.752713 -104.996337   \n",
+       "\n",
+       "                                location_id  \n",
+       "0          88c46bf20db295831bd2d1718ad7e6f5  \n",
+       "1          7a0f88982aa015062b95e3b4843f9ca2  \n",
+       "2          dd7cd3d264c2d063832db506fba8bf79  \n",
+       "3  9848afcc62e500a01cf6fbf24b797732f8963683  \n",
+       "4          2ef143e12038c870038df53e0478cefc  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "Brightkite_df = pd.read_csv(\"data/brightkite/loc-brightkite_totalCheckins.txt.gz\", sep=\"\\t\", header=None, compression=\"gzip\", names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
+    "brightkite_path = os.path.join(\"data\", \"brightkite\", \"loc-brightkite_totalCheckins.txt\")\n",
+    "Brightkite_df = pd.read_csv(brightkite_path, sep=\"\\t\", header=None, names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
    "\n",
    "Brightkite_df.head()"
   ]
@ -109,11 +203,100 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>user</th>\n",
+       "      <th>check-in time</th>\n",
+       "      <th>latitude</th>\n",
+       "      <th>longitude</th>\n",
+       "      <th>location_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-19T23:55:27Z</td>\n",
+       "      <td>30.235909</td>\n",
+       "      <td>-97.795140</td>\n",
+       "      <td>22847</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-18T22:17:43Z</td>\n",
+       "      <td>30.269103</td>\n",
+       "      <td>-97.749395</td>\n",
+       "      <td>420315</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-17T23:42:03Z</td>\n",
+       "      <td>30.255731</td>\n",
+       "      <td>-97.763386</td>\n",
+       "      <td>316637</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-17T19:26:05Z</td>\n",
+       "      <td>30.263418</td>\n",
+       "      <td>-97.757597</td>\n",
+       "      <td>16516</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2010-10-16T18:50:42Z</td>\n",
+       "      <td>30.274292</td>\n",
+       "      <td>-97.740523</td>\n",
+       "      <td>5535878</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   user         check-in time   latitude  longitude  location_id\n",
+       "0     0  2010-10-19T23:55:27Z  30.235909 -97.795140        22847\n",
+       "1     0  2010-10-18T22:17:43Z  30.269103 -97.749395       420315\n",
+       "2     0  2010-10-17T23:42:03Z  30.255731 -97.763386       316637\n",
+       "3     0  2010-10-17T19:26:05Z  30.263418 -97.757597        16516\n",
+       "4     0  2010-10-16T18:50:42Z  30.274292 -97.740523      5535878"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "Gowalla_df = pd.read_csv(\"data/gowalla/loc-gowalla_totalCheckins.txt.gz\", sep=\"\\t\", header=None, compression=\"gzip\", names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
+    "gowalla_path = os.path.join(\"data\", \"gowalla\", \"loc-gowalla_totalCheckins.txt\")\n",
+    "\n",
+    "Gowalla_df = pd.read_csv(gowalla_path, sep=\"\\t\", header=None, names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
    "\n",
    "Gowalla_df.head() "
   ]
@ -124,24 +307,156 @@
   "source": [
    "## Foursquare\n",
    "\n",
-    "DA RISCRIVERE"
+    "[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 10 months) check-in data in New York city and Tokyo collected from Foursquare from 12 April 2012 to 16 February 2013. It contains two files in tsv format. Each file contains 8 columns, which are:\n",
+    "\n",
+    "1. User ID (anonymized)\n",
+    "2. Venue ID (Foursquare)\n",
+    "3. Venue category ID (Foursquare)\n",
+    "4. Venue category name (Foursquare)\n",
+    "5. Latitude\n",
+    "6. Longitude\n",
+    "7. Timezone offset in minutes (The offset in minutes between when this check-in occurred and the same time in UTC)\n",
+    "8. UTC time\n",
+    "\n",
+    "Here is an example of check-in information from the New York dataset:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
+   "outputs": [
    {
-   "cell_type": "markdown",
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>UserID</th>\n",
+       "      <th>VenueID</th>\n",
+       "      <th>CategoryID</th>\n",
+       "      <th>CategoryName</th>\n",
+       "      <th>Latitude</th>\n",
+       "      <th>Longitude</th>\n",
+       "      <th>Timezone offset in minutes</th>\n",
+       "      <th>UTC time</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>470</td>\n",
+       "      <td>49bbd6c0f964a520f4531fe3</td>\n",
+       "      <td>4bf58dd8d48988d127951735</td>\n",
+       "      <td>Arts &amp; Crafts Store</td>\n",
+       "      <td>40.719810</td>\n",
+       "      <td>-74.002581</td>\n",
+       "      <td>-240</td>\n",
+       "      <td>Tue Apr 03 18:00:09 +0000 2012</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>979</td>\n",
+       "      <td>4a43c0aef964a520c6a61fe3</td>\n",
+       "      <td>4bf58dd8d48988d1df941735</td>\n",
+       "      <td>Bridge</td>\n",
+       "      <td>40.606800</td>\n",
+       "      <td>-74.044170</td>\n",
+       "      <td>-240</td>\n",
+       "      <td>Tue Apr 03 18:00:25 +0000 2012</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>69</td>\n",
+       "      <td>4c5cc7b485a1e21e00d35711</td>\n",
+       "      <td>4bf58dd8d48988d103941735</td>\n",
+       "      <td>Home (private)</td>\n",
+       "      <td>40.716162</td>\n",
+       "      <td>-73.883070</td>\n",
+       "      <td>-240</td>\n",
+       "      <td>Tue Apr 03 18:02:24 +0000 2012</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>395</td>\n",
+       "      <td>4bc7086715a7ef3bef9878da</td>\n",
+       "      <td>4bf58dd8d48988d104941735</td>\n",
+       "      <td>Medical Center</td>\n",
+       "      <td>40.745164</td>\n",
+       "      <td>-73.982519</td>\n",
+       "      <td>-240</td>\n",
+       "      <td>Tue Apr 03 18:02:41 +0000 2012</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>87</td>\n",
+       "      <td>4cf2c5321d18a143951b5cec</td>\n",
+       "      <td>4bf58dd8d48988d1cb941735</td>\n",
+       "      <td>Food Truck</td>\n",
+       "      <td>40.740104</td>\n",
+       "      <td>-73.989658</td>\n",
+       "      <td>-240</td>\n",
+       "      <td>Tue Apr 03 18:03:00 +0000 2012</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   UserID                   VenueID                CategoryID  \\\n",
+       "0     470  49bbd6c0f964a520f4531fe3  4bf58dd8d48988d127951735   \n",
+       "1     979  4a43c0aef964a520c6a61fe3  4bf58dd8d48988d1df941735   \n",
+       "2      69  4c5cc7b485a1e21e00d35711  4bf58dd8d48988d103941735   \n",
+       "3     395  4bc7086715a7ef3bef9878da  4bf58dd8d48988d104941735   \n",
+       "4      87  4cf2c5321d18a143951b5cec  4bf58dd8d48988d1cb941735   \n",
+       "\n",
+       "          CategoryName   Latitude  Longitude  Timezone offset in minutes  \\\n",
+       "0  Arts & Crafts Store  40.719810 -74.002581                        -240   \n",
+       "1               Bridge  40.606800 -74.044170                        -240   \n",
+       "2       Home (private)  40.716162 -73.883070                        -240   \n",
+       "3       Medical Center  40.745164 -73.982519                        -240   \n",
+       "4           Food Truck  40.740104 -73.989658                        -240   \n",
+       "\n",
+       "                         UTC time  \n",
+       "0  Tue Apr 03 18:00:09 +0000 2012  \n",
+       "1  Tue Apr 03 18:00:25 +0000 2012  \n",
+       "2  Tue Apr 03 18:02:24 +0000 2012  \n",
+       "3  Tue Apr 03 18:02:41 +0000 2012  \n",
+       "4  Tue Apr 03 18:03:00 +0000 2012  "
+      ]
+     },
+     "execution_count": 5,
     "metadata": {},
-   "source": []
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "foursquare_NYC_path = ny = os.path.join(\"data\", \"foursquare\", \"dataset_TSMC2014_NYC.txt\")\n",
+    "foursquare_TKY_path = ny = os.path.join(\"data\", \"foursquare\", \"dataset_TSMC2014_TKY.txt\")\n",
+    "\n",
+    "foursquare_NYC_df = pd.read_csv(foursquare_NYC_path, sep=\"\\t\", header=None, names=[\"UserID\", \"VenueID\", \"CategoryID\", \"CategoryName\", \"Latitude\", \"Longitude\", \"Timezone offset in minutes\", \"UTC time\"], encoding=\"utf-8\", encoding_errors=\"ignore\")\n",
+    "\n",
+    "foursquare_NYC_df.head()"
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -149,7 +464,7 @@
    "\n",
    "del Brightkite_df\n",
    "del Gowalla_df\n",
-    "del Foursquare_checks_df"
+    "del foursquare_NYC_df"
   ]
  },
  {
@ -165,47 +480,14 @@
   "source": [
    "We are asked to construct the networks for the three datasets as an undirected graph $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicate that two individuals visited the same location at least once.\n",
    "\n",
-    "We can use the fucntion create_graph from the utils module to create the networks. It takes as input the path to an edge list file and returns a networkx graph object. For further details about the function below, please refer to the `utils` module."
+    "We can use the function create_graph from the `utils` module to create the networks. It takes as input the path to an edge list file and returns a networkx graph object. For further details about the function below, please refer to the `utils` module."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [
-    {
-     "ename": "UnicodeDecodeError",
-     "evalue": "'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mUnicodeDecodeError\u001b[0m                        Traceback (most recent call last)",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._string_convert\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._string_box_utf8\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data",
-      "\nDuring handling of the above exception, another exception occurred:\n",
-      "\u001b[0;31mUnicodeDecodeError\u001b[0m                        Traceback (most recent call last)",
-      "\u001b[0;32m/tmp/ipykernel_154187/2796184490.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mBrightkite_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"brightkite\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mGowalla_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"gowalla\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mFoursquare_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_foursquare_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"NYC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m~/github/small-worlds/main.py\u001b[0m in \u001b[0;36mcreate_foursquare_graph\u001b[0;34m(dataset)\u001b[0m\n\u001b[1;32m    119\u001b[0m         \u001b[0mfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"foursquare\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dataset_TSMC2014_TKY.txt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m     \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"UserID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"VenueID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"CategoryID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"CategoryName\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Latitude\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Longitude\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Timezone offset in minutes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"UTC time\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    123\u001b[0m     \u001b[0;31m# use the set() data structure to get the unique users ID\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    209\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m                     \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    213\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    315\u001b[0m                     \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minspect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcurrentframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    316\u001b[0m                 )\n\u001b[0;32m--> 317\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    319\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m    948\u001b[0m     \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 950\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    952\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    610\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 611\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    612\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m   1770\u001b[0m                     \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1771\u001b[0m                     \u001b[0mcol_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1772\u001b[0;31m                 \u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m  \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1773\u001b[0m                     \u001b[0mnrows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1774\u001b[0m                 )\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m    241\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    242\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlow_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m                 \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_low_memory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    244\u001b[0m                 \u001b[0;31m# destructive to chunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    245\u001b[0m                 \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_concatenate_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._string_convert\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._string_box_utf8\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "Brightkite_G = create_graph(\"brightkite\")\n",
    "Gowalla_G = create_graph(\"gowalla\")\n",
@ -221,14 +503,123 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>dataset</th>\n",
+       "      <th>nodes</th>\n",
+       "      <th>edges</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>brightkite</td>\n",
+       "      <td>58228</td>\n",
+       "      <td>214078</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>gowalla</td>\n",
+       "      <td>196591</td>\n",
+       "      <td>950327</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>foursquare</td>\n",
+       "      <td>1083</td>\n",
+       "      <td>282405</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      dataset   nodes   edges\n",
+       "0  brightkite   58228  214078\n",
+       "1     gowalla  196591  950327\n",
+       "2  foursquare    1083  282405"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "print(\"Brightkite graph has {} nodes and {} edges\".format(Brightkite_G.number_of_nodes(), Brightkite_G.number_of_edges()))\n",
+    "dataset = [\"brightkite\", \"gowalla\", \"foursquare\"]\n",
+    "nodes = [len(Brightkite_G.nodes()), len(Gowalla_G.nodes()), len(Foursquare_G.nodes())]\n",
+    "edges = [len(Brightkite_G.edges()), len(Gowalla_G.edges()), len(Foursquare_G.edges())]\n",
    "\n",
-    "print(\"Gowalla graph has {} nodes and {} edges\".format(Gowalla_G.number_of_nodes(), Gowalla_G.number_of_edges()))"
+    "df = pd.DataFrame({\"dataset\": dataset, \"nodes\": nodes, \"edges\": edges})\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As we can see, the foursquare dataset has a very small number of nodes. Even though it has 227428 check-ins, the unique users (the nodes) are only 1083. The Tokyo dataset is about 2 times bigger, with 537703 check-ins and 2294 nodes. Since we are in the same order of magnitude, we will focus on the New York dataset, in the style of a classic Hollywood movie about alien invasions."
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of the structure of the networks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Given a social network, which of its nodes are more central? This question has been asked many times in sociology, psychology and computer science, and a whole plethora of centrality measures (a.k.a. centrality indices, or rankings) were proposed to account for the importance of the nodes of a network. \n",
+    "\n",
+    "These networks, typically generated directly or indirectly by human activity and interaction (and therefore hereafter dubbed “social”), appear in a large variety of contexts and often exhibit a surprisingly similar structure. One of the most important notions that researchers have been trying to capture in such networks is “node centrality”: ideally, every node (often representing an individual) has some degree of influence or importance within the social domain under consideration, and one expects such importance to surface in the structure of the social network; centrality is a quantitative measure that aims at\n",
+    "revealing the importance of a node.\n",
+    "\n",
+    "Among the types of centrality that have been considered in the literature, many have to do with distances between nodes. Take, for instance, a node in an undirected connected network: if the sum of distances to all other nodes is large, the node under consideration is peripheral; this is the starting point to define Bavelas’s closeness centrality [cite], which is the reciprocal of peripherality (i.e., the reciprocal of the sum of distances to all other nodes).\n",
+    "\n",
+    "The role played by shortest paths is justified by one of the most well-known features of complex networks, the so-called small-world phenomenon. A small-world network [cite] is a graph where the average distance between nodes is logarithmic in the size of the network, whereas the clustering coefficient is larger (that is, neighborhoods tend to be denser) than in a random Erdős-Rényi graph with the same size and average distance. The fact that social networks (whether electronically mediated or not) exhibit the small-world property is known at least since Milgram’s famous experiment [cite] and is arguably the most popular of all features of complex networks. For instance, the average distance of the Facebook graph was recently established to be just 4.74.\n",
+    "\n",
+    "## Definitions and conventions\n",
+    "\n",
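+    "From now on, we consider directed graphs defined by a set $N$ of $n$ nodes and a set $A \\subseteq N \\times N$ of arcs. We write $x \\to y$ when $(x,y) \\in A$ and call $x$ and $y$ the source and the target of the arc, respectively.\n",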
+    "\n",
+    "For this project, we will focus on the following centrality measures:\n",
+    "\n",
+    "- **Degree centrality**\n",
+    "- **Clustering Coefficient**\n",
+    "- **Average Path Length**\n",
+    "- **Betweenness Centrality**\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
  }
 ],
 "metadata": {
--- a/tex/main.pdf
+++ b/tex/main.pdf
--- a/tex/main.tex
+++ b/tex/main.tex
@ -0,0 +1,91 @@
+\documentclass[12pt]{article}
+\usepackage[margin=1in]{geometry}
+\usepackage[utf8]{inputenc}
+\usepackage[english]{babel}
+\usepackage[T1]{fontenc}
+\usepackage{fourier}
+\usepackage{amsthm}
+\usepackage{amssymb}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+\usepackage{latexsym}
+\usepackage{graphicx}
+\usepackage{float}
+\usepackage{etoolbox}
+\usepackage{hyperref}
+\usepackage{tikz}
+\usepackage{lipsum}
+\usepackage{algorithm}
+\usepackage{algpseudocode}
+
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\Z}{\mathbb{Z}}
+\newcommand{\Q}{\mathbb{Q}}
+\newcommand{\C}{\mathbb{C}}
+\newcommand{\s}{\vspace*{0.4cm}}
+\newcommand{\nd}{\noindent}
+
+% add counters
+
+\title{Spatial networks and small worlds}
+\author{Luca Lombardo}
+\date{December 2022}
+
+\begin{document}
+\maketitle
+
+\begin{abstract}
+    \noindent \lipsum[1]
+\end{abstract}
+
+\tableofcontents
+\clearpage
+
+\section{Introduction}
+Given a social network, which of its nodes are more central? This question has been asked many times in sociology, psychology and computer science, and a whole plethora of centrality measures (a.k.a. centrality indices, or rankings) were proposed to account for the importance of the nodes of a network. \s
+
+\nd These networks, typically generated directly or indirectly by human activity and interaction (and therefore hereafter dubbed “social”), appear in a large variety of contexts and often exhibit a surprisingly similar structure. One of the most important notions that researchers have been trying to capture in such networks is “node centrality”: ideally, every node (often representing an individual) has some degree of influence or importance within the social domain under consideration, and one expects such importance to surface in the structure of the social network; centrality is a quantitative measure that aims at revealing the importance of a node. \s
+
+\nd Among the types of centrality that have been considered in the literature, many have to do with distances between nodes. Take, for instance, a node in an undirected connected network: if the sum of distances to all other nodes is large, the node under consideration is peripheral; this is the starting point to define Bavelas's closeness centrality \cite{closeness}, which is the reciprocal of peripherality (i.e., the reciprocal of the sum of distances to all other nodes). \s
+
+\nd The role played by shortest paths is justified by one of the most well-known features of complex networks, the so-called small-world phenomenon. A small-world network \cite{cohen_havlin_2010} is a graph where the average distance between nodes is logarithmic in the size of the network, whereas the clustering coefficient is larger (that is, neighborhoods tend to be denser) than in a random Erdős-Rényi graph with the same size and average distance. The fact that social networks (whether electronically mediated or not) exhibit the small-world property is known at least since Milgram's famous experiment \cite{milgram1967small} and is arguably the most popular of all features of complex networks. For instance, the average distance of the Facebook graph was recently established to be just $4.74$ \cite{https://doi.org/10.48550/arxiv.1111.4570}. \s
+
+\subsection*{Definitions and conventions}
+
+From now on, we consider directed graphs defined by a set $N$ of $n$ nodes and $A \subseteq N \times N$ of arcs. We write $x \to y$ when $(x,y) \in A$ and call $x$ and $y$ the source and the target of the arc, respectively. \s
+\clearpage
+
+\subsection*{Aim of the project}
+
+The aim of the project is to study the small-world phenomenon in location-based (social) networks. As test cases, we consider three real-world datasets: Brightkite, Gowalla and Foursquare. In the next sections, we will describe the datasets and the methodology we used to extract the networks from them. \s
+
+\nd We are interested in analyzing 4 different centrality measures:
+
+\begin{itemize}
+    \item Distribution of Degree
+    \item Clustering coefficient
+    \item Average Path Length
+    \item Betweenness Centrality
+\end{itemize}
+\clearpage
+
+
+\section{Theoretical background on centrality measures}
+
+Centrality is a fundamental tool in the study of social networks: the first efforts to define formally centrality indices were put forth in the late 1940s by the Group Networks Laboratory at MIT directed by Alex Bavelas \cite{closeness}; those pioneering experiments concluded that centrality was related to group efficiency in problem-solving, and agreed with the subjects' perception of leadership. In the following decades, various measures of centrality were employed in a multitude of contexts. \s
+
+\subsection*{Geometric measures}
+
+We call geometric those measures assuming that importance is a function of distances; more precisely, a geometric centrality depends only on how many nodes exist at every distance. These are some of the oldest measures defined in the literature.
+
+\paragraph*{In-degree centrality} Indegree, the number of incoming arcs $d^-(x)$, can be considered a geometric measure: it is simply the number of nodes at distance one\footnote{Most centrality measures proposed in the literature were actually described only for undirected, connected graphs. Since the study of web graphs and online social networks has posed the problem of extending centrality concepts to networks that are directed, and possibly not strongly connected, in the rest of this paper we consider measures depending on the incoming arcs of a node (e.g., incoming paths, left dominant eigenvectors, distances from all nodes to a fixed node). If necessary, these measures can be called “negative”, as opposed to the “positive” versions obtained by considering outgoing paths, or (equivalently) by transposing the graph.}. It is probably the oldest measure of importance ever used, as it is equivalent to majority voting in elections (where $x \to y$ if $x$ voted for $y$). Indegree has a number of obvious shortcomings (e.g., it is easy to spam), but it is a good baseline. \s
+
+\nd Other notable geometric measures that we will not explore in this project are \emph{closeness centrality} (which is the reciprocal of the sum of distances to all other nodes), \emph{betweenness centrality} (which is the number of shortest paths that pass through a node), \emph{Lin's index} (which is the sum of the distances to all other nodes), and \emph{Harmonic Centrality} (which is a generalization of the closeness centrality). \s
+
+
+\clearpage
+\bibliographystyle{unsrt}
+\bibliography{ref}
+\nocite{*}
+\end{document}
--- a/tex/ref.bib
+++ b/tex/ref.bib
@ -0,0 +1,56 @@
+@article{yang2014modeling,
+	author={Yang, Dingqi and Zhang, Daqing and Zheng, Vincent W. and Yu, Zhiyong},
+	journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems},
+	title={Modeling User Activity Preference by Leveraging User Spatial Temporal Characteristics in LBSNs},
+	year={2015},
+	volume={45},
+	number={1},
+	pages={129--142},
+	ISSN={2168-2216},
+	publisher={IEEE}
+}
+
+@article{closeness,
+ ISSN = {00932914},
+ URL = {http://www.jstor.org/stable/44135428},
+ author = {Alex Bavelas},
+ journal = {Applied Anthropology},
+ number = {3},
+ pages = {16--30},
+ publisher = {Society for Applied Anthropology},
+ title = {A MATHEMATICAL MODEL FOR GROUP STRUCTURES},
+ urldate = {2022-12-07},
+ volume = {7},
+ year = {1948}
+}
+
+@book{cohen_havlin_2010, place={Cambridge}, title={Complex Networks: Structure, Robustness and Function}, DOI={10.1017/CBO9780511780356}, publisher={Cambridge University Press}, author={Cohen, Reuven and Havlin, Shlomo}, year={2010}}
+
+@misc{https://doi.org/10.48550/arxiv.1111.4570,
+  doi = {10.48550/ARXIV.1111.4570},
+
+  url = {https://arxiv.org/abs/1111.4570},
+
+  author = {Backstrom, Lars and Boldi, Paolo and Rosa, Marco and Ugander, Johan and Vigna, Sebastiano},
+
+  keywords = {Social and Information Networks (cs.SI), Physics and Society (physics.soc-ph), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Physical sciences, FOS: Physical sciences},
+
+  title = {Four Degrees of Separation},
+
+  publisher = {arXiv},
+
+  year = {2011},
+
+  copyright = {arXiv.org perpetual, non-exclusive license}
+}
+
+@article{milgram1967small,
+  title={The small world problem},
+  author={Milgram, Stanley},
+  journal={Psychology Today},
+  volume={2},
+  number={1},
+  pages={60--67},
+  year={1967},
+  publisher={New York}
+}