From 2864228e6a39714b9fe5169b1666dd648cb8e41c Mon Sep 17 00:00:00 2001 From: Luca Lombardo Date: Sun, 18 Dec 2022 23:47:29 +0100 Subject: [PATCH] Every thing works. Download functions still to tune for foursquare --- .gitignore | 1 + main.ipynb | 749 ++++++++++++++++++++++++++++++++++++++++++++++++++ testing.ipynb | 640 +----------------------------------------- utils.py | 270 +++++++++++++++--- 4 files changed, 992 insertions(+), 668 deletions(-) create mode 100644 main.ipynb diff --git a/.gitignore b/.gitignore index f7a1a02..ef7815b 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,4 @@ data/ .vscode/ backup/ sources/ +testing.ipynb diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..ce147ba --- /dev/null +++ b/main.ipynb @@ -0,0 +1,749 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import wget\n", + "import zipfile\n", + "import numpy as np\n", + "import pandas as pd\n", + "import networkx as nx\n", + "import plotly.graph_objects as go\n", + "from utils import *\n", + "from collections import Counter\n", + "from tqdm import tqdm\n", + "import time\n", + "\n", + "# ignore warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import the graphs from the saved files\n", + "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n", + "G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n", + "G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n", + "G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n", + "\n", + "G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n", + "G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n", + "G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n", + "G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Discovering the datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To perform our analysis, we will use the following datasets:\n", + "\n", + "- **Brightkite**\n", + "- **Gowalla**\n", + "- **Foursquare**\n", + "\n", + "We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n", + "\n", + "```\n", + "data\n", + "├── brightkite\n", + "│ ├── brightkite_checkins.txt\n", + "│ └── brightkite_friends_edges.txt\n", + "├── foursquare\n", + "│ ├── foursquare_checkins.txt\n", + "│ ├── foursquare_friends_edges.txt\n", + "│ └── raw_POIs.txt\n", + "└── gowalla\n", + " ├── gowalla_checkins.txt\n", + " └── gowalla_friends_edges.txt\n", + "```\n", + "\n", + "If any of the datasets is already downloaded, it will not be downloaded again. 
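Once `download_datasets()` has run, the layout above can be verified with a quick, optional check like the one below. This is only a convenience sketch: the file names are copied verbatim from the tree shown here, nothing else is assumed.

```python
import os

# Optional sanity check: verify the folder layout described above.
expected = {
    "brightkite": ["brightkite_checkins.txt", "brightkite_friends_edges.txt"],
    "foursquare": ["foursquare_checkins.txt", "foursquare_friends_edges.txt", "raw_POIs.txt"],
    "gowalla":    ["gowalla_checkins.txt", "gowalla_friends_edges.txt"],
}

for folder, files in expected.items():
    for file in files:
        path = os.path.join("data", folder, file)
        status = "ok" if os.path.isfile(path) else "MISSING"
        print(f"{path}: {status}")
```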
For further details about the function below, please refer to the `utils` module.\n", + "\n", + "> NOTE: the Stanford servers tends to be slow, so it may take a while to download the datasets. It's gonna take about 5 minutes to download all the datasets.\n", + "\n", + "---\n", + "\n", + "### A deeper look at the datasets\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "download_datasets()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a deeper look at them.\n", + "\n", + "## Brightkite\n", + "\n", + "[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. We will work with two different datasets. This is how they look like after being filtered by the `download_dataset` function:\n", + "\n", + "- `data/brightkite/brightkite_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list, in the next section we will see how to convert it into a graph. Originally there were other columns, such as the time of the checkins. During the filtering, we used this information to extract only the checkins from 2009 and then deleted it. This is why the number of checkins is smaller than the original dataset. \n", + " \n", + "- `data/brightkite/brightkite_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids. This file it's untouched by the function, it's in the form of a graph edge list." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gowalla\n", + "\n", + "Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. As for Brightkite, we will work with two different datasets. This is how they look like after being filtered by the `download_dataset` function:\n", + "\n", + "- `data/gowalla/gowalla_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list. Originally there were other columns, such as the time of the checkins. During the filtering, we used this information to extract only the checkins from 2009 and then deleted it. This is why the number of checkins is smaller than the original dataset. \n", + "\n", + "- `data/gowalla/gowalla_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids. This file it's untouched by the function, it's in the form of a graph edge list. In the next section when we will build the friendship network, we will only consider the users that have at least one check-in in 2009 to avoid having biases in the analysis." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Foursquare\n", + "\n", + "[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 22 months from Apr. 2012 to Jan. 2014) global-scale check-in data collected from Foursquare, and also two snapshots of user social networks before and after the check-in data collection period (see more details in our paper). 
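Before moving on to the Foursquare files, here is a rough sketch of the year-based filtering described above for Brightkite and Gowalla. It is illustrative only: the column names follow the raw check-in format (user, time, latitude, longitude, venue), while the input path is a placeholder, not the exact file name handled by `download_datasets`.

```python
import pandas as pd

# Illustrative sketch of the filtering step: keep only check-ins from 2009
# and drop every column except user id and venue id.
# "raw_checkins.txt" is a placeholder path, not the real file name.
df = pd.read_csv("data/gowalla/raw_checkins.txt", sep="\t", header=None,
                 names=["user_id", "check-in time", "latitude", "longitude", "venue_id"])

df["check-in time"] = pd.to_datetime(df["check-in time"])
df = df[df["check-in time"].dt.year == 2009]
df = df[["user_id", "venue_id"]]

df.to_csv("data/gowalla/gowalla_checkins.txt", sep="\t", header=False, index=False)
```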
We will work with three different datasets:\n", + "\n", + "- `data/foursquare/foursquare_checkins.txt`: a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list. This fill will remain untouched by the function but due to its size, in the next sections we will focus on the EU sub-sample and the IT sub-sample. The friendship edge list will be modified accordingly.\n", + "\n", + "- `data/foursquare/foursquare_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids. This is in the form of a graph edge list. \n", + "\n", + "- `data/foursquare/raw_POIs.txt`: the POIS, a tsv file with 2 columns of location and country ISO. We are going to use this file to create the sub-samples of the dataset.\n", + "\n", + "> **NOTE:** In this case I preferred not to take sub-samples based on time. The reason is that there may be a period of time where the social network was not very popular in some countries, so the analysis may be biased. Instead, I decided to take sub-samples based on the country. In this way I have a more homogeneous dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building the networks" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are going to build the the networks for the three datasets as an undirected graph $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicates that two individuals visited the same location at least once.\n", + "\n", + "The check-ins files of the three datasets are not in the form of a graph edge list, so we need to manipulate them. Let's have a look at the number of lines of each file (note that gowalla and brightkite are already filtered)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def count_lines_and_unique_elements(file):\n", + " df = pd.read_csv(file, sep='\\t', header=None)\n", + " print('Number of lines: ', len(df))\n", + " print('Number of unique elements: ', len(df[0].unique()))\n", + "\n", + "gowalla_path = os.path.join('data', 'gowalla', 'gowalla_checkins.txt')\n", + "brightkite_path = os.path.join('data', 'brightkite', 'brightkite_checkins.txt')\n", + "foursquare_path = os.path.join('data', 'foursquare', 'foursquare_checkins.txt')\n", + "\n", + "_ = [gowalla_path, brightkite_path, foursquare_path]\n", + "\n", + "for path in _:\n", + " print(path.split(os.sep)[-2])\n", + " count_lines_and_unique_elements(path)\n", + " print()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We would like to build a graph starting from an edge list. To do that, we are going to check, for each venue, all the users that visited it. Then, we will create an edge between each pair of users that visited the same venue (avoiding repetitions). This can be easily done in python, but it's going to be a bit slow (this is why we are considering sub-samples of the datasets). Let's see how to do it.\n", + "\n", + "```python\n", + "# let df be the dataframe [\"user_id\", \"venue_id\"] of the checkins\n", + "\n", + "venues_users = df.groupby(\"venue_id\")[\"user_id\"].apply(set)\n", + "\n", + " for users in venues_users:\n", + " for user1, user2 in combinations(users, 2):\n", + " G.add_edge(user1, user2)\n", + "```\n", + "\n", + "It the `utilis.py` module, we have a function that does exactly this called `create_graph_from_checkins`. 
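As a self-contained toy version of the snippet above (with made-up data, just to make the projection concrete):

```python
import pandas as pd
import networkx as nx
from itertools import combinations

# Toy check-ins dataframe: which user visited which venue (hypothetical data).
df = pd.DataFrame({
    "user_id":  [1, 2, 3, 2, 3, 4],
    "venue_id": ["a", "a", "a", "b", "b", "c"],
})

G = nx.Graph()
venues_users = df.groupby("venue_id")["user_id"].apply(set)

for users in venues_users:
    # connect every pair of users that visited the same venue
    for user1, user2 in combinations(users, 2):
        G.add_edge(user1, user2)

print(G.number_of_nodes(), G.number_of_edges())  # 3 nodes, 3 edges
```

Note that a user who never shares a venue with anyone (user 4 here) simply does not appear in the check-ins graph. The function `create_graph_from_checkins` mentioned above applies the same idea at full scale.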
It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n", + "\n", + "- `brightkite`\n", + "- `gowalla`\n", + "- `foursquareEU`\n", + "- `foursquareIT`\n", + "\n", + "Let's see how it works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# It takes about 3 minutes to create the all the 4 graphs on a i7-8750H CPU\n", + "\n", + "G_brighkite_checkins = create_graph_from_checkins('brightkite')\n", + "G_brighkite_checkins.name = 'Brightkite Checkins Graph'\n", + "\n", + "G_gowalla_checkins = create_graph_from_checkins('gowalla')\n", + "G_gowalla_checkins.name = 'Gowalla Checkins Graph'\n", + "\n", + "G_foursquareEU_checkins = create_graph_from_checkins('foursquareEU')\n", + "G_foursquareEU_checkins.name = 'Foursquare EU Checkins Graph'\n", + "\n", + "G_foursquareIT_checkins = create_graph_from_checkins('foursquareIT')\n", + "G_foursquareIT_checkins.name = 'Foursquare IT Checkins Graph'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Friendship network\n", + "\n", + "Now we want to create a graph where two users are connected if they are friends in the social network. We are intending the concept of friendship in a \"facebook way\", not a \"twitter way\". Less empirically, the graphs is not going to be directed and the edges are not going to be weighted. A user can't be friend with himself, and can't be friend with a user without the user being friend with him.\n", + "\n", + "Since we filtered the checkins for foursquare and gowalla, we are considering only the users that are also present in the check-ins graph. We can build this graph with the function `create_friendships_graph` in the `utils.py` module. It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n", + "\n", + "- `brightkite`\n", + "- `gowalla`\n", + "- `foursquareEU`\n", + "- `foursquareIT`\n", + "\n", + "> **NOTE:** This functions is implemented without the necessity of the checkins graphs being loaded in memory, it uses the edge list file. This choice was made since someone may want to perform some analysis only on the friendship network and so there is no need to load the checkins graph and waste memory. 
Furthermore, networkx is tremendously slow when loading a graph from an edge list file (since it's written in pure python), so this choice is also motivated by the speed of the function.\n", + "\n", + "Let's see how it works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "G_brighkite_friends = create_friendships_graph('brightkite')\n", + "print(\"Computation done for Brightkite friendship graph\")\n", + "G_brighkite_friends.name = 'Brightkite Friendship Graph'\n", + "\n", + "\n", + "G_gowalla_friends = create_friendships_graph('gowalla')\n", + "print(\"Computation done for (filtered) Gowalla friendship graph\")\n", + "G_gowalla_friends.name = '(Filtered) Gowalla Friendship Graph'\n", + "\n", + "\n", + "G_foursquareIT_friends = create_friendships_graph('foursquareIT')\n", + "print(\"Computation done for Foursquare IT friendship graph\")\n", + "G_foursquareIT_friends.name = 'Foursquare IT Friendship Graph'\n", + "\n", + "\n", + "G_foursquareEU_friends = create_friendships_graph('foursquareEU')\n", + "print(\"Computation done for Foursquare EU friendship graph\")\n", + "G_foursquareEU_friends.name = 'Foursquare EU Friendship Graph'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our graphs, let's have a look at some basic information about them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for G in [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]:\n", + " print(G.name)\n", + " print('Number of nodes: ', G.number_of_nodes())\n", + " print('Number of edges: ', G.number_of_edges())\n", + " print()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Proprieties of the structure of the networks\n", + "" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Introduzione da scrivere\n", + "\n", + "qualcosa\n", + "\n", + "---\n", + "\n", + "To help us visualize the results of our analysis we can create a dataframe and fill it with all the information that we will retrive from our networks in this section." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n", + "\n", + "checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n", + "friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n", + "\n", + "graphs_all = checkins_graphs + friendships_graph\n", + "\n", + "for graph in graphs_all:\n", + " analysis_results = analysis_results.append(\n", + " {'Graph': graph.name, \n", + " 'Number of Nodes': graph.number_of_nodes(), \n", + " 'log N': np.log(graph.number_of_nodes()),\n", + " 'Number of Edges': graph.number_of_edges()}, \n", + " ignore_index=True)\n", + "\n", + "analysis_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Average Degree\n", + "\n", + "The degree of a node is the number of links connected to it. The average degree alone, is not very useful for our future analysis, so we won't spend much time about it. 
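One small cross-check worth keeping in mind: for an undirected graph the average degree is simply $\langle k \rangle = 2E/N$, so it can also be recomputed from the node and edge counts already stored in the dataframe. The one-liner below is a hypothetical convenience, not a required step of the analysis.

```python
# Hypothetical cross-check: <k> = 2E / N for an undirected graph,
# using the counts already present in analysis_results.
analysis_results["Average Degree (2E/N)"] = (
    2 * analysis_results["Number of Edges"] / analysis_results["Number of Nodes"]
)
```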
In the next section we will see that the degree distribution is a much more useful measure.\n", + "\n", + "The degree distribution, $P(k)$, is the fraction of sites having degree $k$. We know from the literature that many real networks do not exhibit a Poisson degree distribution, as predicted in the ER model. In fact, many of them exhibit a distribution with a long, power-law, tail, $P(k) \\sim k^{-\\gamma}$ with some $γ$, usually between $2$ and 3$.\n", + "\n", + "For know, we will just compute the average degree of our networks and add it to the dataframe.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for G in graphs_all:\n", + " avg_deg = np.mean([d for n, d in G.degree()])\n", + " analysis_results.loc[analysis_results['Graph'] == G.name, 'Average Degree'] = avg_deg" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clustering coefficient\n", + "\n", + "The clustering coefficient is usually related to a community represented by local structures. The usual definition of clustering is related to the number of triangles in the network. The clustering is high if two nodes sharing a neighbor have a high probability of being connected to each other. There are two common definitions of clustering. The first is global,\n", + "\n", + "\\begin{equation}\n", + " C = \\frac{3 \\times \\text{the number of triangles in the network}}{\\text{the number of connected triples of vertices}}\n", + "\\end{equation}\n", + "\n", + "where a “connected triple” means a single vertex with edges running to an unordered\n", + "pair of other vertices. \n", + "\n", + "A second definition of clustering is based on the average of the clustering for single nodes. The clustering for a single node is the fraction of pairs of its linked neighbors out of the total number of pairs of its neighbors:\n", + "\n", + "\\begin{equation}\n", + " C_i = \\frac{\\text{the number of triangles connected to vertex }i}{\\text{the number of triples centered on vertex } i}\n", + "\\end{equation}\n", + "\n", + "For vertices with degree $0$ or $1$, for which both numerator and denominator are zero, we use $C_i = 0$. Then the clustering coefficient for the whole network is the average\n", + "\n", + "\\begin{equation}\n", + " C = \\frac{1}{n} \\sum_{i} C_i\n", + "\\end{equation}\n", + "\n", + "In both cases the clustering is in the range $0 \\leq C \\leq 1$. \n", + "\n", + "In random graph models such as the ER model and the configuration model, the clustering coefficient is low and decreases to $0$ as the system size increases. This is also the situation in many growing network models. However, in many real-world networks the clustering coefficient is rather high and remains constant for large network sizes. This observation led to the introduction of the small-world model, which offers a combination of a regular lattice with high clustering and a random graph. \n", + "\n", + "---\n", + "\n", + "As one can imagine by the definition given above, this operation is very expensive. The library `networkx` provides a function to compute the clustering coefficient of a graph. In particular, the function `average_clustering` computes the average clustering coefficient of a graph. 
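On a small graph the exact computation is a one-liner, for example on the classic Zachary karate club network bundled with networkx:

```python
import networkx as nx

# Exact average clustering coefficient on a small built-in graph.
# Instantaneous at this scale, but the same exact computation does not
# scale to our check-in graphs.
G_toy = nx.karate_club_graph()
print(nx.average_clustering(G_toy))  # ~0.57
```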
Unfortunately, since our dataset (even after sub-sampling) are too big to be processed exactly in decent times.\n", + "\n", + "We can use the `average_clustering` function from the `utils` module to compute the average clustering coefficient on a random sub-sample of the graph. The functions takes as input:\n", + "\n", + "- `G: networkx graph object`: the graph on which we want to compute the average clustering coefficient\n", + "- `k: int (default=None)`: percentage of nodes to remove from the graph. If k is None, the average clustering coefficient of each connected component is computed using all the nodes of the connected component.\n", + "\n", + "And returns:\n", + "\n", + "- `float`: the average clustering coefficient of the graph\n", + "\n", + "Depending on the machine and the time available, we can choose different values for `k`. Lower values will give us a more precise result, but will take longer to compute. On the other hand, higher values will give us a less precise result, but will be faster to compute. I suggest to use `k=0.9` to test very quickly the function, and at least `k=0.6` to get a more precise result.\n", + "\n", + "> Since the checkins graphs are way bigger then the friendship graphs, I created two for loop to compute the average clustering coefficient with different values of `k`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# With k = 0.6 for checkins graphs and k = 0.2 for friendship graphs it takes about 8 minutes to compute the average clustering coefficient of alla the graphs on a i7-8750H CPU. Since we are taking random samplings, this of course depends on the random seed.\n", + "\n", + "for graph in checkins_graphs:\n", + " print(\"\\nComputing average clustering coefficient for the {}...\".format(graph.name))\n", + " start = time.time()\n", + " avg_clustering = average_clustering_coefficient(graph, 0.6)\n", + " end = time.time()\n", + "\n", + " print(\"\\tAverage clustering coefficient: {}\".format(avg_clustering))\n", + " print(\"\\tCPU time: \" + str(round(end-start,1)) + \" seconds\")\n", + " analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'] = avg_clustering\n", + "\n", + "for graph in friendships_graph:\n", + " print(\"\\nComputing average clustering coefficient for the {}...\".format(graph.name))\n", + " start = time.time()\n", + " avg_clustering = average_clustering_coefficient(graph, 0.2)\n", + " end = time.time()\n", + "\n", + " print(\"\\tAverage clustering coefficient: {}\".format(avg_clustering))\n", + " print(\"\\tCPU time: \" + str(round(end-start,1)) + \" seconds\")\n", + " analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'] = avg_clustering" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use our formula to compute the clustering coefficient in a small world network" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Average Path Length\n", + "\n", + "Since we are considering our networks as _not_ embedded in real space (even if we could theoretically), the geometrical distance between nodes is meaningless. The most important distance measure in such networks is the minimal number of hops (or chemical distance). That is, the distance between two nodes in the network is defined as the number of edges in the shortest path between them. 
If the edges are assumed to be weighted, the lowest total weight path, called the _optimal path_, may also be used. The usual mathematical definition of the diameter of the network is the length of the path between the farthest nodes in the network.\n", + "\n", + "In the next section, we'll see how to characterize this distance in a small world network. \n", + "\n", + "--- \n", + "\n", + "The `networkx` library provides a function to compute the average shortest path length of a graph. In particular, the function `average_shortest_path_length` computes the average shortest path length of a graph. Unfortunately, as always, there are some limitations. The function can only be applied to connected graphs and since we are taking sub-samples of our datasets, there is a probability that the sub-sample is not connected. Another problem is that this operation is very expensive! The shortest path length is defined as\n", + "\n", + "$$ a = \\sum_{s \\in V} \\sum_{t \\in V} \\frac{d(s,t)}{n(n-1)} $$\n", + "\n", + "Where $V$ is the set of nodes in the graph, $n$ is the number of nodes in the graph, and $d(s,t)$ is the shortest path length between nodes $s$ and $t$. The default (and we are going to use) algorithm to compute the shortest path length is the Dijkstra algorithm. \n", + "\n", + "Since we are interested in the average shortest path length of all our connected components, for each node we need to run the Dijkstra algorithm on all the other nodes. Given the dimensions of our datasets and the slowness of networkx, computing the average shortest path length of the whole graph is not feasible.\n", + "\n", + "To overcome this problem, we can use the `average_shortest_path` function from the `utils` module to compute the average shortest path length on a random sub-sample of the graph. The functions takes as input:\n", + "\n", + "- `G: networkx graph object`: the graph on which we want to compute the average shortest path length\n", + "- `k: int (default=None)`: percentage of nodes to remove from the graph. If k is None, the average shortest path length of each connected component is computed using all the nodes of the connected component.\n", + "\n", + "And returns:\n", + "\n", + "- `float`: the average shortest path length of the graph\n", + "\n", + "The implementation is very straightforward. First we remove a random sub-sample of the nodes from the graph. Then we create a list with all the connected components of the sub-sampled graph with at least 10 nodes and finally we compute the average shortest path length using the networkx function `average_shortest_path_length`. The choice of 10 nodes is arbitrary and based on empirical observations. We do that to avoid creating small communities with a very low average shortest path length that could bias our results.\n", + "\n", + "Depending on the machine and the time available, we can choose different values for `k`. Lower values will give us a more precise result, but will take longer to compute. On the other hand, higher values will give us a less precise result, but will be faster to compute. 
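A minimal sketch of that strategy is given below, assuming `k` is the fraction of nodes to drop and that components with fewer than 10 nodes are discarded. The actual implementation lives in `utils.py` and may differ in details, for example in how the per-component averages are combined.

```python
import random
import networkx as nx

def approx_average_shortest_path(G: nx.Graph, k: float = None, min_size: int = 10, seed: int = 42) -> float:
    """Sketch of the sampling strategy described above (not the utils.py code)."""
    H = G.copy()
    if k is not None:
        random.seed(seed)
        drop = random.sample(list(H.nodes()), int(k * H.number_of_nodes()))
        H.remove_nodes_from(drop)

    # keep only reasonably large connected components
    components = [c for c in nx.connected_components(H) if len(c) >= min_size]

    # average of the per-component average shortest path lengths
    lengths = [nx.average_shortest_path_length(H.subgraph(c)) for c in components]
    return sum(lengths) / len(lengths)
```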
I suggest to use `k=0.9` to test very quickly the function, and at least `k=0.6` to get a more precise result.\n", + "\n", + "> Since the checkins graphs are way bigger then the friendship graphs, I created two for loop to compute the average clustering coefficient with different values of `k`.\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# With k = 0.6 for checkins graphs and k = 0.2 for friendship graphs it takes about 18 minutes end for all the graphs on a i7-8750H CPU. Since we are taking random samplings, this of course depends on the random seed\n", + "\n", + "for graph in checkins_graphs:\n", + " print(\"\\nComputing average shortest path length for graph: \", graph.name)\n", + "\n", + " start = time.time()\n", + " average_shortest_path_length = average_shortest_path(graph, 0.6)\n", + " end = time.time()\n", + "\n", + " print(\"\\tAverage shortest path length: {}\".format(round(average_shortest_path_length,2)))\n", + " print(\"\\tCPU time: \" + str(round(end-start,1)) + \" seconds\")\n", + "\n", + " \n", + " analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'] = average_shortest_path_length\n", + "\n", + "for graph in friendships_graph:\n", + " print(\"\\nComputing average shortest path length for graph: \", graph.name)\n", + "\n", + " start = time.time()\n", + " average_shortest_path_length = average_shortest_path(graph, 0.3)\n", + " end = time.time()\n", + "\n", + " print(\"\\tAverage shortest path length: {}\".format(round(average_shortest_path_length,2)))\n", + " print(\"\\tCPU time: \" + str(round(end-start,1)) + \" seconds\")\n", + "\n", + " \n", + " analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'] = average_shortest_path_length" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Betweenness Centrality\n", + "\n", + "The importance of a node in a network depends on many factors. A website may be important due to its content, a router due to its capacity. Of course, all of these properties depend on the nature\n", + "of the studied network, and may have very little to do with the graph structure of the network. We are particularly interested in the importance of a node (or a link) due to its topological function in the network. It is reasonable to assume that the topology of a network may dictate some intrinsic importance for different nodes. One measure of centrality can be the degree of a\n", + "node. The higher the degree, the more the node is connected, and therefore, the higher is its centrality in the network. However, the degree is not the only factor determining a node's importance \\s\n", + "\n", + "One of the most accepted definitions of centrality is based on counting paths going through a node. For each node, i, in the network, the number of “routing” paths to all other nodes (i.e., paths through which data flow) going through i is counted, and this number determines the centrality i. The most common selection is taking only\n", + "the shortest paths as the routing paths. 
This leads to the following definition: the \\emph{betweenness centrality} of a node, i, equals the number of shortest paths between all pairs of nodes in the network going through it, i.e.,\n", + "\n", + "\\begin{equation} \n", + " g(i) = \\sum_{\\{ j,k \\}} g_i (j,k)\n", + "\\end{equation}\n", + "\n", + "where the notation $\\{j, k\\}$ stands for summing each pair once, ignoring the order, and $g_i(j, k)$ equals $1$ if the shortest path between nodes $j$ and $k$ passes through node $i$ and $0$ otherwise. In fact, in networks with no weight (i.e., where all edges have the same length), there might be more than one shortest path. In that case, it is common to take $g_i(j, k) = C_i(j,k)/C(j,k)$, where $C(j,k)$ is the number of shortest paths between $j$ and $k$, and $C_i(j,k)$ is the number of those going through $i$. \\footnote{Several variations of this scheme exist, focusing, in particular, on how to count distinct shortest paths (if several shortest paths share some edges). These differences tend to have a very small statistical influence in random complex networks, where the number of short loops is small. Therefore, we will concentrate on the above definition. Another nuance is whether the source and destination are considered part of the shortest path.\n", + "\n", + "The usefulness of the betweenness centrality in identifying bottlenecks and important nodes in the network has led to applications in identifying communities in biological and social networks.\n", + "\n", + "--- \n", + "\n", + "Let's see how to compute this centrality measure on our networks. The networkx library has a function that computes the betweenness centrality of all nodes in a network. It is based on the algorithm proposed in the paper\n", + "\n", + "_- Ulrik Brandes, A Faster Algorithm for Betweenness Centrality, Journal of Mathematical Sociology, 25(2):163-177, 2001._\n", + "\n", + "Even if this is a very fast algorithm, it's node enough to run in a reasonable time on large networks. Using the same idea of the previous sections, we can take samplings of our original graph, obtaining an approximate results. Unfortunately, I observed that even with heavy sampling, the time required to run the algorithm is still very high. To avoid using even more heavier samplings (that would bias the results), I decided to use a different approach: parallelization!\n", + "\n", + "In the `utils` module I implemented a function called `betweenness_centrality_parallel`. The function takes as input\n", + "\n", + "- `G: networkx graph object`: the graph on which we want to compute the average shortest path length\n", + "- `processes : int (optional)` The number of processes to use for computation. If `None` (default), processes is set to 1 and the standard betweenness algorithm is used.\n", + "- `k: int (default=None)`: percentage of nodes to remove from the graph. If k is None, the average shortest path length of each connected component is computed using all the nodes of the connected component.\n", + "\n", + "> **Memory Note:** Do not use more then 6 process for big graphs, otherwise the memory will be full. Do it only if you have more at least 32 GB of RAM. For small graphs, you can use more processes.\n", + "\n", + "The implemented functions divide the network in chunk of nodes and compute their contribution to the betweenness centrality of the whole network. Each chunk is computed in parallel, and the results are summed up to obtain the final result. 
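Conceptually, the chunk-and-sum step can be illustrated sequentially on a toy graph. This shows only the idea; the real parallel version, with its memory caveats, is `betweenness_centrality_parallel` in `utils.py`.

```python
import networkx as nx

# Toy, sequential illustration of the map/reduce idea: split the source nodes
# into chunks, compute each chunk's contribution, then sum the contributions.
G = nx.erdos_renyi_graph(200, 0.05, seed=42)
nodes = list(G)
node_chunks = [nodes[i::4] for i in range(4)]  # 4 chunks of source nodes

partials = [
    nx.betweenness_centrality_subset(G, sources=chunk, targets=nodes)
    for chunk in node_chunks
]

betweenness = {n: sum(p[n] for p in partials) for n in G}
```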
The function returns a dictionary with the betweenness centrality of each node. For more information, see the function code in the `utils` module.\n", + "\n", + "Depending on the machine and the time available, we can choose different values for `k`. Lower values will give us a more precise result, but will take longer to compute. On the other hand, higher values will give us a less precise result, but will be faster to compute. I suggest to use `k=0.9` to test very quickly the function, and at least `k=0.6` to get a more precise result.\n", + "\n", + "> Since the checkins graphs are way bigger then the friendship graphs, I created two for loop to compute the average clustering coefficient with different values of `k`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# it takes about 6 minutes to compute the betweenness centrality for all the graphs with 6 processes with k = 0.7. Change the value of k to speed up the computation (at the cost of accuracy). \n", + "\n", + "for graph in checkins_graphs:\n", + " print(\"\\nComputing the approximate betweenness centrality for the {}...\".format(graph.name))\n", + " start = time.time()\n", + " betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6, k = 0.7).values()))\n", + " end = time.time()\n", + " print(\"\\tBetweenness centrality: {} \".format(betweenness_centrality))\n", + " print(\"\\tCPU time: \" + str(round(end-start,1)) + \" seconds\")\n", + "\n", + " analysis_results.loc[analysis_results['Graph'] == graph.name, 'betweenness centrality'] = betweenness_centrality\n", + "\n", + "for graph in friendships_graph:\n", + " print(\"\\nComputing the approximate betweenness centrality for the {}...\".format(graph.name))\n", + " start = time.time()\n", + " betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6, k = 0.3).values()))\n", + " end = time.time()\n", + " print(\"\\tBetweenness centrality: {} \".format(betweenness_centrality))\n", + " print(\"\\tCPU time: \" + str(round(end-start,1)) + \" seconds\")\n", + "\n", + " analysis_results.loc[analysis_results['Graph'] == graph.name, 'betweenness centrality'] = betweenness_centrality\n", + " \n", + "analysis_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save the results as pandas dataframe object\n", + "analysis_results.to_pickle('analysis_results.pkl')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Small-Worldness" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/testing.ipynb b/testing.ipynb index a38229b..169e55c 100644 --- a/testing.ipynb +++ b/testing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -26,646 +26,28 @@ "warnings.filterwarnings(\"ignore\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "# Discovering the datasets" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To perform our analysis, we will use the following datasets:\n", - "\n", - "- **Brightkite**\n", - "- **Gowalla**\n", - "- **Foursquare**\n", - "\n", - "We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n", - "\n", - "```\n", - "data\n", - "├── brightkite\n", - "│ ├── brightkite_checkins.txt\n", - "│ └── brightkite_friends_edges.txt\n", - "├── foursquare\n", - "│ ├── foursquare_checkins.txt\n", - "│ ├── foursquare_friends_edges.txt\n", - "│ └── raw_POIs.txt\n", - "└── gowalla\n", - " ├── gowalla_checkins.txt\n", - " └── gowalla_friends_edges.txt\n", - "```\n", - "\n", - "If any of the datasets is already downloaded, it will not be downloaded again. For further details about the function below, please refer to the `utils` module.\n", - "\n", - "> NOTE: the Stanford servers tends to be slow, so it may take a while to download the datasets. It's gonna take about 5 minutes to download all the datasets.\n", - "\n", - "---\n", - "\n", - "### A deeper look at the datasets\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "download_datasets()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's have a deeper look at them.\n", - "\n", - "## Brightkite\n", - "\n", - "[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. We will work with two different datasets. This is how they look like after being filtered by the `download_dataset` function:\n", - "\n", - "- `data/brightkite/brightkite_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids. This file it's untouched by the function, it's in the form of a graph edge list.\n", - "\n", - "\n", - "- `data/brightkite/brightkite_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list, in the next section we will see how to convert it into a graph. Originally there were other columns, but we will not use them." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Gowalla\n", - "\n", - "Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. As for Brightkite, we will work with two different datasets. This is how they look like after being filtered by the `download_dataset` function:\n", - "\n", - "- `data/gowalla/gowalla_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list. Originally there were other columns, such as the time of the checkins. During the filtering, we used this information to extract only the checkins from 2009 and then deleted it. This is why the number of checkins is smaller than the original dataset. \n", - "\n", - "- `data/gowalla/gowalla_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids. This file it's untouched by the function, it's in the form of a graph edge list. 
In the next section when we will build the friendship network, we will only consider the users that have at least one check-in in 2009 to avoid having biases in the analysis." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Foursquare\n", - "\n", - "[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 22 months from Apr. 2012 to Jan. 2014) global-scale check-in data collected from Foursquare, and also two snapshots of user social networks before and after the check-in data collection period (see more details in our paper). We will work with three different datasets:\n", - "\n", - "- `data/foursquare/foursquare_checkins.txt`: a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list. This fill will remain untouched by the function but due to its size, in the next sections we will focus on the EU sub-sample and the IT sub-sample. The friendship edge list will be modified accordingly.\n", - "\n", - "- `data/foursquare/foursquare_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids. This is in the form of a graph edge list. \n", - "\n", - "- `data/foursquare/raw_POIs.txt`: the POIS, a tsv file with 2 columns of location and country ISO. We are going to use this file to create the sub-samples of the dataset.\n", - "\n", - "> **NOTE:** In this case I preferred not to take sub-samples based on time. The reason is that there may be a period of time where the social network was not very popular in some countries, so the analysis may be biased. Instead, I decided to take sub-samples based on the country. In this way I have a more homogeneous dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Building the networks" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are asked to construct the networks for the three datasets as an undirected graph $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicates that two individuals visited the same location at least once.\n", - "\n", - "The check-ins files of the three datasets are not in the form of a graph edge list, so we need to manipulate them. Let's have a look at the number of lines of each file (note that gowalla is already filtered, only 2009 data are present)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def count_lines_and_unique_elements(file):\n", - " df = pd.read_csv(file, sep='\\t', header=None)\n", - " print('Number of lines: ', len(df))\n", - " print('Number of unique elements: ', len(df[0].unique()))\n", - "\n", - "gowalla_path = os.path.join('data', 'gowalla', 'gowalla_checkins.txt')\n", - "brightkite_path = os.path.join('data', 'brightkite', 'brightkite_checkins.txt')\n", - "foursquare_path = os.path.join('data', 'foursquare', 'foursquare_checkins.txt')\n", - "\n", - "_ = [gowalla_path, brightkite_path, foursquare_path]\n", - "\n", - "for path in _:\n", - " print(path.split(os.sep)[-2])\n", - " count_lines_and_unique_elements(path)\n", - " print()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We would like to build a graph starting from an edge list. To do that, we are going to check, for each venue, all the users that visited it. 
Then, we will create an edge between each pair of users that visited the same venue (avoid repetitions). This can be easily done in python, but it's going to be a bit slow (this is why we are considering sub-samples of the datasets). Let's see how to do it.\n", - "\n", - "```python\n", - "# let df be the dataframe [\"user_id\", \"venue_id\"] of the checkins\n", - "\n", - "venues_users = df.groupby(\"venue_id\")[\"user_id\"].apply(set)\n", - "\n", - " for users in venues_users:\n", - " for user1, user2 in combinations(users, 2):\n", - " G.add_edge(user1, user2)\n", - "```\n", - "\n", - "It the `utilis.py` module, we have a function that does exactly this called `create_graph_from_checkins`. It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n", - "\n", - "- `brightkite`\n", - "- `gowalla`\n", - "- `foursquareEU`\n", - "- `foursquareIT`\n", - "\n", - "Let's see how it works:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# It takes about 4 minutes to create the all the 4 graphs on a i7-8750H CPU\n", - "\n", - "G_brighkite_checkins = create_graph_from_checkins('brightkite')\n", - "G_brighkite_checkins.name = 'Brightkite Checkins Graph'\n", - "\n", - "G_gowalla_checkins = create_graph_from_checkins('gowalla')\n", - "G_gowalla_checkins.name = 'Gowalla Checkins Graph'\n", - "\n", - "G_foursquareEU_checkins = create_graph_from_checkins('foursquareEU')\n", - "G_foursquareEU_checkins.name = 'Foursquare EU Checkins Graph'\n", - "\n", - "G_foursquareIT_checkins = create_graph_from_checkins('foursquareIT')\n", - "G_foursquareIT_checkins.name = 'Foursquare IT Checkins Graph'" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Friendship network\n", - "\n", - "Now we want to create a graph where two users are connected if they are friends in the social network. We are intending the concept of friendship in a \"facebook way\", not a \"twitter way\". Less empirically, the graphs is not going to be directed and the edges are not going to be weighted. A user can't be friend with himself, and can't be friend with a user without the user being friend with him.\n", - "\n", - "Since we filtered the checkins for foursquare and gowalla, we are considering only the users that are also present in the check-ins graph. We can build this graph with the function `create_friendships_graph` in the `utils.py` module. It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n", - "\n", - "- `brightkite`\n", - "- `gowalla`\n", - "- `foursquareEU`\n", - "- `foursquareIT`\n", - "\n", - "> **NOTE:** This functions is implemented without the necessity of the checkins graphs being loaded in memory, it uses the edge list file. This choice was made since someone may want to perform some analysis only on the friendship network and so there is no need to load the checkins graph and waste memory. 
Furthermore, networkx is tremendously slow when loading a graph from an edge list file (since it's written in pure python), so this choice is also motivated by the speed of the function.\n", - "\n", - "Let's see how it works:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "G_brighkite_friends = create_friendships_graph('brightkite')\n", - "print(\"Computation done for Brightkite friendship graph\")\n", - "G_brighkite_friends.name = 'Brightkite Friendship Graph'\n", - "\n", - "G_gowalla_friends = create_friendships_graph('gowalla')\n", - "print(\"Computation done for (filtered) Gowalla friendship graph\")\n", - "G_gowalla_friends.name = '(Filtered) Gowalla Friendship Graph'\n", - "\n", - "G_foursquareIT_friends = create_friendships_graph('foursquareIT')\n", - "print(\"Computation done for Foursquare IT friendship graph\")\n", - "G_foursquareIT_friends.name = 'Foursquare IT Friendship Graph'\n", - "\n", - "G_foursquareEU_friends = create_friendships_graph('foursquareEU')\n", - "print(\"Computation done for Foursquare EU friendship graph\")\n", - "G_foursquareEU_friends.name = 'Foursquare EU Friendship Graph'\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have our graphs, let's have a look at some basic information about them" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for G in [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]:\n", - " print(G.name)\n", - " print('Number of nodes: ', G.number_of_nodes())\n", - " print('Number of edges: ', G.number_of_edges())\n", - " print()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Analysis of the structure of the networks\n", - "" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Degree distribution\n", - "\n", - "" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n", - "\n", - "for graph in checkins_graphs:\n", - " degree_distribution(graph, log=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "--- \n", - "\n", - "Let's see how does it changes for the friendship networks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n", - "\n", - "for graph in friendships_graph:\n", - " degree_distribution(graph, log=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We may be curious to see if the whole friendship network has a different degree distribution than the filtered one. 
Let's see if there are any differences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "G1 = nx.read_edgelist('data/brightkite/brightkite_friends_edges.txt', nodetype=int)\n", - "G1.name = 'Brightkite Friendship Graph'\n", - "G2 = nx.read_edgelist('data/gowalla/gowalla_friends_edges.txt', nodetype=int)\n", - "G2.name = 'Gowalla Friendship Graph'\n", - "G3 = nx.read_edgelist('data/foursquare/foursquare_friends_edges.txt', nodetype=int)\n", - "G3.name = 'Foursquare Friendship Graph'\n", - "\n", - "degree_distribution(G1, log=True)\n", - "degree_distribution(G2, log=True)\n", - "degree_distribution(G3, log=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we can see, there are no notable differences, and this is not surprising. We where only taking into considerations some edge cases. Maybe in siberia this was a very popular social network, but since it's a very harsh environment, being friends on the social network was it's not synonymous of visiting the same places together (where do you go in siberia?). " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "--- \n", - "\n", - "Now, we can compute the average degree for each checkins graph and for the friendship graph" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a dataframe with the average degree for each graph\n", - "average_degree = pd.DataFrame(columns=['Graph', 'Average Degree'], index=None)\n", - "\n", - "for graph in tqdm(checkins_graphs):\n", - " average_degree = average_degree.append({'Graph': graph.name, 'Average Degree': np.mean(list(dict(graph.degree()).values()))}, ignore_index=True)\n", - "\n", - "for graph in tqdm(friendships_graph):\n", - " average_degree = average_degree.append({'Graph': graph.name, 'Average Degree': np.mean(list(dict(graph.degree()).values()))}, ignore_index=True)\n", - "\n", - "print(average_degree) " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Clustering coefficient\n", - "\n", - "\n", - "--- \n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n", - "\n", - "graphs_all = checkins_graphs + friendships_graph" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# It's going to take a while (about 10 minutes). The time complexity is O(n^2) since we are iterating over all the nodes and their neighbors. 
\n", - "\n", - "clustering_results = pd.DataFrame(columns=['Graph', 'Average Clustering Coefficient'], index=None)\n", - "\n", - "for graph in friendships_graph:\n", - " print(graph.name)\n", - " clustering_results = clustering_results.append(\n", - " {'Graph': graph.name, \n", - " 'Number of Nodes': graph.number_of_nodes(),\n", - " 'Number of Edges': graph.number_of_edges(),\n", - " 'Average Clustering Coefficient': nx.average_clustering(graph),\n", - " 'log N': np.log(graph.number_of_nodes()),\n", - " 'Average Shortest Path Length': mean_shortest_path(graph), \n", - " 'betweenness centrality': nx.betweenness_centrality(G)}, \n", - " ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(clustering_results)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can use our formula to compute the clustering coefficient in a small world network" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Average Path Length\n", - "\n", - "" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Betweenness Centrality\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "betweenness_results = pd.DataFrame(columns=['Graph', 'Betweenness Centrality'])\n", - "\n", - "for graph in checkins_graphs:\n", - " betweenness_results = betweenness_results.append(\n", - " {'Graph': graph.name,\n", - " 'Betweenness Centrality': np.mean(list(nx.betweenness_centrality(graph).values()))}, \n", - " ignore_index=True)\n", - "\n", - "betweenness_results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def small_world_clustering(graph: nx.Graph):\n", - " tmp = 0\n", - " for node in tqdm(graph.nodes()):\n", - " k = len(list(graph.neighbors(node)))\n", - " if k >=1:\n", - " tmp += (3*(k-1))/(2*(2*k-1))\n", - " return tmp/graph.number_of_nodes()\n", - "\n", - "print(\"Clustering coefficient for the Watts-Strogatz graph: \", small_world_clustering(G_ws))\n", + "# import the graphs from the saved files\n", + "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n", + "G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n", + "G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n", + "G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n", "\n", - "print(\"Clustering coefficient for the Brightkite checkins graph: \", small_world_clustering(G_brighkite_checkins))" + "G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n", + "G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n", + "G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n", + "G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.6 64-bit", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/utils.py b/utils.py index 
1e731d5..9f9b12c 100644 --- a/utils.py +++ b/utils.py @@ -6,7 +6,10 @@ NOTEs: - Why do I use os.path.join and not the "/"? Because it's more portable, it works on every OS, while "/" works only on Linux and Mac. In windows you would have to change all the "/" with "\". With os.path.join you don't have to worry about it and, as always, f*** Microsoft. """ +from multiprocessing import Pool +import itertools import os +import random import wget import zipfile import pandas as pd @@ -18,7 +21,7 @@ import plotly.graph_objects as go from collections import Counter import numpy as np import gdown - +from networkx.utils import py_random_state # ------------------------------------------------------------------------# @@ -92,7 +95,9 @@ def download_datasets(): for file in os.listdir(os.path.join("data", "brightkite")): if file.endswith("_totalCheckins.txt"): df = pd.read_csv(os.path.join("data", "brightkite", file), sep="\t", header=None, names=["user_id", "check-in time", "latitude", "longitude", "venue_id"]) - df = df[["user_id", "venue_id"]] + df["check-in time"] = pd.to_datetime(df["check-in time"]) + df = df[df["check-in time"].dt.year == 2010] + df = df.drop(["check-in time", "latitude", "longitude"], axis=1) df.to_csv(os.path.join("data", "brightkite", "brightkite_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8") os.remove(os.path.join("data", "brightkite", file)) @@ -114,9 +119,29 @@ def download_datasets(): # ------------------------------------------------------------------------# - def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT'], create_file = True) -> nx.Graph: + """ + Create a graph from the checkins of the dataset. The graph is undirected and the nodes are the users and the edges are the checkins in common. + + Parameters + ---------- + `dataset` : Literal['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT'] + The dataset to use. + `create_file` : bool, optional + If True, the graph is saved in a file, by default True + + Returns + ------- + `G` : networkx.Graph + + Raises + ------ + ValueError + If the dataset is not valid. + + """ + if dataset not in ['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT']: raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquareEU, foursquareUS, foursquareIT") @@ -166,7 +191,7 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq elif dataset == "foursquareEU": # list of the countries in the EU - EU_countries = ['AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK'] + EU_countries = ['AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK'] venues_array = df_POIS[df_POIS['country code'].isin(EU_countries)]['venue_id'].values @@ -201,7 +226,22 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT']) -> nx.Graph: """ - This function takes in input a tsv file with two columns, Each line in the file is an edge. The function returns an undirected networkx graph object. + Create the graph of friendships for the dataset brightkite, gowalla or foursquare. + The graph is saved in a file. 
+
+    Parameters
+    ----------
+    `dataset` : str
+        The dataset for which we want to create the graph of friendships.
+
+    Returns
+    -------
+    `G` : networkx.Graph
+        The graph of friendships.
+
+    Notes
+    -----
+    Since we are taking sub-samples of each check-in dataset, we are also taking sub-samples of the friendship graph. A user is included in the friendship graph only if they have at least one check-in in the sub-sample.
     """
 
     if dataset not in ["brightkite", "gowalla", "foursquareEU", "foursquareIT"]:
@@ -231,10 +271,11 @@ def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursqua
         # create the graph
         G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
+        del df_friends_all, df_checkins, df
 
         return G
 
-    elif dataset == "gowalla":
+    elif dataset in ["brightkite", "gowalla"]:
         file = os.path.join("data", dataset, dataset + "_friends_edges.txt")
         df_friends_all = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
@@ -250,22 +291,36 @@ def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursqua
         df.to_csv(os.path.join("data", dataset, dataset + "_friends_edges_filtered.tsv"), sep="\t", header=False, index=False)
 
         G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
-        return G
-
-    elif dataset == "brightkite":
-        file = os.path.join("data", dataset, dataset + "_friends_edges.txt")
-        df_friends_all = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
+        del df_friends_all, df_checkins, df
 
-        G = nx.from_pandas_edgelist(df_friends_all, "node1", "node2", create_using=nx.Graph())
         return G
 
+# ------------------------------------------------------------------------#
+
 def degree_distribution(G: nx.Graph, log: bool = True, save: bool = False) -> None:
 
     """
-    This function takes in input a networkx graph and as options:
-        - log = True/False (default = True)
-        - save = True/False (default = False)
-    The functions plots, using the plotly library, the degree distribution of the graph. If log = True, the plot is in log-log scale. If save = True, the plot is saved in the folder "plots" with the name "degree_distribution_{}.png" where {} is the name of the graph in input.
+    This function takes in input a networkx graph object and plots the degree distribution of the graph.
+
+    Parameters
+    ----------
+    `G` : networkx graph object
+        The graph object
+
+    `log` : bool, optional
+        If True, the plot will be in log-log scale, by default True
+
+    `save` : bool, optional
+        If True, the plot will be saved in the folder "plots", by default False
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    Due to the characteristics of these datasets, not using a log-log scale leads to an unreadable plot. Even though the log scale alters the shape of the power-law distribution, it remains clearly visible and distinguishable from a Poisson distribution (which is what we are interested in here).
+
     """
 
     degrees = [G.degree(n) for n in G.nodes()]
@@ -303,45 +358,182 @@ def degree_distribution(G: nx.Graph, log: bool = True, save: bool = False) -> No
 
 # ------------------------------------------------------------------------#
 
-def average_clustering(G: nx.Graph) -> float:
+def chunks(l, n):
+    """
+    Auxiliary function to split a list of nodes `l` into chunks of size `n`
 
-    """
-    This function takes in input a networkx graph and returns the average clustering coefficient of the graph.
- """ + Parameters + ---------- + `l` : list + List of nodes - sum_clustering = 0 - for node in G.nodes(): - sum_clustering += nx.clustering(G, node) + `n` : int + Number of chunks - return sum_clustering / G.number_of_nodes() + """ + + l_c = iter(l) + while 1: + x = tuple(itertools.islice(l_c, n)) + if not x: + return + yield x # ------------------------------------------------------------------------# -def watts_strogatz_model(G: nx.Graph, p = 0.1) -> nx.Graph: +def betweenness_centrality_parallel(G, processes=None, k =None) -> dict: + """ + Compute the betweenness centrality for nodes in a graph using multiprocessing. - """ - This function takes in input a networkx graph and a probability p and returns a new graph obtained by applying the Watts-Strogatz model to the input graph. + Parameters + ---------- + G : graph + A networkx graph - It computes k as the average degree of the input graph. - """ + processes : int, optional + The number of processes to use for computation. + If `None`, then it sets processes = 1 + + k : int, optional + Percent of nodes to sample. If `None`, then all nodes are used. + + seed : int, optional + Seed for random number generator (default=None). + + Returns + ------- + dict - k = int(round(np.mean(list(dict(G.degree()).values())))) + Notes + ----- + Do not use more then 6 process for big graphs, otherwise the memory will be full. Do it only if you have more at least 32 GB of RAM. For small graphs, you can use more processes. - G_new = nx.watts_strogatz_graph(G.number_of_nodes(), k, p) - G_new = nx.Graph(G_new) - G_new.name = "watts_strogatz_{}_{}_{}" .format(G.name, p, k) + """ + + # if process is None or 1, run the standard algorithm with one process + if processes is None or processes == 1: + print("\tRunning the networkx approximated algorithm with just one process") + G_copy = G.copy() + sample = int((k)*G_copy.number_of_nodes()) + print("\tNumber of nodes after removing {} % of nodes: {}" .format((k)*100, G_copy.number_of_nodes())) + return np.mean(nx.betweenness_centrality(G, k=sample, seed=42).values()) - return G_new + if processes > os.cpu_count(): + raise ValueError("The number of processes must be less than the number of cores in the system.") -def mean_shortest_path(G: nx.Graph) -> float: + if k is not None: + if (k < 0 or k > 1): + raise ValueError("k must be between 0 and 1.") + else: + G_copy = G.copy() + G_copy.remove_nodes_from(random.sample(G_copy.nodes(), int((k)*G_copy.number_of_nodes()))) + print("\tNumber of nodes after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_nodes())) + print("\tNumber of edges after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_edges())) + + p = Pool(processes=processes) + node_divisor = len(p._pool) * 4 + node_chunks = list(chunks(G_copy.nodes(), G_copy.order() // node_divisor)) + num_chunks = len(node_chunks) + bt_sc = p.starmap( + nx.betweenness_centrality_subset, + zip( + [G_copy] * num_chunks, # this returns a list of Gs + node_chunks, + [list(G_copy)] * num_chunks, # this returns a list of lists of nodes + [True] * num_chunks, + [None] * num_chunks, + ), + ) + + # Reduce the partial solutions + bt_c = bt_sc[0] + for bt in bt_sc[1:]: + for n in bt: + bt_c[n] += bt[n] + + return bt_c + +# ------------------------------------------------------------------------# + +def average_shortest_path(G: nx.Graph, k=None) -> float: """ This function takes in input a networkx graph and returns the average shortest path length of the graph. This works also for disconnected graphs. 
+
+    Parameters
+    ----------
+    `G` : networkx graph
+        The graph to compute the average shortest path length of.
+    `k` : float
+        Fraction of nodes to remove from the graph before computing, between 0 and 1. If k is None, the whole graph is used.
+
+    Returns
+    -------
+    float
+        The mean of the average shortest path lengths of the connected components with more than 10 nodes.
+
+    Raises
+    ------
+    ValueError
+        If k is not between 0 and 1
     """
 
+    if k is not None and (k < 0 or k > 1):
+        raise ValueError("k must be between 0 and 1")
+    elif k is None:
+        G_copy = G
+    else:
+        G_copy = G.copy()
+        # remove a fraction k of the nodes from G
+        G_copy.remove_nodes_from(random.sample(list(G_copy.nodes()), int(k*G_copy.number_of_nodes())))
+        print("\tNumber of nodes after removing {}% of nodes: {}" .format(k*100, G_copy.number_of_nodes()))
+        print("\tNumber of edges after removing {}% of nodes: {}" .format(k*100, G_copy.number_of_edges()))
+
     tmp = 0
-    connected_components = list(nx.connected_components(G))
-    for C in (G.subgraph(c).copy() for c in connected_components):
-        tmp += (nx.average_shortest_path_length(C, method='dijkstra'))
+    connected_components = list(nx.connected_components(G_copy))
+    # discard the connected components with 10 nodes or fewer
+    connected_components = [c for c in connected_components if len(c) > 10]
+
+    print("\tNumber of connected components with more than 10 nodes: {}" .format(len(connected_components)), "\r")
+    for C in (G_copy.subgraph(c).copy() for c in connected_components):
+        print("\tComputing average shortest path length of connected component with {} nodes and {} edges" .format(C.number_of_nodes(), C.number_of_edges()), "\r", end="")
+        tmp += nx.average_shortest_path_length(C)
+
+    return tmp / len(connected_components)
+
+# ------------------------------------------------------------------------#
+
+def average_clustering_coefficient(G: nx.Graph, k=None) -> float:
+
+    """
+    This function takes in input a networkx graph and returns the average clustering coefficient of the graph. This works also for disconnected graphs.
+
+    Parameters
+    ----------
+    G : networkx graph
+        The graph to compute the average clustering coefficient of.
+    k : float
+        Fraction of nodes to remove from the graph before computing, between 0 and 1. If k is None, the average clustering coefficient is computed on the whole graph.
+
+    Returns
+    -------
+    float
+        The average clustering coefficient of the graph.
+
+    Raises
+    ------
+    ValueError
+        If k is not between 0 and 1
+    """
+
+    if k is not None and (k < 0 or k > 1):
+        raise ValueError("k must be between 0 and 1")
+
+    elif k is None:
+        return nx.average_clustering(G)
 
-    return tmp/len(list(connected_components))
+    else:
+        G_copy = G.copy()
+        G_copy.remove_nodes_from(random.sample(list(G_copy.nodes()), int(k*G_copy.number_of_nodes())))
+        print("\tNumber of nodes after removing {}% of nodes: {}" .format(k*100, G_copy.number_of_nodes()))
+        return nx.average_clustering(G_copy)
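
For reference, a minimal sketch of how the new `utils.py` helpers fit together once the data is in place. It assumes `download_datasets()` has already been run; the dataset name and the `k` and `processes` values below are purely illustrative, not the ones used in `main.ipynb`.

```python
from utils import (create_graph_from_checkins, create_friendships_graph,
                   degree_distribution, betweenness_centrality_parallel,
                   average_shortest_path, average_clustering_coefficient)

# build the check-ins and friendship graphs for one dataset
G_checkins = create_graph_from_checkins('brightkite')
G_friends = create_friendships_graph('brightkite')

# degree distribution in log-log scale
degree_distribution(G_checkins, log=True)

# approximated measures: here k is the fraction of nodes dropped before computing
print("Average clustering:", average_clustering_coefficient(G_friends, k=0.6))
print("Average shortest path:", average_shortest_path(G_friends, k=0.6))

# parallel betweenness centrality; keep the number of processes low on big graphs
bc = betweenness_centrality_parallel(G_friends, processes=4, k=0.6)
print("Mean betweenness centrality:", sum(bc.values()) / len(bc))
```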