From a4c7a9f468855d6e01c4201d2c4ebdb5c5e2c074 Mon Sep 17 00:00:00 2001 From: Luca Lombardo Date: Thu, 23 Feb 2023 23:51:42 +0100 Subject: [PATCH] last changes and small fixes --- main.ipynb | 1665 +++++++++++++++++++++++++++------------------------- utils.py | 82 +-- 2 files changed, 894 insertions(+), 853 deletions(-) diff --git a/main.ipynb b/main.ipynb index e7ad565..3111935 100644 --- a/main.ipynb +++ b/main.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -100,6 +100,12 @@ "source": [ "# [Introduction: theoretical background](#toc0_)\n", "\n", + "Network science aims to build models that reproduce the properties of real networks. Most networks we encounter do not have the comforting regularity of a crystal lattice or the predictable radial architecture of a spider web. \n", + "\n", + "Rather, at first inspection they look as if they were spun randomly. Random network theory embraces this apparent randomness by constructing and characterizing networks that are truly random.\n", + "\n", + "From a modeling perspective a network is a relatively simple object, consisting of only nodes and links. The real challenge, however, is to decide where to place the links between the nodes so that we reproduce the complexity of a real system.\n", + "\n", "Prior to the 1960s, graph theory primarily focused on the characteristics of individual graphs. In the 1960s, Paul Erdős and Alfred Rényi introduced a systematic approach to study random graphs, which involves analyzing a collection, or ensemble, of many different graphs. \n", "\n", "Each graph in the ensemble is assigned a probability, and a property is said to hold with probability $P$ if the total probability of the graphs in the ensemble possessing that property is $P$, or if the fraction of graphs in the ensemble with the property is $P$. 
This method allows for the application of probability theory in conjunction with discrete math to study ensembles of graphs. A property is considered to hold for a class of graphs if the fraction of graphs in the ensemble without the property has zero measure, which is typically referred to as being true for \"almost every\" graph in the ensemble. `[2]`" @@ -119,7 +125,7 @@ "Let's take a different approach and use some probability theory. We can consider the formation of a random graph as a stochastic process. This process is defined as follows: at time $t=1$, we pick one of the $\\binom{n}{2}$ potential edges between $V_1, V_2, \\dots, V_n$, each with the same probability of being chosen; we designate this edge as $e_1$. At time $t=2$, we choose one of the $\\binom{n}{2}-1$ remaining possibilities, excluding $e_1$, each with equal probability. This process continues at time $t=k+1$ where we choose one of the remaining $\\binom{n}{2} - k$ possibilities, excluding $e_1, e_2, \\dots, e_k$, each with probability $\\frac{1}{\\binom{n}{2} - k}$. The graph obtained by selecting $N$ edges in this manner is denoted as $\\Gamma_{n,N}$.\n", "\n", "\n", - "> NOTE: the two definitions are equivalent, but the second one is more convenient for the study of the properties of random graphs. According to this interpretation we may study the evolution of random graphs, i.e. the step-by-step unraveling of the structure of the graph when $N$ increases. This will be an essential point in our study of the properties of small-worldness." + "> NOTE: the two definitions are equivalent, but the second one is more convenient for the study of the properties of random graphs. According to this interpretation we may study the evolution of random graphs, i.e. the step-by-step unraveling of the structure of the graph when $N$ increases." 
] }, { @@ -779,7 +785,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -794,7 +800,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 84831/84831 [00:00<00:00, 310564.31it/s]\n" + "100%|██████████| 84831/84831 [00:00<00:00, 269549.94it/s]\n" ] }, { @@ -810,7 +816,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 31095/31095 [00:00<00:00, 331735.68it/s]\n" + "100%|██████████| 31095/31095 [00:00<00:00, 346067.81it/s]\n" ] }, { @@ -826,7 +832,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 40650/40650 [00:00<00:00, 147409.04it/s]\n" + "100%|██████████| 40650/40650 [00:00<00:00, 148202.44it/s]\n" ] }, { @@ -864,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -900,7 +906,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -959,7 +965,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -971,7 +977,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1047,7 +1053,7 @@ " 3\n", " Brightkite Friendship Graph\n", " 1500\n", - " 1072\n", + " 1167\n", " NaN\n", " NaN\n", " 7.313220\n", @@ -1059,7 +1065,7 @@ " 4\n", " Gowalla Friendship Graph\n", " 1500\n", - " 2309\n", + " 2318\n", " NaN\n", " NaN\n", " 7.313220\n", @@ -1088,8 +1094,8 @@ "0 Brightkite Checkins Graph 6493 292973 NaN \n", "1 Gowalla Checkins Graph 3073 62790 NaN \n", "2 Foursquare Checkins Graph 2324 246702 NaN \n", - "3 Brightkite Friendship Graph 1500 1072 NaN \n", - "4 Gowalla Friendship Graph 1500 2309 NaN \n", + "3 Brightkite Friendship Graph 1500 1167 NaN \n", + "4 Gowalla Friendship Graph 1500 2318 NaN \n", "5 Foursquare Friendship Graph 1397 5323 NaN \n", "\n", " Average Clustering 
Coefficient log N Average Shortest Path Length \\\n", @@ -1109,7 +1115,7 @@ "5 NaN NaN " ] }, - "execution_count": 24, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1139,7 +1145,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1186,12 +1192,12 @@ " \n", " 3\n", " Brightkite Friendship Graph\n", - " 1.429333\n", + " 1.556\n", " \n", " \n", " 4\n", " Gowalla Friendship Graph\n", - " 3.078667\n", + " 3.090667\n", " \n", " \n", " 5\n", @@ -1207,12 +1213,12 @@ "0 Brightkite Checkins Graph 90.242723\n", "1 Gowalla Checkins Graph 40.865604\n", "2 Foursquare Checkins Graph 212.30809\n", - "3 Brightkite Friendship Graph 1.429333\n", - "4 Gowalla Friendship Graph 3.078667\n", + "3 Brightkite Friendship Graph 1.556\n", + "4 Gowalla Friendship Graph 3.090667\n", "5 Foursquare Friendship Graph 7.620616" ] }, - "execution_count": 25, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1258,7 +1264,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1276,14 +1282,14 @@ "\n", "Computing average clustering coefficient for the Foursquare Checkins Graph...\n", "\tAverage clustering coefficient: 0.6527297407924693\n", - "\tCPU time: 17.5 seconds\n", + "\tCPU time: 17.4 seconds\n", "\n", "Computing average clustering coefficient for the Brightkite Friendship Graph...\n", - "\tAverage clustering coefficient: 0.07238126177648738\n", + "\tAverage clustering coefficient: 0.09809245678251205\n", "\tCPU time: 0.0 seconds\n", "\n", "Computing average clustering coefficient for the Gowalla Friendship Graph...\n", - "\tAverage clustering coefficient: 0.15971676222947884\n", + "\tAverage clustering coefficient: 0.17256509266979794\n", "\tCPU time: 0.0 seconds\n", "\n", "Computing average clustering coefficient for the Foursquare Friendship Graph...\n", @@ -1335,12 +1341,12 @@ " \n", " 3\n", " Brightkite 
Friendship Graph\n", - " 0.072381\n", + " 0.098092\n", " \n", " \n", " 4\n", " Gowalla Friendship Graph\n", - " 0.159717\n", + " 0.172565\n", " \n", " \n", " 5\n", @@ -1356,12 +1362,12 @@ "0 Brightkite Checkins Graph 0.713999\n", "1 Gowalla Checkins Graph 0.548372\n", "2 Foursquare Checkins Graph 0.65273\n", - "3 Brightkite Friendship Graph 0.072381\n", - "4 Gowalla Friendship Graph 0.159717\n", + "3 Brightkite Friendship Graph 0.098092\n", + "4 Gowalla Friendship Graph 0.172565\n", "5 Foursquare Friendship Graph 0.183485" ] }, - "execution_count": 26, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1450,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1460,44 +1466,44 @@ "\n", "Computing average shortest path length for graph: Brightkite Checkins Graph\n", "\tNumber of nodes after removing 50.0% of nodes: 3247\n", - "\tNumber of edges after removing 50.0% of nodes: 72733\n", + "\tNumber of edges after removing 50.0% of nodes: 76445\n", "\tNumber of connected components with more then 10 nodes: 1 \n", - "\tAverage shortest path length: 3.12ngth of connected component with 2925 nodes and 72656 edges \n", + "\tAverage shortest path length: 3.12ngth of connected component with 2934 nodes and 76386 edges \n", "\tCPU time: 14.9 seconds\n", "\n", "Computing average shortest path length for graph: Gowalla Checkins Graph\n", "\tNumber of nodes after removing 50.0% of nodes: 1537\n", - "\tNumber of edges after removing 50.0% of nodes: 15177\n", + "\tNumber of edges after removing 50.0% of nodes: 15534\n", "\tNumber of connected components with more then 10 nodes: 1 \n", - "\tAverage shortest path length: 3.73ngth of connected component with 1351 nodes and 15146 edges \n", + "\tAverage shortest path length: 3.77ngth of connected component with 1346 nodes and 15509 edges \n", "\tCPU time: 2.3 seconds\n", "\n", "Computing average shortest path length for graph: Foursquare 
Checkins Graph\n", "\tNumber of nodes after removing 50.0% of nodes: 1162\n", - "\tNumber of edges after removing 50.0% of nodes: 58703\n", + "\tNumber of edges after removing 50.0% of nodes: 64206\n", "\tNumber of connected components with more then 10 nodes: 1 \n", - "\tAverage shortest path length: 2.21ngth of connected component with 1109 nodes and 58698 edges \n", - "\tCPU time: 3.7 seconds\n", + "\tAverage shortest path length: 2.19ngth of connected component with 1110 nodes and 64202 edges \n", + "\tCPU time: 3.9 seconds\n", "\n", "Computing average shortest path length for graph: Brightkite Friendship Graph\n", "\tNumber of nodes after removing 50.0% of nodes: 750\n", - "\tNumber of edges after removing 50.0% of nodes: 212\n", - "\tNumber of connected components with more then 10 nodes: 3 \n", - "\tAverage shortest path length: 11.69gth of connected component with 24 nodes and 27 edges \n", + "\tNumber of edges after removing 50.0% of nodes: 308\n", + "\tNumber of connected components with more then 10 nodes: 4 \n", + "\tAverage shortest path length: 12.46gth of connected component with 11 nodes and 14 edges s \n", "\tCPU time: 0.0 seconds\n", "\n", "Computing average shortest path length for graph: Gowalla Friendship Graph\n", "\tNumber of nodes after removing 50.0% of nodes: 750\n", - "\tNumber of edges after removing 50.0% of nodes: 580\n", - "\tNumber of connected components with more then 10 nodes: 3 \n", - "\tAverage shortest path length: 11.87gth of connected component with 11 nodes and 11 edges s \n", - "\tCPU time: 0.0 seconds\n", + "\tNumber of edges after removing 50.0% of nodes: 526\n", + "\tNumber of connected components with more then 10 nodes: 2 \n", + "\tAverage shortest path length: 8.22ngth of connected component with 11 nodes and 15 edges s \n", + "\tCPU time: 0.1 seconds\n", "\n", "Computing average shortest path length for graph: Foursquare Friendship Graph\n", "\tNumber of nodes after removing 50.0% of nodes: 699\n", - "\tNumber of 
edges after removing 50.0% of nodes: 1636\n", + "\tNumber of edges after removing 50.0% of nodes: 1332\n", "\tNumber of connected components with more then 10 nodes: 1 \n", - "\tAverage shortest path length: 3.74ngth of connected component with 468 nodes and 1563 edges \n", + "\tAverage shortest path length: 4.17ngth of connected component with 441 nodes and 1265 edges \n", "\tCPU time: 0.2 seconds\n" ] }, @@ -1530,32 +1536,32 @@ " \n", " 0\n", " Brightkite Checkins Graph\n", - " 3.117784\n", + " 3.117627\n", " \n", " \n", " 1\n", " Gowalla Checkins Graph\n", - " 3.726131\n", + " 3.767501\n", " \n", " \n", " 2\n", " Foursquare Checkins Graph\n", - " 2.213298\n", + " 2.187581\n", " \n", " \n", " 3\n", " Brightkite Friendship Graph\n", - " 11.694492\n", + " 12.464348\n", " \n", " \n", " 4\n", " Gowalla Friendship Graph\n", - " 11.865207\n", + " 8.218571\n", " \n", " \n", " 5\n", " Foursquare Friendship Graph\n", - " 3.739097\n", + " 4.170243\n", " \n", " \n", "\n", @@ -1563,15 +1569,15 @@ ], "text/plain": [ " Graph Average Shortest Path Length\n", - "0 Brightkite Checkins Graph 3.117784\n", - "1 Gowalla Checkins Graph 3.726131\n", - "2 Foursquare Checkins Graph 2.213298\n", - "3 Brightkite Friendship Graph 11.694492\n", - "4 Gowalla Friendship Graph 11.865207\n", - "5 Foursquare Friendship Graph 3.739097" + "0 Brightkite Checkins Graph 3.117627\n", + "1 Gowalla Checkins Graph 3.767501\n", + "2 Foursquare Checkins Graph 2.187581\n", + "3 Brightkite Friendship Graph 12.464348\n", + "4 Gowalla Friendship Graph 8.218571\n", + "5 Foursquare Friendship Graph 4.170243" ] }, - "execution_count": 27, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1617,7 +1623,25 @@ "\n", "However, the computation of this algorithm on large networks may not be feasible within a reasonable time frame due to the computational cost. To mitigate this issue, a sampling approach can be employed. 
Nevertheless, even with heavy sampling, the computation time remains prohibitively high. To avoid further sampling, which would introduce bias, we will use a parallelization approach to speed up the computation.\n", "\n", - "In the `utils` module, I have implemented a function called `betweenness_centrality_parallel` that uses this approach. The function takes as input a networkx graph object, the number of processes to use for computation (default is 1, which uses the standard betweenness algorithm), and the percentage of nodes to remove from the graph (default is `None`, which uses all nodes of the connected component to compute the average shortest path length). The function divides the network into _chunks_ of nodes and computes their contribution to the betweenness centrality of the whole network in parallel, ultimately returning a dictionary of the betweenness centrality of each node.\n", + "In the `utils` module, I have implemented a function called `betweenness_centrality_parallel` that uses this approach. Besides the graph itself, the function takes three optional arguments: `processes`, `k`, and `seed`. The function returns a dictionary with nodes as keys and their betweenness centrality as values.\n", + "\n", + "Here's a detailed explanation of how the function works:\n", + "\n", + "* Check if the `processes` argument is `None` or `1`. If it is, the function runs the standard networkx betweenness centrality algorithm with just one process. Otherwise, it proceeds to the next step.\n", + "\n", + "* Check if the `processes` argument is greater than the number of CPU cores in the system. If it is, the function raises a `ValueError`. Otherwise, it proceeds to the next step.\n", + "\n", + "* Check if the `k` argument is `None`. If it is, the function makes a copy of the original graph and proceeds to the next step. 
Otherwise, it removes a random subset of nodes from the graph according to the value of `k`.\n", + "\n", + "* Create a pool of `processes` worker processes using Python's `multiprocessing` module.\n", + "\n", + "* Divide the nodes in the graph into `node_divisor` chunks, where `node_divisor` is four times the number of processes. This is to ensure that each process has an equal amount of work to do.\n", + "\n", + "* Run the `betweenness_centrality_subset()` function from the networkx library on each chunk of nodes using the `starmap()` method of the `Pool` object. For each chunk, `betweenness_centrality_subset()` computes that chunk's contribution to the betweenness centrality of the nodes of the graph.\n", + "\n", + "* Reduce the partial solutions obtained from each process by summing up the betweenness centralities of each node in the final dictionary.\n", + "\n", + "* Return the dictionary with nodes as keys and their betweenness centrality as values.\n", "\n", "Please note that for large graphs, it is advisable to not use more than 6 processes to avoid memory constraints. The number of processes to use can be determined based on the available time and the machine being used. For small graphs, more processes may be used. As for the percentage of nodes to remove, lower values provide more precise results but take longer to compute, while higher values result in less precise results but are faster to compute. It is suggested to start with `k=0.6` for a quick test and use `k=0.2` for a more precise result. For more information, refer to the function code in the `utils` module." ] @@ -1633,52 +1657,40 @@ "text": [ "\n", "Computing the approximate betweenness centrality for the Brightkite Checkins Graph...\n", - "\n", - "Graph is not connected. 
Taking the largest connected component\n", - "Number of nodes in the sampled graph: 3895\n", - "Number of edges in the sampled graph: 109595\n", - "\tBetweenness centrality: 0.000571670929493879 \n", - "\tCPU time: 98.5 seconds\n", + "\tNumber of nodes after removing 50.0% of nodes: 3247\n", + "\tNumber of edges after removing 50.0% of nodes: 74427\n", + "\tBetweenness centrality: 0.0005509672600241676 \n", + "\tCPU time: 15.4 seconds\n", "\n", "Computing the approximate betweenness centrality for the Gowalla Checkins Graph...\n", - "\n", - "Graph is not connected. Taking the largest connected component\n", - "Number of nodes in the sampled graph: 1843\n", - "Number of edges in the sampled graph: 23885\n", - "\tBetweenness centrality: 0.0015210855257160798 \n", - "\tCPU time: 12.7 seconds\n", + "\tNumber of nodes after removing 50.0% of nodes: 1537\n", + "\tNumber of edges after removing 50.0% of nodes: 15547\n", + "\tBetweenness centrality: 0.0013088595012846352 \n", + "\tCPU time: 2.5 seconds\n", "\n", "Computing the approximate betweenness centrality for the Foursquare Checkins Graph...\n", - "\n", - "Graph is not connected. Taking the largest connected component\n", - "Number of nodes in the sampled graph: 1394\n", - "Number of edges in the sampled graph: 86059\n", - "\tBetweenness centrality: 0.0009135778105161979 \n", - "\tCPU time: 30.7 seconds\n", + "\tNumber of nodes after removing 50.0% of nodes: 1162\n", + "\tNumber of edges after removing 50.0% of nodes: 61418\n", + "\tBetweenness centrality: 0.0009345550376554907 \n", + "\tCPU time: 5.0 seconds\n", "\n", "Computing the approximate betweenness centrality for the Brightkite Friendship Graph...\n", - "\n", - "Graph is not connected. 
Taking the largest connected component\n", - "Number of nodes in the sampled graph: 900\n", - "Number of edges in the sampled graph: 381\n", - "\tBetweenness centrality: 0.024814375935463612 \n", + "\tNumber of nodes after removing 50.0% of nodes: 750\n", + "\tNumber of edges after removing 50.0% of nodes: 259\n", + "\tBetweenness centrality: 2.5155346760624384e-05 \n", "\tCPU time: 0.3 seconds\n", "\n", "Computing the approximate betweenness centrality for the Gowalla Friendship Graph...\n", - "\n", - "Graph is not connected. Taking the largest connected component\n", - "Number of nodes in the sampled graph: 900\n", - "Number of edges in the sampled graph: 797\n", - "\tBetweenness centrality: 0.014602979710672098 \n", + "\tNumber of nodes after removing 50.0% of nodes: 750\n", + "\tNumber of edges after removing 50.0% of nodes: 563\n", + "\tBetweenness centrality: 0.00125608238197573 \n", "\tCPU time: 0.4 seconds\n", "\n", "Computing the approximate betweenness centrality for the Foursquare Friendship Graph...\n", - "\n", - "Graph is not connected. 
Taking the largest connected component\n", - "Number of nodes in the sampled graph: 838\n", - "Number of edges in the sampled graph: 1860\n", - "\tBetweenness centrality: 0.00518979036959232 \n", - "\tCPU time: 0.8 seconds\n" + "\tNumber of nodes after removing 50.0% of nodes: 699\n", + "\tNumber of edges after removing 50.0% of nodes: 1292\n", + "\tBetweenness centrality: 0.0015709401669892229 \n", + "\tCPU time: 0.4 seconds\n" ] }, { @@ -1710,32 +1722,32 @@ " \n", " 0\n", " Brightkite Checkins Graph\n", - " 0.000572\n", + " 0.000551\n", " \n", " \n", " 1\n", " Gowalla Checkins Graph\n", - " 0.001521\n", + " 0.001309\n", " \n", " \n", " 2\n", " Foursquare Checkins Graph\n", - " 0.000914\n", + " 0.000935\n", " \n", " \n", " 3\n", " Brightkite Friendship Graph\n", - " 0.024814\n", + " 0.000025\n", " \n", " \n", " 4\n", " Gowalla Friendship Graph\n", - " 0.014603\n", + " 0.001256\n", " \n", " \n", " 5\n", " Foursquare Friendship Graph\n", - " 0.00519\n", + " 0.001571\n", " \n", " \n", "\n", @@ -1743,12 +1755,12 @@ ], "text/plain": [ " Graph betweenness centrality\n", - "0 Brightkite Checkins Graph 0.000572\n", - "1 Gowalla Checkins Graph 0.001521\n", - "2 Foursquare Checkins Graph 0.000914\n", - "3 Brightkite Friendship Graph 0.024814\n", - "4 Gowalla Friendship Graph 0.014603\n", - "5 Foursquare Friendship Graph 0.00519" + "0 Brightkite Checkins Graph 0.000551\n", + "1 Gowalla Checkins Graph 0.001309\n", + "2 Foursquare Checkins Graph 0.000935\n", + "3 Brightkite Friendship Graph 0.000025\n", + "4 Gowalla Friendship Graph 0.001256\n", + "5 Foursquare Friendship Graph 0.001571" ] }, "execution_count": 20, @@ -1760,7 +1772,7 @@ "for graph in graphs_all:\n", " print(\"\\nComputing the approximate betweenness centrality for the {}...\".format(graph.name))\n", " start = time.time()\n", - " betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6, k = 0.4).values()))\n", + " betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 
8, k = 0.5).values()))\n", " end = time.time()\n", " print(\"\\tBetweenness centrality: {} \".format(betweenness_centrality))\n", " print(\"\\tCPU time: \" + str(round(end-start,1)) + \" seconds\")\n", @@ -1781,7 +1793,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1912,7 +1924,7 @@ "5 0.001803 " ] }, - "execution_count": 33, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1953,7 +1965,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -3878,9 +3890,9 @@ } }, "text/html": [ - "