diff --git a/.gitignore b/.gitignore index 1756853..57eeefa 100644 --- a/.gitignore +++ b/.gitignore @@ -146,5 +146,4 @@ data/ .vscode/ backup/ sources/ -testing.ipynb extra/ diff --git a/main.ipynb b/main.ipynb index f451d4e..182b50f 100644 --- a/main.ipynb +++ b/main.ipynb @@ -229,7 +229,7 @@ "metadata": {}, "outputs": [], "source": [ - "# download_datasets()" + "download_datasets()" ] }, { @@ -342,7 +342,7 @@ "\n", "gdf_brightkite.plot(marker='o', color='blue', markersize=1)\n", "\n", - "# update the pandas datafram with the new values\n", + "# update the pandas dataframe with the new values\n", "df_brighkite = gdf_brightkite\n", "print(\"Number of unique users in Europe: \", len(df_brighkite['user id'].unique()))\n", "\n", @@ -508,11 +508,11 @@ "\n", "[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 22 months from Apr. 2012 to Jan. 2014) global-scale check-in data collected from Foursquare, and also two snapshots of user social networks before and after the check-in data collection period (see more details in our paper). We will work with three different datasets:\n", "\n", - "- `data/foursquare/foursquare_checkins.txt`: a tsv file with 4 columns: `User ID`, `Venue ID`, `UTC time`, `Timezone offset in minutes` \n", + "- `foursquare_checkins.txt`: a tsv file with 4 columns: `User ID`, `Venue ID`, `UTC time`, `Timezone offset in minutes` \n", "\n", - "- `data/foursquare/foursquare_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids. This is in the form of a graph edge list. \n", + "- `foursquare_friends_edges.txt`: the friendship network, a tsv file with 2 columns of user ids. This is in the form of a graph edge list. \n", + "\n", - "- `data/foursquare/raw_POIs.txt`: the POIS, a tsv file with 5 columns: `Venue ID`, `Latitude`, `Longitude`, `Venue category name`, `Country code (ISO)`.\n", + "- `raw_POIs.txt`: the POIs, a tsv file with 5 columns: `Venue ID`, `Latitude`, `Longitude`, `Venue category name`, `Country code (ISO)`.\n", "\n", "--- \n", "\n", @@ -10385,12 +10385,13 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "As we can clearly see from the graphs obtained, the degree distribution of the networks is not Poissonian, but rather scale-free. This is a good indication that the networks are not random, but rather small-world.\n", "\n", - "Let's try to plot the distribution degree of a random Erdos-Renyi graph with the same number of nodes and a probability of edge creation equal to the number of edges of the network divided by the number of possible edges. We expect to see a Poissonian distribution.\n", + "Let's try to plot the degree distribution of a random Watts-Strogatz graph with the same number of nodes and a rewiring probability equal to the number of edges of the network divided by the number of possible edges. We expect to see a homogeneous, Poisson-like distribution concentrated around the average degree.\n", "\n", "> This is a time saving approach, NOT a rigorous one. If we want to be rigorous, should follow the algorithm proposed by Maslov and Sneppen, implemented in the the networkx function `random_reference`." ] }, { @@ -13368,17 +13369,13 @@ } ], "source": [ - "# for each network, create a erdos-renyi model of the original. 
If you want to test it with the watts-strogatz model, uncomment the code below and comment the first 2 lines of the for loop\n", - "\n", "for graph in checkins_graphs:\n", "\n", " p = G.number_of_edges() / (G.number_of_nodes())\n", " avg_degree = int(np.mean([d for n, d in G.degree()]))\n", " G = nx.watts_strogatz_graph(G.number_of_nodes(), avg_degree, p)\n", - " G.name = graph.name + \" Watts-Strogatz\"\n", + " G.name = graph.name + \" - Watts-Strogatz similarity\"\n", - " # G = nx.erdos_renyi_graph(graph.number_of_nodes(), nx.density(graph))\n", - " # G.name = graph.name + \" Erdos-Renyi\"\n", " print(G.name)\n", " print(\"Number of nodes: \", G.number_of_nodes())\n", " print(\"Number of edges: \", G.number_of_edges())\n", @@ -16363,17 +16360,13 @@ } ], "source": [ - "# for each network, create a erdos-renyi model of the original graph. If you want to test it with the watts-strogatz model, uncomment the code below and comment the first 2 lines of the for loop\n", - "\n", "for graph in friendships_graph:\n", "\n", " p = G.number_of_edges() / (G.number_of_nodes())\n", " avg_degree = int(np.mean([d for n, d in G.degree()]))\n", " G = nx.watts_strogatz_graph(G.number_of_nodes(), avg_degree, p)\n", - " G.name = graph.name + \" Watts-Strogatz\"\n", + " G.name = graph.name + \" - Watts-Strogatz similarity\"\n", - " # G = nx.erdos_renyi_graph(graph.number_of_nodes(), nx.density(graph))\n", - " # G.name = graph.name + \" Erdos-Renyi\" \n", "\n", " print(G.name)\n", " print(\"Number of nodes: \", G.number_of_nodes())\n", @@ -16397,7 +16390,7 @@ "source": [ "## The Small-World Model\n", "\n", - "It should be clarified that real networks are not random. Their formation and development are dictated by a combination of many different processes and influences. These influencing conditions include natural limitations and processes, human considerations such as optimal performance and robustness, economic considerations, natural selection and many others. Controversies still exist regarding the measure to which random models represent real-world networks. However, in this section we will focus on random network models and attempt to show if their properties may still be used to study properties of our real-world networks. \n", + "Let's start by clarifying that real networks are not random. Their formation and development are dictated by a combination of many different processes and influences. These influencing conditions include natural limitations and processes, human considerations such as optimal performance and robustness, economic considerations, natural selection and many others. Controversies still exist regarding the extent to which random models represent real-world networks. However, in this section we will focus on random network models and attempt to show whether their properties may still be used to study the properties of our real-world networks. \n", "\n", "Many real-world networks have many properties that cannot be explained by the ER model. One such property is the high clustering observed in many real-world networks. This led Watts and Strogatz to develop an alternative model, called the “small-world” model. Quoting their paper:\n", "\n", @@ -16427,7 +16420,7 @@ "\n", "## Identifying small-world networks\n", "\n", - "Small-world networks are distinguished from other networks by two specific properties, the first being high clustering (C) among nodes. High clustering supports specialization as local collections of strongly interconnected nodes readily share information or resources. 
Conceptually, clustering is quite straightforward to comprehend. In a real-world analogy, clustering represents the probability that one’s friends are also friends of each other. Small-world networks also have short path lengths (L) as is commonly observed in random networks. Path length is a measure of the distance between nodes in the network, calculated as the mean of the shortest geodesic distances between all possible node pairs. Small values of $L$ ensure that information or resources easily spreads throughout the network. This property makes distributed information processing possible on technological networks and supports the six degrees of separation often reported in social networks.\n", + "Small-world networks are distinguished from other networks by two specific properties, the first being high clustering ($C$) among nodes. High clustering supports specialization as local collections of strongly interconnected nodes readily share information or resources. Conceptually, clustering is quite straightforward to comprehend. In a real-world analogy, clustering represents the probability that one’s friends are also friends of each other. Small-world networks also have short path lengths ($L$) as is commonly observed in random networks. The path length is a measure of the distance between nodes in the network, calculated as the mean of the shortest geodesic distances between all possible node pairs. Small values of $L$ ensure that information or resources easily spread throughout the network. This property makes distributed information processing possible on technological networks and supports the six degrees of separation often reported in social networks.\n", "\n", "Watts and Strogatz developed a network model (WS model) that resulted in the first-ever networks with clustering close to that of a lattice and path lengths similar to those of random networks. The WS model demonstrates that random rewiring of a small percentage of the edges in a lattice results in a precipitous decrease in the path length, but only trivial reductions in the clustering. Across this rewiring probability, there is a range where the discrepancy between clustering and path length is very large, and it is in this area that the benefits of small-world networks are realized.\n", "\n", @@ -16452,69 +16445,10 @@ "\n", "#### Limitations\n", "\n", - "The length of time it takes to generate lattice networks, particularly for large networks.Although\n", - "latticization is fast in smaller networks, large networks such as functional brain networks and the Internet can take several\n", - "hours to generate and optimize. The latticization procedure described here uses an algorithm developed by Sporns and\n", + "One limitation is the length of time it takes to generate lattice networks, particularly for large networks. Although latticization is fast in smaller networks, large networks such as functional brain networks and the Internet can take several days to generate and optimize. The latticization procedure described here uses an algorithm developed by Sporns and\n", "Zwi in 2004, but the algorithm was used on much smaller datasets. \n", "\n", - "Furthermore, $\omega$ is limited by networks that have very low clustering that cannot be appreciably increased, such as networks with ‘‘super hubs’’ or hierarchical networks. In hierarchical networks, the nodes are often configured in branches\n", - "that contain little to no clustering. 
In networks with ‘‘super hubs,’’ the network may contain a hub that has a node with\n", - "a degree that is several times in magnitude greater than the next most connected hub. In both these networks, there are\n", - "fewer configurations to increase the clustering of the network. Moreover, in a targeted assault of these networks, the topology is easily destroyed (Albert et al., 2000). Such vulnerability to attack signifies a network that may not be small-world." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_omega(graph):\n", - " if not nx.is_connected(graph):\n", - " tmp = max(nx.connected_components(graph), key=len)\n", - " graph = graph.subgraph(tmp)\n", - " # omega = nx.omega(graph, niter=2, nrand=2)\n", - " C = analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'].values[0]\n", - " print(\"Average clustering coefficient for the original graph: \", C)\n", - " L = analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'].values[0]\n", - " print(\"Average shortest path length for the original graph: \", L)\n", - "\n", - " Cr = 0\n", - " Lr = 0\n", - " for i in range(2):\n", - " print(\"\\nIteration: \", i)\n", - " G_rand = nx.random_reference(graph, niter=1, connectivity=True, seed=42)\n", - " print(\"\\tRandom graph created\")\n", - " Cr += nx.average_clustering(G_rand)\n", - " print(\"\\tAverage clustering coefficient computed: \", Cr)\n", - "\n", - " G_latt = nx.lattice_reference(graph, niter=1, seed=42)\n", - " print(\"\\tLattice graph created\")\n", - " Lr += average_shortest_path(G_latt)\n", - " print(\"\\tAverage shortest path computed: \", Lr)\n", - " \n", - " Cr = Cr/2\n", - " Lr = Lr/2\n", - " omega = C/Cr - L/Lr\n", - "\n", - " print(\"Omega coefficient for graph {}: {}\".format(graph.name, omega))\n", - " return (graph.name, omega)\n", - "\n", - "# Create a dataframe to store the results\n", - "omegas = pd.DataFrame(columns=['Graph', 'Omega Coefficient'])\n", - "\n", - "# Set the number of processes to 4\n", - "with multiprocessing.Pool(3) as p:\n", - " # Map the compute_omega function to the list of input graph objects\n", - " results = p.map(compute_omega, checkins_graphs)\n", - " print(results)\n", - "\n", - "# Store the results in the dataframe\n", - "for result in results:\n", - " omegas.loc[omegas['Graph'] == result[0], 'Omega Coefficient'] = result[1]\n", - "\n", - "omegas\n", - "omegas.to_pickle('omegas.pkl')" + "Furthermore, $\omega$ is limited by networks that have very low clustering that cannot be appreciably increased, such as networks with 'super hubs' or hierarchical networks. In hierarchical networks, the nodes are often configured in branches that contain little to no clustering. In networks with 'super hubs', the network may contain a hub node whose degree is several times greater in magnitude than that of the next most connected hub. In both these networks, there are fewer configurations to increase the clustering of the network. Moreover, in a targeted assault of these networks, the topology is easily destroyed (Albert et al., 2000). Such vulnerability to attack signifies a network that may not be small-world." 
] } ], diff --git a/testing.ipynb b/testing.ipynb index f9b9c15..a4d4859 100644 --- a/testing.ipynb +++ b/testing.ipynb @@ -22,202 +22,12 @@ "import time\n", "import geopandas as gpd\n", "import gdown # for downloading files from google drive\n", - "\n", + "import shutil\n", "# ignore warnings\n", "import warnings\n", "import sys\n", "warnings.filterwarnings(\"ignore\")" ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def download_dataTMPsets():\n", - "\n", - " dict = {\n", - " \"brightkite\": [\"https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz\"], \n", - " \"gowalla\": [\"https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz\"], \n", - " \"foursquare\": \"https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing\"}\n", - "\n", - " if not os.path.exists(\"dataTMP\"):\n", - " os.mkdir(\"dataTMP\")\n", - " print(\"Created dataTMP folder\")\n", - "\n", - " for folder in dict.keys():\n", - " if not os.path.exists(os.path.join(\"dataTMP\", folder)):\n", - " os.mkdir(os.path.join(\"dataTMP\", folder))\n", - " print(\"Created {} folder\".format(folder))\n", - "\n", - " for folder in dict.keys():\n", - " for url in dict[folder]:\n", - " if folder == \"foursquare\":\n", - " if not os.path.exists(os.path.join(\"dataTMP\", folder, \"foursquare_full.zip\")):\n", - " output = os.path.join(\"dataTMP\", folder, \"foursquare_full.zip\")\n", - " gdown.download(url, output, quiet=False, fuzzy=True)\n", - " else :\n", - " print(\"{} already downloaded\".format(url))\n", - " else:\n", - " if not os.path.exists(os.path.join(\"dataTMP\", folder, url.split(\"/\")[-1])):\n", - " print(\"Downloading {}...\".format(url))\n", - " wget.download(url, os.path.join(\"dataTMP\", folder))\n", - " else :\n", - " print(\"{} already downloaded\".format(url))\n", - "\n", - " for folder in dict.keys():\n", - " for file in os.listdir(os.path.join(\"dataTMP\", folder)):\n", - " if file.endswith(\".gz\"):\n", - " print(\"Unzipping {}...\".format(file))\n", - " os.system(\"gunzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n", - " elif file.endswith(\".zip\"):\n", - " print(\"Unzipping {}...\".format(file))\n", - " os.system(\"unzip {}\".format(os.path.join(\"dataTMP\", folder, file)))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Created dataTMP folder\n", - "Created brightkite folder\n", - "Created gowalla folder\n", - "Created foursquare folder\n", - "Downloading https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz...\n", - "Downloading https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz...\n", - "Downloading https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz...\n", - "Downloading https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz...\n" - ] - }, - { - "ename": "MissingSchema", - "evalue": "Invalid URL 'h': No scheme supplied. 
Perhaps you meant http://h?", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mMissingSchema\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m download_dataTMPsets()\n", - "Cell \u001b[0;32mIn[8], line 22\u001b[0m, in \u001b[0;36mdownload_dataTMPsets\u001b[0;34m()\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mexists(os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39m\"\u001b[39m\u001b[39mdataTMP\u001b[39m\u001b[39m\"\u001b[39m, folder, \u001b[39m\"\u001b[39m\u001b[39mfoursquare_full.zip\u001b[39m\u001b[39m\"\u001b[39m)):\n\u001b[1;32m 21\u001b[0m output \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39m\"\u001b[39m\u001b[39mdataTMP\u001b[39m\u001b[39m\"\u001b[39m, folder, \u001b[39m\"\u001b[39m\u001b[39mfoursquare_full.zip\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m---> 22\u001b[0m gdown\u001b[39m.\u001b[39;49mdownload(url, output, quiet\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, fuzzy\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 23\u001b[0m \u001b[39melse\u001b[39;00m :\n\u001b[1;32m 24\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m already downloaded\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(url))\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/gdown/download.py:158\u001b[0m, in \u001b[0;36mdownload\u001b[0;34m(url, output, quiet, proxy, speed, use_cookies, verify, id, fuzzy, resume)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 157\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m res \u001b[39m=\u001b[39m sess\u001b[39m.\u001b[39;49mget(url, headers\u001b[39m=\u001b[39;49mheaders, stream\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, verify\u001b[39m=\u001b[39;49mverify)\n\u001b[1;32m 159\u001b[0m \u001b[39mexcept\u001b[39;00m requests\u001b[39m.\u001b[39mexceptions\u001b[39m.\u001b[39mProxyError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 160\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAn error has occurred using proxy:\u001b[39m\u001b[39m\"\u001b[39m, proxy, file\u001b[39m=\u001b[39msys\u001b[39m.\u001b[39mstderr)\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:600\u001b[0m, in \u001b[0;36mSession.get\u001b[0;34m(self, url, **kwargs)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[39m\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\"\"Sends a GET request. 
Returns :class:`Response` object.\u001b[39;00m\n\u001b[1;32m 593\u001b[0m \n\u001b[1;32m 594\u001b[0m \u001b[39m:param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m 595\u001b[0m \u001b[39m:param \\*\\*kwargs: Optional arguments that ``request`` takes.\u001b[39;00m\n\u001b[1;32m 596\u001b[0m \u001b[39m:rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 597\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 599\u001b[0m kwargs\u001b[39m.\u001b[39msetdefault(\u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m--> 600\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrequest(\u001b[39m\"\u001b[39;49m\u001b[39mGET\u001b[39;49m\u001b[39m\"\u001b[39;49m, url, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:573\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[39m# Create the Request.\u001b[39;00m\n\u001b[1;32m 561\u001b[0m req \u001b[39m=\u001b[39m Request(\n\u001b[1;32m 562\u001b[0m method\u001b[39m=\u001b[39mmethod\u001b[39m.\u001b[39mupper(),\n\u001b[1;32m 563\u001b[0m url\u001b[39m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 571\u001b[0m hooks\u001b[39m=\u001b[39mhooks,\n\u001b[1;32m 572\u001b[0m )\n\u001b[0;32m--> 573\u001b[0m prep \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprepare_request(req)\n\u001b[1;32m 575\u001b[0m proxies \u001b[39m=\u001b[39m proxies \u001b[39mor\u001b[39;00m {}\n\u001b[1;32m 577\u001b[0m settings \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmerge_environment_settings(\n\u001b[1;32m 578\u001b[0m prep\u001b[39m.\u001b[39murl, proxies, stream, verify, cert\n\u001b[1;32m 579\u001b[0m )\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:484\u001b[0m, in \u001b[0;36mSession.prepare_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 481\u001b[0m auth \u001b[39m=\u001b[39m get_netrc_auth(request\u001b[39m.\u001b[39murl)\n\u001b[1;32m 483\u001b[0m p \u001b[39m=\u001b[39m PreparedRequest()\n\u001b[0;32m--> 484\u001b[0m p\u001b[39m.\u001b[39;49mprepare(\n\u001b[1;32m 485\u001b[0m method\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mmethod\u001b[39m.\u001b[39;49mupper(),\n\u001b[1;32m 486\u001b[0m url\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49murl,\n\u001b[1;32m 487\u001b[0m files\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mfiles,\n\u001b[1;32m 488\u001b[0m data\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mdata,\n\u001b[1;32m 489\u001b[0m json\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mjson,\n\u001b[1;32m 490\u001b[0m headers\u001b[39m=\u001b[39;49mmerge_setting(\n\u001b[1;32m 491\u001b[0m request\u001b[39m.\u001b[39;49mheaders, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mheaders, dict_class\u001b[39m=\u001b[39;49mCaseInsensitiveDict\n\u001b[1;32m 492\u001b[0m ),\n\u001b[1;32m 493\u001b[0m params\u001b[39m=\u001b[39;49mmerge_setting(request\u001b[39m.\u001b[39;49mparams, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mparams),\n\u001b[1;32m 494\u001b[0m auth\u001b[39m=\u001b[39;49mmerge_setting(auth, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mauth),\n\u001b[1;32m 495\u001b[0m 
cookies\u001b[39m=\u001b[39;49mmerged_cookies,\n\u001b[1;32m 496\u001b[0m hooks\u001b[39m=\u001b[39;49mmerge_hooks(request\u001b[39m.\u001b[39;49mhooks, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhooks),\n\u001b[1;32m 497\u001b[0m )\n\u001b[1;32m 498\u001b[0m \u001b[39mreturn\u001b[39;00m p\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/models.py:368\u001b[0m, in \u001b[0;36mPreparedRequest.prepare\u001b[0;34m(self, method, url, headers, files, data, params, auth, cookies, hooks, json)\u001b[0m\n\u001b[1;32m 365\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Prepares the entire request with the given parameters.\"\"\"\u001b[39;00m\n\u001b[1;32m 367\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_method(method)\n\u001b[0;32m--> 368\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprepare_url(url, params)\n\u001b[1;32m 369\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_headers(headers)\n\u001b[1;32m 370\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_cookies(cookies)\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/models.py:439\u001b[0m, in \u001b[0;36mPreparedRequest.prepare_url\u001b[0;34m(self, url, params)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[39mraise\u001b[39;00m InvalidURL(\u001b[39m*\u001b[39me\u001b[39m.\u001b[39margs)\n\u001b[1;32m 438\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m scheme:\n\u001b[0;32m--> 439\u001b[0m \u001b[39mraise\u001b[39;00m MissingSchema(\n\u001b[1;32m 440\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mInvalid URL \u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m!r}\u001b[39;00m\u001b[39m: No scheme supplied. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 441\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mPerhaps you meant http://\u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m}\u001b[39;00m\u001b[39m?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 442\u001b[0m )\n\u001b[1;32m 444\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m host:\n\u001b[1;32m 445\u001b[0m \u001b[39mraise\u001b[39;00m InvalidURL(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mInvalid URL \u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m!r}\u001b[39;00m\u001b[39m: No host supplied\u001b[39m\u001b[39m\"\u001b[39m)\n", - "\u001b[0;31mMissingSchema\u001b[0m: Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?" - ] - } - ], - "source": [ - "download_dataTMPsets()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def download_dataTMPsets():\n", - "\n", - " urls = [\n", - " [\"https://snap.stanford.edu/dataTMP/loc-brightkite_edges.txt.gz\", \"https://snap.stanford.edu/dataTMP/loc-brightkite_totalCheckins.txt.gz\"],\n", - " [\"https://snap.stanford.edu/dataTMP/loc-gowalla_edges.txt.gz\", \"https://snap.stanford.edu/dataTMP/loc-gowalla_totalCheckins.txt.gz\"],\n", - " [\"https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing\"]\n", - " ]\n", - "\n", - " folders = [\"brightkite\", \"gowalla\", \"foursquare\"]\n", - "\n", - " if not os.path.exists(\"dataTMP\"):\n", - " os.mkdir(\"dataTMP\")\n", - "\n", - " for folder in folders:\n", - " if not os.path.exists(os.path.join(\"dataTMP\", folder)):\n", - " os.mkdir(os.path.join(\"dataTMP\", folder))\n", - "\n", - " # Download every url in their respective folder. For the last one, we have to use gdown, because it's a google drive link. 
If the file is already downloaded, skip the download\n", - "\n", - " for i in range(len(urls)):\n", - " for url in urls[i]:\n", - " if not os.path.exists(os.path.join(\"dataTMP\", folders[i], url.split(\"/\")[-1])):\n", - " if i == 2:\n", - " output = os.path.join(\"dataTMP\", folders[i], \"something.zip\")\n", - " gdown.download(url, output, quiet=False, fuzzy=True)\n", - " else:\n", - " wget.download(url, os.path.join(\"dataTMP\", folders[i]))\n", - "\n", - "download_dataTMPsets()\n", - " # # unzip all the files in the 3 folders. Then remove the .gz or .zip files\n", - "\n", - " # for folder in folders:\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", folder)):\n", - " # print(folder, file)\n", - " # if file.endswith(\".gz\"):\n", - " # os.system(\"gunzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n", - " # elif file.endswith(\".zip\"):\n", - " # os.system(\"unzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n", - " # os.remove(os.path.join(\"dataTMP\", folder, file))\n", - "\n", - " # # take all the .txt files from dataTMP/foursquare/dataTMPset_WWW2019 and move them to dataTMP/foursquare\n", - "\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\")):\n", - " # if file.endswith(\".txt\"):\n", - " # os.rename(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\", file), os.path.join(\"dataTMP\", \"foursquare\", file))\n", - "\n", - " # # remove the dataTMPset_WWW2019 folder, note that is not empty\n", - " # # os.rmdir(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\"))\n", - "\n", - " # for file in [\"dataTMPset_WWW_friendship_old.txt\", \"dataTMPset_WWW_readme.txt\", \"raw_Checkins_anonymized.txt\", \"raw_POIs.txt\"]:\n", - " # os.remove(os.path.join(\"dataTMP\", \"foursquare\", file))\n", - "\n", - " # # Now we want to clean our dataTMP and rename the files.\n", - "\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", \"brightkite\")):\n", - " # if file.endswith(\"_edges.txt\"):\n", - " # os.rename(os.path.join(\"dataTMP\", \"brightkite\", file), os.path.join(\"dataTMP\", \"brightkite\", \"brightkite_friends_edges.txt\"))\n", - "\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", \"gowalla\")):\n", - " # if file.endswith(\"_edges.txt\"):\n", - " # os.rename(os.path.join(\"dataTMP\", \"gowalla\", file), os.path.join(\"dataTMP\", \"gowalla\", \"gowalla_friends_edges.txt\"))\n", - "\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\")):\n", - " # if file.endswith(\"dataTMPset_WWW_friendship_new.txt\"):\n", - " # os.rename(os.path.join(\"dataTMP\", \"foursquare\", file), os.path.join(\"dataTMP\", \"foursquare\", \"foursquare_friends_edges.txt\"))\n", - "\n", - " # # Now we from the _totalCheckins.txt files we want to keep only the first and last column, which are the user ID and the venue ID. 
We also want to remove the header of the file.\n", - "\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", \"brightkite\")):\n", - " # if file.endswith(\"_totalCheckins.txt\"):\n", - " # df = pd.read_csv(os.path.join(\"dataTMP\", \"brightkite\", file), sep=\"\\t\", header=None, names=[\"user_id\", \"check-in time\", \"latitude\", \"longitude\", \"venue_id\"])\n", - " # df[\"check-in time\"] = pd.to_datetime(df[\"check-in time\"])\n", - " # df = df[df[\"check-in time\"].dt.year == 2010]\n", - " # df = df.drop([\"check-in time\", \"latitude\", \"longitude\"], axis=1)\n", - " # df.to_csv(os.path.join(\"dataTMP\", \"brightkite\", \"brightkite_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n", - " # os.remove(os.path.join(\"dataTMP\", \"brightkite\", file))\n", - "\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", \"gowalla\")):\n", - " # if file.endswith(\"_totalCheckins.txt\"):\n", - " # df = pd.read_csv(os.path.join(\"dataTMP\", \"gowalla\", file), sep=\"\\t\", header=None, names=[\"user_id\", \"check-in time\", \"latitude\", \"longitude\", \"venue_id\"])\n", - " # df[\"check-in time\"] = pd.to_datetime(df[\"check-in time\"])\n", - " # df = df[df[\"check-in time\"].dt.year == 2010]\n", - " # df = df.drop([\"check-in time\", \"latitude\", \"longitude\"], axis=1)\n", - " # df.to_csv(os.path.join(\"dataTMP\", \"gowalla\", \"gowalla_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n", - " # os.remove(os.path.join(\"dataTMP\", \"gowalla\", file))\n", - "\n", - " # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\")):\n", - " # if file.endswith(\"dataTMPset_WWW_Checkins_anonymized.txt\"):\n", - " # df = pd.read_csv(os.path.join(\"dataTMP\", \"foursquare\", file), sep=\"\\t\", header=None)\n", - " # df = df[[0, 1]]\n", - " # df.to_csv(os.path.join(\"dataTMP\", \"foursquare\", \"foursquare_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n", - " # os.remove(os.path.join(\"dataTMP\", \"foursquare\", file))\n" - ] } ], "metadata": { diff --git a/utils.py b/utils.py index 772cf9f..b60aad3 100755 --- a/utils.py +++ b/utils.py @@ -22,100 +22,100 @@ from collections import Counter import numpy as np import gdown from networkx.utils import py_random_state +import shutil # ------------------------------------------------------------------------# def download_datasets(): - urls = [ - ["https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz", "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"], - ["https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz", "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"], - ["https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing"] - ] + """ + Download the datasets from the web and unzip them. The datasets are downloaded from the SNAP website and from a Google Drive folder. + + Parameters + ---------- + None - folders = ["brightkite", "gowalla", "foursquare"] + Returns + ------- + None + + Notes + ----- + The datasets are downloaded in the "data" folder. If the folder doesn't exist, it will be created. If the dataset is already downloaded, it will be skipped. The files are renamed to make them more readable. 
+ """ + + + dict = { + "brightkite": ["https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz", "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"], + "gowalla": ["https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz", "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"], + "foursquare": ["https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing"] + } if not os.path.exists("data"): os.mkdir("data") + print("Created data folder") - for folder in folders: + for folder in dict.keys(): if not os.path.exists(os.path.join("data", folder)): os.mkdir(os.path.join("data", folder)) + print("Created {} folder".format(folder)) - # Download every url in their respective folder. For the last one, we have to use gdown, because it's a google drive link. If the file is already downloaded, skip the download + ## DOWNLOADING ## - for i in range(len(urls)): - for url in urls[i]: - if not os.path.exists(os.path.join("data", folders[i], url.split("/")[-1])): - if i == 2: - output = os.path.join("data", folders[i], "something.zip") + for folder in dict.keys(): + for url in dict[folder]: + if folder == "foursquare": + if not os.path.exists(os.path.join("data", folder, "foursquare_full.zip")): + output = os.path.join("data", folder, "foursquare_full.zip") gdown.download(url, output, quiet=False, fuzzy=True) - else: - wget.download(url, os.path.join("data", folders[i])) - - # unzip all the files in the 3 folders. Then remove the .gz or .zip files - - for folder in folders: + else : + print("{} already downloaded".format(url)) + else: + if not os.path.exists(os.path.join("data", folder, url.split("/")[-1])): + print("Downloading {}...".format(url)) + wget.download(url, os.path.join("data", folder)) + else : + print("{} already downloaded".format(url)) + + ## UNZIPPING ## + + for folder in dict.keys(): for file in os.listdir(os.path.join("data", folder)): - print(folder, file) if file.endswith(".gz"): + print("Unzipping {}...".format(file)) os.system("gunzip {}".format(os.path.join("data", folder, file))) elif file.endswith(".zip"): - os.system("unzip {}".format(os.path.join("data", folder, file))) + print("Unzipping {}...".format(file)) + os.system("unzip -o {} -d {}".format(os.path.join("data", folder, file), os.path.join("data", folder))) os.remove(os.path.join("data", folder, file)) - # take all the .txt files from data/foursquare/dataset_WWW2019 and move them to data/foursquare + ## FOURSQUARE CLEANING ## for file in os.listdir(os.path.join("data", "foursquare", "dataset_WWW2019")): if file.endswith(".txt"): os.rename(os.path.join("data", "foursquare", "dataset_WWW2019", file), os.path.join("data", "foursquare", file)) - # remove the dataset_WWW2019 folder, note that is not empty - # os.rmdir(os.path.join("data", "foursquare", "dataset_WWW2019")) - - for file in ["dataset_WWW_friendship_old.txt", "dataset_WWW_readme.txt", "raw_Checkins_anonymized.txt", "raw_POIs.txt"]: + for file in ["dataset_WWW_friendship_old.txt", "dataset_WWW_readme.txt", "raw_Checkins_anonymized.txt"]: os.remove(os.path.join("data", "foursquare", file)) - # Now we want to clean our data and rename the files. 
- - for file in os.listdir(os.path.join("data", "brightkite")): - if file.endswith("_edges.txt"): - os.rename(os.path.join("data", "brightkite", file), os.path.join("data", "brightkite", "brightkite_friends_edges.txt")) - - for file in os.listdir(os.path.join("data", "gowalla")): - if file.endswith("_edges.txt"): - os.rename(os.path.join("data", "gowalla", file), os.path.join("data", "gowalla", "gowalla_friends_edges.txt")) - - for file in os.listdir(os.path.join("data", "foursquare")): - if file.endswith("dataset_WWW_friendship_new.txt"): - os.rename(os.path.join("data", "foursquare", file), os.path.join("data", "foursquare", "foursquare_friends_edges.txt")) - - # Now we from the _totalCheckins.txt files we want to keep only the first and last column, which are the user ID and the venue ID. We also want to remove the header of the file. - - for file in os.listdir(os.path.join("data", "brightkite")): - if file.endswith("_totalCheckins.txt"): - df = pd.read_csv(os.path.join("data", "brightkite", file), sep="\t", header=None, names=["user_id", "check-in time", "latitude", "longitude", "venue_id"]) - df["check-in time"] = pd.to_datetime(df["check-in time"]) - df = df[df["check-in time"].dt.year == 2010] - df = df.drop(["check-in time", "latitude", "longitude"], axis=1) - df.to_csv(os.path.join("data", "brightkite", "brightkite_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8") - os.remove(os.path.join("data", "brightkite", file)) - - for file in os.listdir(os.path.join("data", "gowalla")): - if file.endswith("_totalCheckins.txt"): - df = pd.read_csv(os.path.join("data", "gowalla", file), sep="\t", header=None, names=["user_id", "check-in time", "latitude", "longitude", "venue_id"]) - df["check-in time"] = pd.to_datetime(df["check-in time"]) - df = df[df["check-in time"].dt.year == 2010] - df = df.drop(["check-in time", "latitude", "longitude"], axis=1) - df.to_csv(os.path.join("data", "gowalla", "gowalla_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8") - os.remove(os.path.join("data", "gowalla", file)) - - for file in os.listdir(os.path.join("data", "foursquare")): - if file.endswith("dataset_WWW_Checkins_anonymized.txt"): - df = pd.read_csv(os.path.join("data", "foursquare", file), sep="\t", header=None) - df = df[[0, 1]] - df.to_csv(os.path.join("data", "foursquare", "foursquare_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8") - os.remove(os.path.join("data", "foursquare", file)) + shutil.rmtree(os.path.join("data", "foursquare", "dataset_WWW2019")) + shutil.rmtree(os.path.join("data", "foursquare", "__MACOSX")) + + os.rename(os.path.join("data", "foursquare", "dataset_WWW_friendship_new.txt"), os.path.join("data", "foursquare", "foursquare_friends_edges.txt")) + + os.rename(os.path.join("data", "foursquare", "dataset_WWW_Checkins_anonymized.txt"), os.path.join("data", "foursquare", "foursquare_checkins.txt")) + + ## BRIGHTKITE CLEANING ## + + os.rename(os.path.join("data", "brightkite", "loc-brightkite_totalCheckins.txt"), os.path.join("data", "brightkite", "brightkite_checkins.txt")) + + os.rename(os.path.join("data", "brightkite", "loc-brightkite_edges.txt"), os.path.join("data", "brightkite", "brightkite_friends_edges.txt")) + + ## GOWALLA CLEANING ## + + os.rename(os.path.join("data", "gowalla", "loc-gowalla_totalCheckins.txt"), os.path.join("data", "gowalla", "gowalla_checkins.txt")) + + os.rename(os.path.join("data", "gowalla", "loc-gowalla_edges.txt"), 
os.path.join("data", "gowalla", "gowalla_friends_edges.txt")) # ------------------------------------------------------------------------# @@ -145,7 +145,7 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq if dataset not in ['brightkite', 'gowalla', 'foursquare']: raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquare") - + file = os.path.join("data", dataset, dataset + "_checkins.txt") print("\nCreating the graph for the dataset {}...".format(dataset)) @@ -173,7 +173,7 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq return G - + # ------------------------------------------------------------------------# def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT']) -> nx.Graph: @@ -200,7 +200,7 @@ def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursqua if dataset not in ["brightkite", "gowalla", "foursquare"]: raise ValueError("The dataset must be brightkite, gowalla or foursquare") - + file = os.path.join("data", dataset, dataset + "_friends_edges.txt") df_friends_all = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"], engine='pyarrow') @@ -469,7 +469,7 @@ def average_clustering_coefficient(G: nx.Graph, k=None) -> float: def generalized_average_clustering_coefficient(G: nx.Graph) -> float: - + """ Generalized definition of the average clustering coefficient of a graph. It better applies to small world networks and it's way more efficient than the average_clustering_coefficient function with the standard definition of the clustering coefficient. @@ -483,7 +483,7 @@ def generalized_average_clustering_coefficient(G: nx.Graph) -> float: float The generalized average clustering coefficient of the graph. """ - + C = 0 for node in G.nodes(): k = G.degree(node) @@ -547,5 +547,3 @@ def create_random_graphs(G: nx.Graph, model = None, save = True) -> nx.Graph: print("\tThe file graph has been saved in the folder data/random/watts_strogatz with the syntax watts_strogatz_n_nodes_n_edges.gpickle") return G_random - - \ No newline at end of file