@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1 ,
"execution_count": null ,
"metadata": {},
"outputs": [],
"source": [
@ -16,372 +16,106 @@
"import pandas as pd\n",
"import networkx as nx\n",
"import plotly.graph_objects as go\n",
"from utils import *\n",
"# from utils import *\n",
"from collections import Counter\n",
"from tqdm import tqdm\n",
"import time\n",
"import geopandas as gpd\n",
"import gdown # for downloading files from google drive\n",
"\n",
"# ignore warnings\n",
"import warnings\n",
"import sys\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Graph</th>\n",
" <th>Number of Nodes</th>\n",
" <th>Number of Edges</th>\n",
" <th>Average Degree</th>\n",
" <th>Average Clustering Coefficient</th>\n",
" <th>log N</th>\n",
" <th>Average Shortest Path Length</th>\n",
" <th>betweenness centrality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Brightkite Checkins Graph</td>\n",
" <td>7191</td>\n",
" <td>3663807</td>\n",
" <td>1018.997914</td>\n",
" <td>0.702854</td>\n",
" <td>8.880586</td>\n",
" <td>2.411011</td>\n",
" <td>0.00022</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Gowalla Checkins Graph</td>\n",
" <td>10702</td>\n",
" <td>303104</td>\n",
" <td>56.644366</td>\n",
" <td>0.505597</td>\n",
" <td>9.278186</td>\n",
" <td>5.222903</td>\n",
" <td>0.000301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Foursquare EU Checkins Graph</td>\n",
" <td>20282</td>\n",
" <td>7430376</td>\n",
" <td>732.706439</td>\n",
" <td>0.597097</td>\n",
" <td>9.917489</td>\n",
" <td>2.2843</td>\n",
" <td>0.000089</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Foursquare IT Checkins Graph</td>\n",
" <td>3730</td>\n",
" <td>629749</td>\n",
" <td>337.667024</td>\n",
" <td>0.683565</td>\n",
" <td>8.224164</td>\n",
" <td>2.185477</td>\n",
" <td>0.000428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Brightkite Friendship Graph</td>\n",
" <td>5928</td>\n",
" <td>34673</td>\n",
" <td>11.698043</td>\n",
" <td>0.219749</td>\n",
" <td>8.687442</td>\n",
" <td>5.052162</td>\n",
" <td>0.000448</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>(Filtered) Gowalla Friendship Graph</td>\n",
" <td>8396</td>\n",
" <td>29122</td>\n",
" <td>6.937113</td>\n",
" <td>0.217544</td>\n",
" <td>9.035511</td>\n",
" <td>4.558532</td>\n",
" <td>0.000357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Foursquare IT Friendship Graph</td>\n",
" <td>2073</td>\n",
" <td>6217</td>\n",
" <td>5.99807</td>\n",
" <td>0.148489</td>\n",
" <td>7.636752</td>\n",
" <td>19.530752</td>\n",
" <td>0.000879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Foursquare EU Friendship Graph</td>\n",
" <td>16491</td>\n",
" <td>59419</td>\n",
" <td>7.206234</td>\n",
" <td>0.167946</td>\n",
" <td>9.710570</td>\n",
" <td>23.713864</td>\n",
" <td>0.000272</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Graph Number of Nodes Number of Edges \\\n",
"0 Brightkite Checkins Graph 7191 3663807 \n",
"1 Gowalla Checkins Graph 10702 303104 \n",
"2 Foursquare EU Checkins Graph 20282 7430376 \n",
"3 Foursquare IT Checkins Graph 3730 629749 \n",
"4 Brightkite Friendship Graph 5928 34673 \n",
"5 (Filtered) Gowalla Friendship Graph 8396 29122 \n",
"6 Foursquare IT Friendship Graph 2073 6217 \n",
"7 Foursquare EU Friendship Graph 16491 59419 \n",
"\n",
" Average Degree Average Clustering Coefficient log N \\\n",
"0 1018.997914 0.702854 8.880586 \n",
"1 56.644366 0.505597 9.278186 \n",
"2 732.706439 0.597097 9.917489 \n",
"3 337.667024 0.683565 8.224164 \n",
"4 11.698043 0.219749 8.687442 \n",
"5 6.937113 0.217544 9.035511 \n",
"6 5.99807 0.148489 7.636752 \n",
"7 7.206234 0.167946 9.710570 \n",
"\n",
" Average Shortest Path Length betweenness centrality \n",
"0 2.411011 0.00022 \n",
"1 5.222903 0.000301 \n",
"2 2.2843 0.000089 \n",
"3 2.185477 0.000428 \n",
"4 5.052162 0.000448 \n",
"5 4.558532 0.000357 \n",
"6 19.530752 0.000879 \n",
"7 23.713864 0.000272 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import the graphs from the saved files\n",
"G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
"G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n",
"G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n",
"G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n",
"\n",
"G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n",
"G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n",
"G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n",
"G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n",
"\n",
"# open the dataframe object\n",
"analysis_results = pd.read_pickle('analysis_results.pkl')\n",
"analysis_results"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The first thing we want to do is simple: create a random reference for each graph."
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Group the loaded graphs by kind; the combined list holds the check-in graphs first,\n",
"# followed by the friendship graphs.\n",
"checkins_graphs = [\n",
"    G_brighkite_checkins,\n",
"    G_gowalla_checkins,\n",
"    G_foursquareEU_checkins,\n",
"    G_foursquareIT_checkins,\n",
"]\n",
"friendships_graph = [\n",
"    G_brighkite_friends,\n",
"    G_gowalla_friends,\n",
"    G_foursquareIT_friends,\n",
"    G_foursquareEU_friends,\n",
"]\n",
"\n",
"graphs_all = [*checkins_graphs, *friendships_graph]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random graphs"
"def download_dataTMPsets():\n",
"    \"\"\"Download the Brightkite, Gowalla and Foursquare archives into dataTMP/ and unpack them.\n",
"\n",
"    One sub-folder per dataset is created under dataTMP/. The SNAP files are fetched\n",
"    with wget and the Google Drive archive with gdown; a download is skipped when its\n",
"    target file already exists. Afterwards every .gz / .zip file found in the\n",
"    sub-folders is unpacked with the gunzip / unzip command-line tools.\n",
"    \"\"\"\n",
"    # Every value must be a list of URLs: a bare string here is iterated character by\n",
"    # character below, which is what made gdown fail with \"Invalid URL 'h'\".\n",
"    sources = {\n",
"        \"brightkite\": [\"https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz\"],\n",
"        \"gowalla\": [\"https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz\"],\n",
"        \"foursquare\": [\"https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing\"],\n",
"    }\n",
"\n",
"    if not os.path.exists(\"dataTMP\"):\n",
"        os.mkdir(\"dataTMP\")\n",
"        print(\"Created dataTMP folder\")\n",
"\n",
"    for folder in sources:\n",
"        folder_path = os.path.join(\"dataTMP\", folder)\n",
"        if not os.path.exists(folder_path):\n",
"            os.mkdir(folder_path)\n",
"            print(\"Created {} folder\".format(folder))\n",
"\n",
"    for folder, urls in sources.items():\n",
"        folder_path = os.path.join(\"dataTMP\", folder)\n",
"        for url in urls:\n",
"            if folder == \"foursquare\":\n",
"                # The Drive link carries no file name, so the archive gets a fixed one.\n",
"                output = os.path.join(folder_path, \"foursquare_full.zip\")\n",
"                if not os.path.exists(output):\n",
"                    gdown.download(url, output, quiet=False, fuzzy=True)\n",
"                else:\n",
"                    print(\"{} already downloaded\".format(url))\n",
"            else:\n",
"                if not os.path.exists(os.path.join(folder_path, url.split(\"/\")[-1])):\n",
"                    print(\"Downloading {}...\".format(url))\n",
"                    wget.download(url, folder_path)\n",
"                else:\n",
"                    print(\"{} already downloaded\".format(url))\n",
"\n",
"    for folder in sources:\n",
"        folder_path = os.path.join(\"dataTMP\", folder)\n",
"        for file in os.listdir(folder_path):\n",
"            archive = os.path.join(folder_path, file)\n",
"            if file.endswith(\".gz\"):\n",
"                print(\"Unzipping {}...\".format(file))\n",
"                os.system(\"gunzip {}\".format(archive))\n",
"            elif file.endswith(\".zip\"):\n",
"                print(\"Unzipping {}...\".format(file))\n",
"                # -d extracts next to the archive instead of into the current working directory.\n",
"                os.system(\"unzip {} -d {}\".format(archive, folder_path))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"# analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"# for graph in graphs_all:\n",
"# print(\"\\nCreating random graph for graph: \", graph.name)\n",
"# G_erd = create_random_graphs(graph, model='erdos', save=False)\n",
"# G_ws = create_random_graphs(graph, model='watts_strogatz', save=False)\n",
" \n",
"# # add the basic information to the dataframe\n",
"# analysis_results_erods = analysis_results_erods.append({\n",
"# 'Graph': G_erd.name,\n",
"# 'Number of Nodes': G_erd.number_of_nodes(),\n",
"# 'Number of Edges': G_erd.number_of_edges(),\n",
"# 'log N': np.log(G_erd.number_of_nodes())\n",
"# }, ignore_index=True)\n",
"\n",
"# # add the basic information to the dataframe\n",
"# analysis_results_ws = analysis_results_ws.append({\n",
"# 'Graph': G_ws.name,\n",
"# 'Number of Nodes': G_ws.number_of_nodes(),\n",
"# 'Number of Edges': G_ws.number_of_edges(),\n",
"# 'log N': np.log(G_ws.number_of_nodes())\n",
"# }, ignore_index=True)\n",
"\n",
"# # compute the average degree and add it to the dataframes\n",
"# avg_deg_erd = np.mean([d for n, d in G_erd.degree()])\n",
"# avg_deg_ws = np.mean([d for n, d in G_ws.degree()])\n",
"# analysis_results_erods.loc[analysis_results_erods['Graph'] == G_erd.name, 'Average Degree'] = avg_deg_erd\n",
"# analysis_results_ws.loc[analysis_results_ws['Graph'] == G_ws.name, 'Average Degree'] = avg_deg_ws\n",
"\n",
"# # compute the average clustering coefficient and add it to the dataframes\n",
"# avg_clustering_erd = average_clustering_coefficient(G_erd, k = 0.9)\n",
"# avg_clustering_ws = average_clustering_coefficient(G_ws, k = 0.9)\n",
"# analysis_results_erods.loc[analysis_results_erods['Graph'] == G_erd.name, 'Average Clustering Coefficient'] = avg_clustering_erd\n",
"# analysis_results_ws.loc[analysis_results_ws['Graph'] == G_ws.name, 'Average Clustering Coefficient'] = avg_clustering_ws\n",
"\n",
"# # compute the average shortest path length and add it to the dataframes\n",
"# average_shortest_path_length_erd = average_shortest_path(G_erd, k = 0.9)\n",
"# average_shortest_path_length_ws = average_shortest_path(G_ws, k = 0.9)\n",
"# analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length_erd\n",
"# analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length_ws\n",
"\n",
"# # compute the betweenness centrality and add it to the dataframes\n",
"# betweenness_centrality_erd = np.mean(list(betweenness_centrality_parallel(G_erd, 4, k = 0.9).values()))\n",
"# betweenness_centrality_ws = np.mean(list(betweenness_centrality_parallel(G_ws, 4, k = 0.9).values()))\n",
"# analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality_erd\n",
"# analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality_ws\n",
"\n",
"# # save memory\n",
"# del G_erd, G_ws\n",
"\n",
"# analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n",
"# analysis_results_ws.to_pickle('analysis_results_ws.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Small Worldness\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We have already computed the average clustering coefficient and the average shortest path length for our networks. We can save a lot of time by skipping these computations."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def omega(G, C_og, L_og, niter, nrand, seed=42):\n",
"    \"\"\"Return the small-world coefficient omega = Lr / L - C / Cl of G.\n",
"\n",
"    The clustering coefficient and the average shortest path length of G are\n",
"    passed in (C_og, L_og) instead of being recomputed here.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    G : networkx graph to compare against its random and lattice references.\n",
"    C_og : average clustering coefficient of G.\n",
"    L_og : average shortest path length of G.\n",
"    niter : passed as niter to nx.lattice_reference; nx.random_reference gets\n",
"        twice this value.\n",
"    nrand : number of random / lattice reference graphs to generate.\n",
"    seed : integer base seed; repetition i uses seed + i.\n",
"\n",
"    Returns\n",
"    -------\n",
"    float : (Lr / L_og) - (C_og / Cl), where Lr is the mean average shortest path\n",
"        length of the random references and Cl is the largest average clustering\n",
"        seen among C_og and the lattice references.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError : if nrand is smaller than 1.\n",
"    \"\"\"\n",
"    if nrand < 1:\n",
"        raise ValueError(\"nrand must be at least 1\")\n",
"\n",
"    random_path_lengths = []\n",
"\n",
"    # Start from the clustering of G itself; a lattice reference only replaces it\n",
"    # when its clustering is higher.\n",
"    Cl = C_og\n",
"\n",
"    niter_lattice_reference = niter\n",
"    niter_random_reference = niter * 2\n",
"\n",
"    for i in range(nrand):\n",
"        # Use a different seed per repetition: with one fixed seed every repetition\n",
"        # would rebuild the same reference graphs and the averaging would be pointless.\n",
"        Gr = nx.random_reference(G, niter=niter_random_reference, seed=seed + i)\n",
"        random_path_lengths.append(nx.average_shortest_path_length(Gr))\n",
"\n",
"        Gl = nx.lattice_reference(G, niter=niter_lattice_reference, seed=seed + i)\n",
"        Cl = max(Cl, nx.average_clustering(Gl))\n",
"\n",
"    Lr = np.mean(random_path_lengths)\n",
"\n",
"    # The result was previously only assigned to a local, so callers received None.\n",
"    return (Lr / L_og) - (C_og / Cl)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Brightkite Checkins Graph\n"
"Created dataTMP folder\n",
"Created brightkite folder\n",
"Created gowalla folder\n",
"Created foursquare folder\n",
"Downloading https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz...\n",
"Downloading https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz...\n",
"Downloading https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz...\n",
"Downloading https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz...\n"
]
},
{
"ename": "MissingSchema",
"evalue": "Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mMissingSchema\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m download_dataTMPsets()\n",
"Cell \u001b[0;32mIn[8], line 22\u001b[0m, in \u001b[0;36mdownload_dataTMPsets\u001b[0;34m()\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mexists(os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39m\"\u001b[39m\u001b[39mdataTMP\u001b[39m\u001b[39m\"\u001b[39m, folder, \u001b[39m\"\u001b[39m\u001b[39mfoursquare_full.zip\u001b[39m\u001b[39m\"\u001b[39m)):\n\u001b[1;32m 21\u001b[0m output \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39m\"\u001b[39m\u001b[39mdataTMP\u001b[39m\u001b[39m\"\u001b[39m, folder, \u001b[39m\"\u001b[39m\u001b[39mfoursquare_full.zip\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m---> 22\u001b[0m gdown\u001b[39m.\u001b[39;49mdownload(url, output, quiet\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, fuzzy\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 23\u001b[0m \u001b[39melse\u001b[39;00m :\n\u001b[1;32m 24\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m already downloaded\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(url))\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/gdown/download.py:158\u001b[0m, in \u001b[0;36mdownload\u001b[0;34m(url, output, quiet, proxy, speed, use_cookies, verify, id, fuzzy, resume)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 157\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m res \u001b[39m=\u001b[39m sess\u001b[39m.\u001b[39;49mget(url, headers\u001b[39m=\u001b[39;49mheaders, stream\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, verify\u001b[39m=\u001b[39;49mverify)\n\u001b[1;32m 159\u001b[0m \u001b[39mexcept\u001b[39;00m requests\u001b[39m.\u001b[39mexceptions\u001b[39m.\u001b[39mProxyError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 160\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAn error has occurred using proxy:\u001b[39m\u001b[39m\"\u001b[39m, proxy, file\u001b[39m=\u001b[39msys\u001b[39m.\u001b[39mstderr)\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:600\u001b[0m, in \u001b[0;36mSession.get\u001b[0;34m(self, url, **kwargs)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[39m\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\"\"Sends a GET request. Returns :class:`Response` object.\u001b[39;00m\n\u001b[1;32m 593\u001b[0m \n\u001b[1;32m 594\u001b[0m \u001b[39m:param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m 595\u001b[0m \u001b[39m:param \\*\\*kwargs: Optional arguments that ``request`` takes.\u001b[39;00m\n\u001b[1;32m 596\u001b[0m \u001b[39m:rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 597\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 599\u001b[0m kwargs\u001b[39m.\u001b[39msetdefault(\u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m--> 600\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrequest(\u001b[39m\"\u001b[39;49m\u001b[39mGET\u001b[39;49m\u001b[39m\"\u001b[39;49m, url, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:573\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[39m# Create the Request.\u001b[39;00m\n\u001b[1;32m 561\u001b[0m req \u001b[39m=\u001b[39m Request(\n\u001b[1;32m 562\u001b[0m method\u001b[39m=\u001b[39mmethod\u001b[39m.\u001b[39mupper(),\n\u001b[1;32m 563\u001b[0m url\u001b[39m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 571\u001b[0m hooks\u001b[39m=\u001b[39mhooks,\n\u001b[1;32m 572\u001b[0m )\n\u001b[0;32m--> 573\u001b[0m prep \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprepare_request(req)\n\u001b[1;32m 575\u001b[0m proxies \u001b[39m=\u001b[39m proxies \u001b[39mor\u001b[39;00m {}\n\u001b[1;32m 577\u001b[0m settings \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmerge_environment_settings(\n\u001b[1;32m 578\u001b[0m prep\u001b[39m.\u001b[39murl, proxies, stream, verify, cert\n\u001b[1;32m 579\u001b[0m )\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:484\u001b[0m, in \u001b[0;36mSession.prepare_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 481\u001b[0m auth \u001b[39m=\u001b[39m get_netrc_auth(request\u001b[39m.\u001b[39murl)\n\u001b[1;32m 483\u001b[0m p \u001b[39m=\u001b[39m PreparedRequest()\n\u001b[0;32m--> 484\u001b[0m p\u001b[39m.\u001b[39;49mprepare(\n\u001b[1;32m 485\u001b[0m method\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mmethod\u001b[39m.\u001b[39;49mupper(),\n\u001b[1;32m 486\u001b[0m url\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49murl,\n\u001b[1;32m 487\u001b[0m files\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mfiles,\n\u001b[1;32m 488\u001b[0m data\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mdata,\n\u001b[1;32m 489\u001b[0m json\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mjson,\n\u001b[1;32m 490\u001b[0m headers\u001b[39m=\u001b[39;49mmerge_setting(\n\u001b[1;32m 491\u001b[0m request\u001b[39m.\u001b[39;49mheaders, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mheaders, dict_class\u001b[39m=\u001b[39;49mCaseInsensitiveDict\n\u001b[1;32m 492\u001b[0m ),\n\u001b[1;32m 493\u001b[0m params\u001b[39m=\u001b[39;49mmerge_setting(request\u001b[39m.\u001b[39;49mparams, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mparams),\n\u001b[1;32m 494\u001b[0m auth\u001b[39m=\u001b[39;49mmerge_setting(auth, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mauth),\n\u001b[1;32m 495\u001b[0m cookies\u001b[39m=\u001b[39;49mmerged_cookies,\n\u001b[1;32m 496\u001b[0m hooks\u001b[39m=\u001b[39;49mmerge_hooks(request\u001b[39m.\u001b[39;49mhooks, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhooks),\n\u001b[1;32m 497\u001b[0m )\n\u001b[1;32m 498\u001b[0m \u001b[39mreturn\u001b[39;00m p\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/models.py:368\u001b[0m, in \u001b[0;36mPreparedRequest.prepare\u001b[0;34m(self, method, url, headers, files, data, params, auth, cookies, hooks, json)\u001b[0m\n\u001b[1;32m 365\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Prepares the entire request with the given parameters.\"\"\"\u001b[39;00m\n\u001b[1;32m 367\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_method(method)\n\u001b[0;32m--> 368\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprepare_url(url, params)\n\u001b[1;32m 369\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_headers(headers)\n\u001b[1;32m 370\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_cookies(cookies)\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/models.py:439\u001b[0m, in \u001b[0;36mPreparedRequest.prepare_url\u001b[0;34m(self, url, params)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[39mraise\u001b[39;00m InvalidURL(\u001b[39m*\u001b[39me\u001b[39m.\u001b[39margs)\n\u001b[1;32m 438\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m scheme:\n\u001b[0;32m--> 439\u001b[0m \u001b[39mraise\u001b[39;00m MissingSchema(\n\u001b[1;32m 440\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mInvalid URL \u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m!r}\u001b[39;00m\u001b[39m: No scheme supplied. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 441\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mPerhaps you meant http://\u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m}\u001b[39;00m\u001b[39m?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 442\u001b[0m )\n\u001b[1;32m 444\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m host:\n\u001b[1;32m 445\u001b[0m \u001b[39mraise\u001b[39;00m InvalidURL(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mInvalid URL \u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m!r}\u001b[39;00m\u001b[39m: No host supplied\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[0;31mMissingSchema\u001b[0m: Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?"
]
}
],
"source": [
"analysis_results = pd.read_pickle('analysis_results.pkl')\n",
"\n",
"# Collect one record per check-in graph and build the DataFrame once at the end.\n",
"checkins_omega_rows = []\n",
"\n",
"for G in checkins_graphs:\n",
"    print(G.name)\n",
"    graph_row = analysis_results.loc[analysis_results['Graph'] == G.name]\n",
"    C_og = graph_row['Average Clustering Coefficient'].values[0]\n",
"    L_og = graph_row['Average Shortest Path Length'].values[0]\n",
"\n",
"    # Store the result under its own name: assigning it to `omega` would replace the\n",
"    # function and make the next iteration fail.\n",
"    omega_value = omega(G, C_og, L_og, 2, 3)\n",
"\n",
"    checkins_omega_rows.append({'Graph': G.name, 'omega': omega_value})\n",
"\n",
"omega_results = pd.DataFrame(checkins_omega_rows, columns=['Graph', 'omega'])"
"download_dataTMPsets()"
]
},
{
@ -390,17 +124,99 @@
"metadata": {},
"outputs": [],
"source": [
"# Collect one record per friendship graph, then add them to omega_results in one step.\n",
"friendships_omega_rows = []\n",
"\n",
"# The list defined earlier in the notebook is named `friendships_graph` (no trailing s).\n",
"for G in friendships_graph:\n",
"    print(G.name)\n",
"    graph_row = analysis_results.loc[analysis_results['Graph'] == G.name]\n",
"    C_og = graph_row['Average Clustering Coefficient'].values[0]\n",
"    L_og = graph_row['Average Shortest Path Length'].values[0]\n",
"\n",
"    # Store the result under its own name so the `omega` function is not overwritten.\n",
"    omega_value = omega(G, C_og, L_og, 2, 3)\n",
"\n",
"    friendships_omega_rows.append({'Graph': G.name, 'omega': omega_value})\n",
"\n",
"omega_results = pd.concat(\n",
"    [omega_results, pd.DataFrame(friendships_omega_rows, columns=['Graph', 'omega'])],\n",
"    ignore_index=True,\n",
")"
"def download_dataTMPsets():\n",
"    \"\"\"Download the Brightkite, Gowalla and Foursquare archives into dataTMP/.\n",
"\n",
"    One sub-folder per dataset is created under dataTMP/ when missing. The SNAP\n",
"    files are fetched with wget and the Google Drive archive with gdown. A file\n",
"    whose download target already exists is not downloaded again.\n",
"    \"\"\"\n",
"    # Folder name -> URLs stored in that folder. The SNAP files are served under\n",
"    # /data/, not /dataTMP/.\n",
"    sources = {\n",
"        \"brightkite\": [\"https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz\"],\n",
"        \"gowalla\": [\"https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz\"],\n",
"        \"foursquare\": [\"https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing\"],\n",
"    }\n",
"\n",
"    for folder, urls in sources.items():\n",
"        folder_path = os.path.join(\"dataTMP\", folder)\n",
"        os.makedirs(folder_path, exist_ok=True)\n",
"\n",
"        for url in urls:\n",
"            if folder == \"foursquare\":\n",
"                # The Drive link has no usable file name, so check the fixed output\n",
"                # path instead of the last URL segment.\n",
"                output = os.path.join(folder_path, \"something.zip\")\n",
"                if not os.path.exists(output):\n",
"                    gdown.download(url, output, quiet=False, fuzzy=True)\n",
"            elif not os.path.exists(os.path.join(folder_path, url.split(\"/\")[-1])):\n",
"                wget.download(url, folder_path)\n",
"\n",
"download_dataTMPsets()\n",
" # # unzip all the files in the 3 folders. Then remove the .gz or .zip files\n",
"\n",
" # for folder in folders:\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", folder)):\n",
" # print(folder, file)\n",
" # if file.endswith(\".gz\"):\n",
" # os.system(\"gunzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n",
" # elif file.endswith(\".zip\"):\n",
" # os.system(\"unzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n",
" # os.remove(os.path.join(\"dataTMP\", folder, file))\n",
"\n",
" # # take all the .txt files from dataTMP/foursquare/dataTMPset_WWW2019 and move them to dataTMP/foursquare\n",
"\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\")):\n",
" # if file.endswith(\".txt\"):\n",
" # os.rename(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\", file), os.path.join(\"dataTMP\", \"foursquare\", file))\n",
"\n",
" # # remove the dataTMPset_WWW2019 folder, note that is not empty\n",
" # # os.rmdir(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\"))\n",
"\n",
" # for file in [\"dataTMPset_WWW_friendship_old.txt\", \"dataTMPset_WWW_readme.txt\", \"raw_Checkins_anonymized.txt\", \"raw_POIs.txt\"]:\n",
" # os.remove(os.path.join(\"dataTMP\", \"foursquare\", file))\n",
"\n",
" # # Now we want to clean our dataTMP and rename the files.\n",
"\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", \"brightkite\")):\n",
" # if file.endswith(\"_edges.txt\"):\n",
" # os.rename(os.path.join(\"dataTMP\", \"brightkite\", file), os.path.join(\"dataTMP\", \"brightkite\", \"brightkite_friends_edges.txt\"))\n",
"\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", \"gowalla\")):\n",
" # if file.endswith(\"_edges.txt\"):\n",
" # os.rename(os.path.join(\"dataTMP\", \"gowalla\", file), os.path.join(\"dataTMP\", \"gowalla\", \"gowalla_friends_edges.txt\"))\n",
"\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\")):\n",
" # if file.endswith(\"dataTMPset_WWW_friendship_new.txt\"):\n",
" # os.rename(os.path.join(\"dataTMP\", \"foursquare\", file), os.path.join(\"dataTMP\", \"foursquare\", \"foursquare_friends_edges.txt\"))\n",
"\n",
" # # Now we from the _totalCheckins.txt files we want to keep only the first and last column, which are the user ID and the venue ID. We also want to remove the header of the file.\n",
"\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", \"brightkite\")):\n",
" # if file.endswith(\"_totalCheckins.txt\"):\n",
" # df = pd.read_csv(os.path.join(\"dataTMP\", \"brightkite\", file), sep=\"\\t\", header=None, names=[\"user_id\", \"check-in time\", \"latitude\", \"longitude\", \"venue_id\"])\n",
" # df[\"check-in time\"] = pd.to_datetime(df[\"check-in time\"])\n",
" # df = df[df[\"check-in time\"].dt.year == 2010]\n",
" # df = df.drop([\"check-in time\", \"latitude\", \"longitude\"], axis=1)\n",
" # df.to_csv(os.path.join(\"dataTMP\", \"brightkite\", \"brightkite_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n",
" # os.remove(os.path.join(\"dataTMP\", \"brightkite\", file))\n",
"\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", \"gowalla\")):\n",
" # if file.endswith(\"_totalCheckins.txt\"):\n",
" # df = pd.read_csv(os.path.join(\"dataTMP\", \"gowalla\", file), sep=\"\\t\", header=None, names=[\"user_id\", \"check-in time\", \"latitude\", \"longitude\", \"venue_id\"])\n",
" # df[\"check-in time\"] = pd.to_datetime(df[\"check-in time\"])\n",
" # df = df[df[\"check-in time\"].dt.year == 2010]\n",
" # df = df.drop([\"check-in time\", \"latitude\", \"longitude\"], axis=1)\n",
" # df.to_csv(os.path.join(\"dataTMP\", \"gowalla\", \"gowalla_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n",
" # os.remove(os.path.join(\"dataTMP\", \"gowalla\", file))\n",
"\n",
" # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\")):\n",
" # if file.endswith(\"dataTMPset_WWW_Checkins_anonymized.txt\"):\n",
" # df = pd.read_csv(os.path.join(\"dataTMP\", \"foursquare\", file), sep=\"\\t\", header=None)\n",
" # df = df[[0, 1]]\n",
" # df.to_csv(os.path.join(\"dataTMP\", \"foursquare\", \"foursquare_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n",
" # os.remove(os.path.join(\"dataTMP\", \"foursquare\", file))\n"
]
}
],