small fixes

main
Luca Lombardo 2 years ago
parent 1ad3590acb
commit 303ad5ea0d

@@ -32,7 +32,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# import the graphs from the saved files\n", "# import the graphs from the saved files. NOT TO BE INCLUDED IN THE FINAL NOTEBOOK\n",
"G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n", "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
"G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n", "G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n",
"G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n", "G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n",
@@ -48,7 +48,7 @@
"\n", "\n",
"graphs_all = checkins_graphs + friendships_graph\n", "graphs_all = checkins_graphs + friendships_graph\n",
"\n", "\n",
"analysis_results = pd.read_pickle('analysis_results_acc.pkl')\n" "analysis_results = pd.read_pickle('analysis_results.pkl')\n"
] ]
}, },
{ {
@@ -70,7 +70,7 @@
"\n", "\n",
"Among the types of centrality that have been considered in the literature, many have to do with distances between nodes. Take, for instance, a node in an undirected connected network: if the sum of distances to all other nodes is large, the node under consideration is peripheral; this is the starting point to define Bavelas's closeness centrality \\cite{closeness}, which is the reciprocal of peripherality (i.e., the reciprocal of the sum of distances to all other nodes). \n", "Among the types of centrality that have been considered in the literature, many have to do with distances between nodes. Take, for instance, a node in an undirected connected network: if the sum of distances to all other nodes is large, the node under consideration is peripheral; this is the starting point to define Bavelas's closeness centrality \\cite{closeness}, which is the reciprocal of peripherality (i.e., the reciprocal of the sum of distances to all other nodes). \n",
"\n", "\n",
"The role played by shortest paths is justified by one of the most well-known features of complex networks, the so-called small-world phenomenon. A small-world network is a graph where the average distance between nodes is logarithmic in the size of the network, whereas the clustering coefficient is larger (that is, neighborhoods tend to be denser) than in a random Erdős-Rényi graph with the same size and average distance. The fact that social networks (whether electronically mediated or not) exhibit the small-world property is known at least since Milgram's famous experiment \\cite{} and is arguably the most popular of all features of complex networks. For instance, the average distance of the Facebook graph was recently established to be just $4.74$.\n" "The role played by shortest paths is justified by one of the most well-known features of complex networks, the so-called small-world phenomenon. A small-world network is a graph where the average distance between nodes is logarithmic in the size of the network, whereas the clustering coefficient is larger (that is, neighborhoods tend to be denser) than in a random Erdős-Rényi graph with the same size and average distance. The fact that social networks (whether electronically mediated or not) exhibit the small-world property is known at least since Milgram's famous experiment and is arguably the most popular of all features of complex networks. For instance, the average distance of the Facebook graph was recently established to be just $4.74$.\n"
] ]
}, },
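For reference, Bavelas's definition is easy to check by hand. A minimal sketch (not part of this commit; the toy path graph is purely illustrative) computes closeness as the reciprocal of the sum of distances and compares it against networkx's `closeness_centrality`, which returns the same quantity multiplied by $n-1$:

```python
import networkx as nx

# Toy check of Bavelas's closeness: the reciprocal of the "peripherality",
# i.e. of the sum of distances to all other nodes. networkx's
# closeness_centrality returns (n - 1) / sum-of-distances instead.
G = nx.path_graph(5)
n = G.number_of_nodes()
for v in G.nodes():
    peripherality = sum(nx.shortest_path_length(G, source=v).values())
    assert abs(1 / peripherality - nx.closeness_centrality(G, v) / (n - 1)) < 1e-12
```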
{ {
@@ -473,8 +473,7 @@
"\n", "\n",
"The degree distribution, $P(k)$, is the fraction of sites having degree $k$. We know from the literature that many real networks do not exhibit a Poisson degree distribution, as predicted in the ER model. In fact, many of them exhibit a distribution with a long, power-law, tail, $P(k) \\sim k^{-\\gamma}$ with some $γ$, usually between $2$ and 3$.\n", "The degree distribution, $P(k)$, is the fraction of sites having degree $k$. We know from the literature that many real networks do not exhibit a Poisson degree distribution, as predicted in the ER model. In fact, many of them exhibit a distribution with a long, power-law, tail, $P(k) \\sim k^{-\\gamma}$ with some $γ$, usually between $2$ and 3$.\n",
"\n", "\n",
"For know, we will just compute the average degree of our networks and add it to the dataframe.\n", "For know, we will just compute the average degree of our networks and add it to the dataframe."
"\n"
] ]
}, },
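As a visual companion to the paragraph above, here is a small self-contained sketch (not from the repository) that plots an empirical degree distribution on log-log axes; a Barabási-Albert graph stands in for a network with a power-law tail, where $P(k) \sim k^{-\gamma}$ shows up as a roughly straight line:

```python
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

# Stand-in for a heavy-tailed real network: a Barabasi-Albert graph,
# whose degree distribution has a power-law tail (gamma = 3).
G = nx.barabasi_albert_graph(10_000, 3)

degrees = np.array([d for _, d in G.degree()])
k, counts = np.unique(degrees, return_counts=True)
Pk = counts / counts.sum()  # empirical P(k): fraction of nodes with degree k

plt.loglog(k, Pk, "o", markersize=3)
plt.xlabel("$k$")
plt.ylabel("$P(k)$")
plt.show()

print("average degree:", degrees.mean())  # close to 2m = 6 for this model
```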
{ {
@@ -981,7 +980,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"for G in checkins_graphs:\n", "for G in checkins_graphs:\n",
" degree_distribution(G)" " degree_distribution(G, log=True)"
] ]
}, },
{ {
@@ -1020,7 +1019,7 @@
" print(G.name)\n", " print(G.name)\n",
" print(\"Number of nodes: \", G.number_of_nodes())\n", " print(\"Number of nodes: \", G.number_of_nodes())\n",
" print(\"Number of edges: \", G.number_of_edges())\n", " print(\"Number of edges: \", G.number_of_edges())\n",
" degree_distribution(G)" " degree_distribution(G, log=False)"
] ]
}, },
{ {
@@ -1035,7 +1034,7 @@
" print(G.name)\n", " print(G.name)\n",
" print(\"Number of nodes: \", G.number_of_nodes())\n", " print(\"Number of nodes: \", G.number_of_nodes())\n",
" print(\"Number of edges: \", G.number_of_edges())\n", " print(\"Number of edges: \", G.number_of_edges())\n",
" degree_distribution(G)" " degree_distribution(G, log=False)"
] ]
}, },
{ {
@@ -1051,13 +1050,17 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"The degree distribution alone is not enough to characterize the network. There are many other quantities, such as the degree-degree correlation (between connected nodes), the spatial correlations, the clustering coefficient, the betweenness or central-ity distribution, and the self-similarity exponents." "The degree distribution alone is not enough to characterize the network. There are many other quantities, such as the degree-degree correlation (between connected nodes), the spatial correlations, the clustering coefficient, the betweenness or central-ity distribution, and the self-similarity exponents.\n",
"\n",
"--- \n",
"\n",
"Now let's try to compute the same analysis made before for this random models"
] ]
} }
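Two of the quantities mentioned above are available directly in networkx; a minimal sketch (not part of this commit, with an arbitrary Watts-Strogatz graph as input):

```python
import networkx as nx

G = nx.watts_strogatz_graph(1000, 6, 0.1)

# degree-degree correlation between connected nodes:
# positive = assortative mixing, negative = disassortative
print("degree assortativity:", nx.degree_assortativity_coefficient(G))
# average of the local clustering coefficients over all nodes
print("average clustering:  ", nx.average_clustering(G))
```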
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3.10.6 64-bit", "display_name": "Python 3.10.8 64-bit",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@@ -1071,12 +1074,12 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]" "version": "3.10.8"
}, },
"orig_nbformat": 4, "orig_nbformat": 4,
"vscode": { "vscode": {
"interpreter": { "interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
} }
} }
}, },

@@ -28,170 +28,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Graph</th>\n",
" <th>Number of Nodes</th>\n",
" <th>Number of Edges</th>\n",
" <th>Average Degree</th>\n",
" <th>Average Clustering Coefficient</th>\n",
" <th>log N</th>\n",
" <th>Average Shortest Path Length</th>\n",
" <th>betweenness centrality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Brightkite Checkins Graph</td>\n",
" <td>7191</td>\n",
" <td>3663807</td>\n",
" <td>1018.997914</td>\n",
" <td>0.702854</td>\n",
" <td>8.880586</td>\n",
" <td>2.411011</td>\n",
" <td>0.00022</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Gowalla Checkins Graph</td>\n",
" <td>10702</td>\n",
" <td>303104</td>\n",
" <td>56.644366</td>\n",
" <td>0.505597</td>\n",
" <td>9.278186</td>\n",
" <td>5.222903</td>\n",
" <td>0.000301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Foursquare EU Checkins Graph</td>\n",
" <td>20282</td>\n",
" <td>7430376</td>\n",
" <td>732.706439</td>\n",
" <td>0.597097</td>\n",
" <td>9.917489</td>\n",
" <td>2.2843</td>\n",
" <td>0.000089</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Foursquare IT Checkins Graph</td>\n",
" <td>3730</td>\n",
" <td>629749</td>\n",
" <td>337.667024</td>\n",
" <td>0.683565</td>\n",
" <td>8.224164</td>\n",
" <td>2.185477</td>\n",
" <td>0.000428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Brightkite Friendship Graph</td>\n",
" <td>5928</td>\n",
" <td>34673</td>\n",
" <td>11.698043</td>\n",
" <td>0.219749</td>\n",
" <td>8.687442</td>\n",
" <td>5.052162</td>\n",
" <td>0.000448</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>(Filtered) Gowalla Friendship Graph</td>\n",
" <td>8396</td>\n",
" <td>29122</td>\n",
" <td>6.937113</td>\n",
" <td>0.217544</td>\n",
" <td>9.035511</td>\n",
" <td>4.558532</td>\n",
" <td>0.000357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Foursquare IT Friendship Graph</td>\n",
" <td>2073</td>\n",
" <td>6217</td>\n",
" <td>5.99807</td>\n",
" <td>0.148489</td>\n",
" <td>7.636752</td>\n",
" <td>19.530752</td>\n",
" <td>0.000879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Foursquare EU Friendship Graph</td>\n",
" <td>16491</td>\n",
" <td>59419</td>\n",
" <td>7.206234</td>\n",
" <td>0.167946</td>\n",
" <td>9.710570</td>\n",
" <td>23.713864</td>\n",
" <td>0.000272</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Graph Number of Nodes Number of Edges \\\n",
"0 Brightkite Checkins Graph 7191 3663807 \n",
"1 Gowalla Checkins Graph 10702 303104 \n",
"2 Foursquare EU Checkins Graph 20282 7430376 \n",
"3 Foursquare IT Checkins Graph 3730 629749 \n",
"4 Brightkite Friendship Graph 5928 34673 \n",
"5 (Filtered) Gowalla Friendship Graph 8396 29122 \n",
"6 Foursquare IT Friendship Graph 2073 6217 \n",
"7 Foursquare EU Friendship Graph 16491 59419 \n",
"\n",
" Average Degree Average Clustering Coefficient log N \\\n",
"0 1018.997914 0.702854 8.880586 \n",
"1 56.644366 0.505597 9.278186 \n",
"2 732.706439 0.597097 9.917489 \n",
"3 337.667024 0.683565 8.224164 \n",
"4 11.698043 0.219749 8.687442 \n",
"5 6.937113 0.217544 9.035511 \n",
"6 5.99807 0.148489 7.636752 \n",
"7 7.206234 0.167946 9.710570 \n",
"\n",
" Average Shortest Path Length betweenness centrality \n",
"0 2.411011 0.00022 \n",
"1 5.222903 0.000301 \n",
"2 2.2843 0.000089 \n",
"3 2.185477 0.000428 \n",
"4 5.052162 0.000448 \n",
"5 4.558532 0.000357 \n",
"6 19.530752 0.000879 \n",
"7 23.713864 0.000272 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# import the graphs from the saved files\n", "# import the graphs from the saved files\n",
"G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n", "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
@@ -204,9 +43,9 @@
"G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n", "G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n",
"G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n", "G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n",
"\n", "\n",
"# open the dataframe object\n", "# # open the dataframe object\n",
"analysis_results = pd.read_pickle('analysis_results.pkl')\n", "# analysis_results = pd.read_pickle('analysis_results.pkl')\n",
"analysis_results" "# analysis_results"
] ]
}, },
{ {
@@ -216,11 +55,286 @@
"source": [ "source": [
"The first thing that we want to do is very simple, create a random reference for each graph" "The first thing that we want to do is very simple, create a random reference for each graph"
] ]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n",
"friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n",
"\n",
"graphs_all = checkins_graphs + friendships_graph"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Original Graphs\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for graph in graphs_all:\n",
" # add basic graph statistics\n",
" analysis_results = analysis_results.append(\n",
" {'Graph': graph.name, \n",
" 'Number of Nodes': graph.number_of_nodes(), \n",
" 'log N': np.log(graph.number_of_nodes()),\n",
" 'Number of Edges': graph.number_of_edges()}, \n",
" ignore_index=True)\n",
"\n",
" # add average degree\n",
" print(\"Computing average degree for graph: \", graph.name)\n",
" avg_deg = np.mean([d for n, d in graph.degree()])\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Degree'] = avg_deg\n",
"\n",
" # add average clustering coefficient\n",
" print(\"Computing average clustering coefficient for graph: \", graph.name)\n",
" avg_clustering = nx.average_clustering(graph)\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # add average shortest path length\n",
" print(\"Computing average shortest path length for graph: \", graph.name)\n",
" average_shortest_path_length = average_shortest_path(graph)\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # add betweenness centrality\n",
" print(\"Computing betweenness centrality for graph: \", graph.name)\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6).values()))\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'betweenness centrality'] = betweenness_centrality\n",
" print()\n",
"\n",
"\n",
"analysis_results\n",
"analysis_results.to_pickle('analysis_results.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random shit"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'name'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 6\u001b[0m G \u001b[38;5;241m=\u001b[39m create_random_graphs(graph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merods\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# add the basic information to the dataframe\u001b[39;00m\n\u001b[1;32m 9\u001b[0m analysis_results_erods \u001b[38;5;241m=\u001b[39m analysis_results_erods\u001b[38;5;241m.\u001b[39mappend({\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mG\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Nodes\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_nodes(),\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Edges\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_edges(),\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlog N\u001b[39m\u001b[38;5;124m'\u001b[39m: np\u001b[38;5;241m.\u001b[39mlog(G\u001b[38;5;241m.\u001b[39mnumber_of_nodes())\n\u001b[1;32m 14\u001b[0m }, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# compute the average degree and add it to the dataframe\u001b[39;00m\n\u001b[1;32m 17\u001b[0m avg_deg \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmean([d \u001b[38;5;28;01mfor\u001b[39;00m n, d \u001b[38;5;129;01min\u001b[39;00m G\u001b[38;5;241m.\u001b[39mdegree()])\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'name'"
]
}
],
"source": [
"analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"# read all the graphs gpickle files in the data/random/erdos folder. Then run the same analysis as before for this graphs\n",
"\n",
"for graph in graphs_all:\n",
" G = create_random_graphs(graph, \"erods\")\n",
"\n",
" # add the basic information to the dataframe\n",
" analysis_results_erods = analysis_results_erods.append({\n",
" 'Graph': G.name,\n",
" 'Number of Nodes': G.number_of_nodes(),\n",
" 'Number of Edges': G.number_of_edges(),\n",
" 'log N': np.log(G.number_of_nodes())\n",
" }, ignore_index=True)\n",
"\n",
" # compute the average degree and add it to the dataframe\n",
" avg_deg = np.mean([d for n, d in G.degree()])\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
"\n",
" # compute the average clustering coefficient and add it to the dataframe\n",
" avg_clustering = nx.average_clustering(G)\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # compute the average shortest path length and add it to the dataframe\n",
" average_shortest_path_length = average_shortest_path(G)\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # compute the betweenness centrality and add it to the dataframe\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
"\n",
" # save memory\n",
" del G\n",
"\n",
"analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n",
"analysis_results_erods\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\tNumber of edges in the original graph: 3663807\n",
"\tNumber of edges in the random graph: 3660219\n"
]
},
{
"ename": "UnboundLocalError",
"evalue": "local variable 'G_copy' referenced before assignment",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Clustering Coefficient\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m avg_clustering\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# compute the average shortest path length and add it to the dataframe\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m average_shortest_path_length \u001b[38;5;241m=\u001b[39m \u001b[43maverage_shortest_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Shortest Path Length\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m average_shortest_path_length\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# compute the betweenness centrality and add it to the dataframe\u001b[39;00m\n",
"File \u001b[0;32m~/github/small-worlds/utils.py:497\u001b[0m, in \u001b[0;36maverage_shortest_path\u001b[0;34m(G, k)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m\\t\u001b[39;00m\u001b[39mNumber of edges after removing \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m% o\u001b[39;00m\u001b[39mf nodes: \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m.\u001b[39mformat((k)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m, G_copy\u001b[39m.\u001b[39mnumber_of_edges()))\n\u001b[1;32m 496\u001b[0m tmp \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m--> 497\u001b[0m connected_components \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(nx\u001b[39m.\u001b[39mconnected_components(G_copy))\n\u001b[1;32m 498\u001b[0m \u001b[39m# remove all the connected components with less than 10 nodes\u001b[39;00m\n\u001b[1;32m 499\u001b[0m connected_components \u001b[39m=\u001b[39m [c \u001b[39mfor\u001b[39;00m c \u001b[39min\u001b[39;00m connected_components \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(c) \u001b[39m>\u001b[39m \u001b[39m10\u001b[39m]\n",
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'G_copy' referenced before assignment"
]
}
],
"source": [
"# do the same with the watts strogatz graphs\n",
"\n",
"analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"for graph in graphs_all:\n",
" G = create_random_graphs(graph, 'watts_strogatz', save=False)\n",
"\n",
" # add the basic information to the dataframe\n",
" analysis_results_ws = analysis_results_ws.append({\n",
" 'Graph': G.name,\n",
" 'Number of Nodes': G.number_of_nodes(),\n",
" 'Number of Edges': G.number_of_edges(),\n",
" 'log N': np.log(G.number_of_nodes())\n",
" }, ignore_index=True)\n",
"\n",
" # compute the average degree and add it to the dataframe\n",
" avg_deg = np.mean([d for n, d in G.degree()])\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
"\n",
" # compute the average clustering coefficient and add it to the dataframe\n",
" avg_clustering = nx.average_clustering(G)\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # compute the average shortest path length and add it to the dataframe\n",
" average_shortest_path_length = average_shortest_path(G)\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # compute the betweenness centrality and add it to the dataframe\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
"\n",
" # save memory\n",
" del G\n",
"\n",
"analysis_results_ws.to_pickle('analysis_results_ws.pkl')\n",
"analysis_results_ws"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G = nx.watts_strogatz_graph(1000, 4, 0.1)\n",
"adj = nx.to_scipy_sparse_array(G)\n",
"# print info about the graph and the matrix\n",
"print(\"Number of nodes: \", G.number_of_nodes())\n",
"print(\"Number of edges: \", G.number_of_edges())\n",
"print(\"Average degree: \", np.mean([d for n, d in G.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import scipy.sparse as sp\n",
"\n",
"# randomly swap edges, but keep the degree of each node the same (i.e. the degree sequence is preserved)\n",
"def random_swap_edges(adj, nswap=1, max_tries=100):\n",
" # use numpy and scipy to speed up the process\n",
" adj = sp.csr_matrix(adj)\n",
" n, m = adj.shape \n",
" assert n == m # make sure the adjacency matrix is square\n",
" adj_triu = sp.triu(adj) # only consider the upper triangular part of the adjacency matrix\n",
" adj_tuple = sp.find(adj_triu) # get the indices and values of the non-zero elements\n",
" adj_edges = np.array(list(zip(adj_tuple[0], adj_tuple[1]))) # get the edges\n",
" adj_data = adj_tuple[2] # get the edge weights\n",
" nnz = adj_edges.shape[0] # number of non-zero elements\n",
" assert nnz == adj_data.shape[0] # make sure the number of edges and edge weights are the same\n",
" for _ in range(nswap): # repeat nswap times\n",
" # choose random edges to swap\n",
" edge_idx = np.random.choice(nnz, size=2, replace=False) # choose two random edges\n",
" edge1 = adj_edges[edge_idx[0]] # get the first edge\n",
" edge2 = adj_edges[edge_idx[1]] # get the second edge\n",
" # make sure the edges are not self-loops and not already connected\n",
" if edge1[0] == edge2[0] or edge1[0] == edge2[1] or edge1[1] == edge2[0] or edge1[1] == edge2[1] or adj[edge1[0], edge2[1]] or adj[edge2[0], edge1[1]]: \n",
" continue # if the edges are self-loops or already connected, try again\n",
" # swap the edges\n",
" adj[edge1[0], edge1[1]] = 0 \n",
" adj[edge2[0], edge2[1]] = 0 \n",
" adj[edge1[0], edge2[1]] = 1\n",
" adj[edge2[0], edge1[1]] = 1\n",
" # update adj_edges and adj_data\n",
" adj_edges[edge_idx[0]] = [edge1[0], edge2[1]]\n",
" adj_edges[edge_idx[1]] = [edge2[0], edge1[1]]\n",
" adj_data[edge_idx[0]] = 1\n",
" adj_data[edge_idx[1]] = 1\n",
" return adj\n",
"\n",
"adj_swapped = random_swap_edges(adj, nswap=1)"
]
},
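As a side note on the cell above: networkx already ships a degree-preserving rewiring routine, `double_edge_swap`, which performs the same operation directly on the Graph object. A quick sanity check (separate from the notebook) that the degree sequence survives the rewiring:

```python
import networkx as nx

G_check = nx.watts_strogatz_graph(1000, 4, 0.1)
degrees_before = sorted(d for _, d in G_check.degree())

# double_edge_swap rewires in place: edges (u, v), (x, y) become (u, x), (v, y),
# retrying until nswap swaps succeed or max_tries attempts are exhausted
nx.double_edge_swap(G_check, nswap=100, max_tries=1000)

degrees_after = sorted(d for _, d in G_check.degree())
assert degrees_before == degrees_after  # the degree sequence is preserved
```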
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create a new graph from the swapped adjacency matrix\n",
"G_swapped = nx.from_scipy_sparse_matrix(adj_swapped)\n",
"# print info about the graph and the matrix\n",
"print(\"Number of nodes: \", G_swapped.number_of_nodes())\n",
"print(\"Number of edges: \", G_swapped.number_of_edges())\n",
"print(\"Average degree: \", np.mean([d for n, d in G_swapped.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G_swapped))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G_swapped))"
]
} }
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3.10.8 64-bit",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@@ -234,12 +348,12 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.10.8"
}, },
"orig_nbformat": 4, "orig_nbformat": 4,
"vscode": { "vscode": {
"interpreter": { "interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
} }
} }
}, },

@@ -430,6 +430,9 @@ def betweenness_centrality_parallel(G, processes=None, k =None) -> dict:
print("\tNumber of nodes after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_nodes())) print("\tNumber of nodes after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_nodes()))
print("\tNumber of edges after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_edges())) print("\tNumber of edges after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_edges()))
if k is None:
G_copy = G.copy()
p = Pool(processes=processes) p = Pool(processes=processes)
node_divisor = len(p._pool) * 4 node_divisor = len(p._pool) * 4
node_chunks = list(chunks(G_copy.nodes(), G_copy.order() // node_divisor)) node_chunks = list(chunks(G_copy.nodes(), G_copy.order() // node_divisor))
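For context, the chunking lines visible in this hunk follow the parallel-betweenness recipe from the networkx examples gallery. A self-contained sketch of that recipe (an assumption about what utils.py does around these lines; the actual function also handles the `k` node-sampling parameter) would look roughly like this:

```python
import itertools
from multiprocessing import Pool

import networkx as nx

def chunks(l, n):
    # divide the node list l into tuples of length n
    l_c = iter(l)
    while True:
        x = tuple(itertools.islice(l_c, n))
        if not x:
            return
        yield x

def betweenness_centrality_parallel(G, processes=None):
    p = Pool(processes=processes)
    node_divisor = len(p._pool) * 4
    node_chunks = list(chunks(G.nodes(), G.order() // node_divisor))
    num_chunks = len(node_chunks)
    # each worker accumulates betweenness over shortest paths
    # rooted at its own chunk of source nodes
    bt_sc = p.starmap(
        nx.betweenness_centrality_subset,
        zip([G] * num_chunks, node_chunks,
            [list(G)] * num_chunks, [True] * num_chunks, [None] * num_chunks),
    )
    # reduce step: sum the partial betweenness dictionaries
    bt_c = bt_sc[0]
    for bt in bt_sc[1:]:
        for n in bt:
            bt_c[n] += bt[n]
    return bt_c
```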
@@ -481,6 +484,7 @@ def average_shortest_path(G: nx.Graph, k=None) -> float:
if k is not None and (k < 0 or k > 1): if k is not None and (k < 0 or k > 1):
raise ValueError("k must be between 0 and 1") raise ValueError("k must be between 0 and 1")
elif k is None: elif k is None:
G = G.copy()
connected_components = list(nx.connected_components(G)) connected_components = list(nx.connected_components(G))
else: else:
G_copy = G.copy() G_copy = G.copy()
@@ -567,7 +571,7 @@ def create_random_graphs(G: nx.Graph, model = None, save = True) -> nx.Graph:
G_random = nx.erdos_renyi_graph(G.number_of_nodes(), nx.density(G)) G_random = nx.erdos_renyi_graph(G.number_of_nodes(), nx.density(G))
print("\tNumber of edges in the original graph: {}" .format(G.number_of_edges())) print("\tNumber of edges in the original graph: {}" .format(G.number_of_edges()))
print("\tNumber of edges in the random graph: {}" .format(G_random.number_of_edges())) print("\tNumber of edges in the random graph: {}" .format(G_random.number_of_edges()))
G_random.name = G.name + " erdos" G_random.name = G.name + " Erdos-Renyi"
if save: if save:
# check if the folder exists, otherwise create it # check if the folder exists, otherwise create it
@@ -585,7 +589,7 @@ def create_random_graphs(G: nx.Graph, model = None, save = True) -> nx.Graph:
G_random = nx.watts_strogatz_graph(G.number_of_nodes(), avg_degree, p) G_random = nx.watts_strogatz_graph(G.number_of_nodes(), avg_degree, p)
print("\tNumber of edges in the original graph: {}" .format(G.number_of_edges())) print("\tNumber of edges in the original graph: {}" .format(G.number_of_edges()))
print("\tNumber of edges in the random graph: {}" .format(G_random.number_of_edges())) print("\tNumber of edges in the random graph: {}" .format(G_random.number_of_edges()))
G_random.name = G.name + " watts_strogatz" G_random.name = G.name + " Watts-Strogatz"
if save: if save:
# check if the folder exists, otherwise create it # check if the folder exists, otherwise create it
