small fixes

2 years ago · 303ad5ea0d
parent 1ad3590acb
commit 303ad5ea0d
3 changed files with 304 additions and 183 deletions
--- a/main.ipynb
+++ b/main.ipynb
@ -32,7 +32,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# import the graphs from the saved files\n",
+    "# import the graphs from the saved files. NOT TO BE INCLUDED IN THE FINAL NOTEBOOK\n",
    "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
    "G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n",
    "G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n",
@ -48,7 +48,7 @@
    "\n",
    "graphs_all = checkins_graphs + friendships_graph\n",
    "\n",
-    "analysis_results = pd.read_pickle('analysis_results_acc.pkl')\n"
+    "analysis_results = pd.read_pickle('analysis_results.pkl')\n"
   ]
  },
  {
@ -70,7 +70,7 @@
    "\n",
    "Among the types of centrality that have been considered in the literature, many have to do with distances between nodes. Take, for instance, a node in an undirected connected network: if the sum of distances to all other nodes is large, the node under consideration is peripheral; this is the starting point to define Bavelas's closeness centrality \\cite{closeness}, which is the reciprocal of peripherality (i.e., the reciprocal of the sum of distances to all other nodes). \n",
    "\n",
-    "The role played by shortest paths is justified by one of the most well-known features of complex networks, the so-called small-world phenomenon. A small-world network is a graph where the average distance between nodes is logarithmic in the size of the network, whereas the clustering coefficient is larger (that is, neighborhoods tend to be denser) than in a random Erdős-Rényi graph with the same size and average distance. The fact that social networks (whether electronically mediated or not) exhibit the small-world property is known at least since Milgram's famous experiment \\cite{} and is arguably the most popular of all features of complex networks. For instance, the average distance of the Facebook graph was recently established to be just $4.74$.\n"
+    "The role played by shortest paths is justified by one of the most well-known features of complex networks, the so-called small-world phenomenon. A small-world network is a graph where the average distance between nodes is logarithmic in the size of the network, whereas the clustering coefficient is larger (that is, neighborhoods tend to be denser) than in a random Erdős-Rényi graph with the same size and average distance. The fact that social networks (whether electronically mediated or not) exhibit the small-world property has been known at least since Milgram's famous experiment and is arguably the most popular of all features of complex networks. For instance, the average distance of the Facebook graph was recently established to be just $4.74$.\n"
   ]
  },
  {
@ -473,8 +473,7 @@
    "\n",
    "The degree distribution, $P(k)$, is the fraction of sites having degree $k$. We know from the literature that many real networks do not exhibit a Poisson degree distribution, as predicted in the ER model. In fact, many of them exhibit a distribution with a long, power-law, tail, $P(k) \\sim k^{-\\gamma}$ with some $\\gamma$, usually between $2$ and $3$.\n",
    "\n",
-    "For know, we will just compute the average degree of our networks and add it to the dataframe.\n",
-    "\n"
+    "For now, we will just compute the average degree of our networks and add it to the dataframe."
   ]
  },
  {
@ -981,7 +980,7 @@
   "outputs": [],
   "source": [
    "for G in checkins_graphs:\n",
-    "    degree_distribution(G)"
+    "    degree_distribution(G, log=True)"
   ]
  },
  {
@ -1020,7 +1019,7 @@
    "    print(G.name)\n",
    "    print(\"Number of nodes: \", G.number_of_nodes())\n",
    "    print(\"Number of edges: \", G.number_of_edges())\n",
-    "    degree_distribution(G)"
+    "    degree_distribution(G, log=False)"
   ]
  },
  {
@ -1035,7 +1034,7 @@
    "    print(G.name)\n",
    "    print(\"Number of nodes: \", G.number_of_nodes())\n",
    "    print(\"Number of edges: \", G.number_of_edges())\n",
-    "    degree_distribution(G)"
+    "    degree_distribution(G, log=False)"
   ]
  },
  {
@ -1051,13 +1050,17 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The degree distribution alone is not enough to characterize the network. There are many other quantities, such as the degree-degree correlation (between connected nodes), the spatial correlations, the clustering coefficient, the betweenness or central-ity distribution, and the self-similarity exponents."
+    "The degree distribution alone is not enough to characterize the network. There are many other quantities, such as the degree-degree correlation (between connected nodes), the spatial correlations, the clustering coefficient, the betweenness or centrality distribution, and the self-similarity exponents.\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "Now let's try to carry out the same analysis as before for these random models"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3.10.6 64-bit",
+   "display_name": "Python 3.10.8 64-bit",
   "language": "python",
   "name": "python3"
  },
@ -1071,12 +1074,12 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]"
+   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
-    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
--- a/testing.ipynb
+++ b/testing.ipynb
@ -28,170 +28,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Graph</th>\n",
-       "      <th>Number of Nodes</th>\n",
-       "      <th>Number of Edges</th>\n",
-       "      <th>Average Degree</th>\n",
-       "      <th>Average Clustering Coefficient</th>\n",
-       "      <th>log N</th>\n",
-       "      <th>Average Shortest Path Length</th>\n",
-       "      <th>betweenness centrality</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Brightkite Checkins Graph</td>\n",
-       "      <td>7191</td>\n",
-       "      <td>3663807</td>\n",
-       "      <td>1018.997914</td>\n",
-       "      <td>0.702854</td>\n",
-       "      <td>8.880586</td>\n",
-       "      <td>2.411011</td>\n",
-       "      <td>0.00022</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Gowalla Checkins Graph</td>\n",
-       "      <td>10702</td>\n",
-       "      <td>303104</td>\n",
-       "      <td>56.644366</td>\n",
-       "      <td>0.505597</td>\n",
-       "      <td>9.278186</td>\n",
-       "      <td>5.222903</td>\n",
-       "      <td>0.000301</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Foursquare EU Checkins Graph</td>\n",
-       "      <td>20282</td>\n",
-       "      <td>7430376</td>\n",
-       "      <td>732.706439</td>\n",
-       "      <td>0.597097</td>\n",
-       "      <td>9.917489</td>\n",
-       "      <td>2.2843</td>\n",
-       "      <td>0.000089</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Foursquare IT Checkins Graph</td>\n",
-       "      <td>3730</td>\n",
-       "      <td>629749</td>\n",
-       "      <td>337.667024</td>\n",
-       "      <td>0.683565</td>\n",
-       "      <td>8.224164</td>\n",
-       "      <td>2.185477</td>\n",
-       "      <td>0.000428</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Brightkite Friendship Graph</td>\n",
-       "      <td>5928</td>\n",
-       "      <td>34673</td>\n",
-       "      <td>11.698043</td>\n",
-       "      <td>0.219749</td>\n",
-       "      <td>8.687442</td>\n",
-       "      <td>5.052162</td>\n",
-       "      <td>0.000448</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>(Filtered) Gowalla Friendship Graph</td>\n",
-       "      <td>8396</td>\n",
-       "      <td>29122</td>\n",
-       "      <td>6.937113</td>\n",
-       "      <td>0.217544</td>\n",
-       "      <td>9.035511</td>\n",
-       "      <td>4.558532</td>\n",
-       "      <td>0.000357</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>Foursquare IT Friendship Graph</td>\n",
-       "      <td>2073</td>\n",
-       "      <td>6217</td>\n",
-       "      <td>5.99807</td>\n",
-       "      <td>0.148489</td>\n",
-       "      <td>7.636752</td>\n",
-       "      <td>19.530752</td>\n",
-       "      <td>0.000879</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>Foursquare EU Friendship Graph</td>\n",
-       "      <td>16491</td>\n",
-       "      <td>59419</td>\n",
-       "      <td>7.206234</td>\n",
-       "      <td>0.167946</td>\n",
-       "      <td>9.710570</td>\n",
-       "      <td>23.713864</td>\n",
-       "      <td>0.000272</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                 Graph Number of Nodes Number of Edges  \\\n",
-       "0            Brightkite Checkins Graph            7191         3663807   \n",
-       "1               Gowalla Checkins Graph           10702          303104   \n",
-       "2         Foursquare EU Checkins Graph           20282         7430376   \n",
-       "3         Foursquare IT Checkins Graph            3730          629749   \n",
-       "4          Brightkite Friendship Graph            5928           34673   \n",
-       "5  (Filtered) Gowalla Friendship Graph            8396           29122   \n",
-       "6       Foursquare IT Friendship Graph            2073            6217   \n",
-       "7       Foursquare EU Friendship Graph           16491           59419   \n",
-       "\n",
-       "  Average Degree Average Clustering Coefficient     log N  \\\n",
-       "0    1018.997914                       0.702854  8.880586   \n",
-       "1      56.644366                       0.505597  9.278186   \n",
-       "2     732.706439                       0.597097  9.917489   \n",
-       "3     337.667024                       0.683565  8.224164   \n",
-       "4      11.698043                       0.219749  8.687442   \n",
-       "5       6.937113                       0.217544  9.035511   \n",
-       "6        5.99807                       0.148489  7.636752   \n",
-       "7       7.206234                       0.167946  9.710570   \n",
-       "\n",
-       "  Average Shortest Path Length betweenness centrality  \n",
-       "0                     2.411011                0.00022  \n",
-       "1                     5.222903               0.000301  \n",
-       "2                       2.2843               0.000089  \n",
-       "3                     2.185477               0.000428  \n",
-       "4                     5.052162               0.000448  \n",
-       "5                     4.558532               0.000357  \n",
-       "6                    19.530752               0.000879  \n",
-       "7                    23.713864               0.000272  "
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# import the graphs from the saved files\n",
    "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
@ -204,9 +43,9 @@
    "G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n",
    "G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n",
    "\n",
-    "# open the dataframe object\n",
-    "analysis_results = pd.read_pickle('analysis_results.pkl')\n",
-    "analysis_results"
+    "# # open the dataframe object\n",
+    "# analysis_results = pd.read_pickle('analysis_results.pkl')\n",
+    "# analysis_results"
   ]
  },
  {
@ -216,11 +55,286 @@
   "source": [
    "The first thing that we want to do is very simple, create a random reference for each graph"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
+    "\n",
+    "checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n",
+    "friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n",
+    "\n",
+    "graphs_all = checkins_graphs + friendships_graph"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Original Graphs\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for graph in graphs_all:\n",
+    "    # add basic graph statistics\n",
+    "    analysis_results = analysis_results.append(\n",
+    "        {'Graph': graph.name, \n",
+    "        'Number of Nodes': graph.number_of_nodes(), \n",
+    "        'log N': np.log(graph.number_of_nodes()),\n",
+    "        'Number of Edges': graph.number_of_edges()}, \n",
+    "        ignore_index=True)\n",
+    "\n",
+    "    # add average degree\n",
+    "    print(\"Computing average degree for graph: \", graph.name)\n",
+    "    avg_deg = np.mean([d for n, d in graph.degree()])\n",
+    "    analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Degree'] = avg_deg\n",
+    "\n",
+    "    # add average clustering coefficient\n",
+    "    print(\"Computing average clustering coefficient for graph: \", graph.name)\n",
+    "    avg_clustering = nx.average_clustering(graph)\n",
+    "    analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'] = avg_clustering\n",
+    "\n",
+    "    # add average shortest path length\n",
+    "    print(\"Computing average shortest path length for graph: \", graph.name)\n",
+    "    average_shortest_path_length = average_shortest_path(graph)\n",
+    "    analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
+    "\n",
+    "    # add betweenness centrality\n",
+    "    print(\"Computing betweenness centrality for graph: \", graph.name)\n",
+    "    betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6).values()))\n",
+    "    analysis_results.loc[analysis_results['Graph'] == graph.name, 'betweenness centrality'] = betweenness_centrality\n",
+    "    print()\n",
+    "\n",
+    "\n",
+    "analysis_results\n",
+    "analysis_results.to_pickle('analysis_results.pkl')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Random Graphs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'NoneType' object has no attribute 'name'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m      6\u001b[0m G \u001b[38;5;241m=\u001b[39m create_random_graphs(graph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merods\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      8\u001b[0m \u001b[38;5;66;03m# add the basic information to the dataframe\u001b[39;00m\n\u001b[1;32m      9\u001b[0m analysis_results_erods \u001b[38;5;241m=\u001b[39m analysis_results_erods\u001b[38;5;241m.\u001b[39mappend({\n\u001b[0;32m---> 10\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mG\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m,\n\u001b[1;32m     11\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Nodes\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_nodes(),\n\u001b[1;32m     12\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Edges\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_edges(),\n\u001b[1;32m     13\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlog N\u001b[39m\u001b[38;5;124m'\u001b[39m: np\u001b[38;5;241m.\u001b[39mlog(G\u001b[38;5;241m.\u001b[39mnumber_of_nodes())\n\u001b[1;32m     14\u001b[0m }, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m     16\u001b[0m \u001b[38;5;66;03m# compute the average degree and add it to the dataframe\u001b[39;00m\n\u001b[1;32m     17\u001b[0m avg_deg \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmean([d \u001b[38;5;28;01mfor\u001b[39;00m n, d \u001b[38;5;129;01min\u001b[39;00m G\u001b[38;5;241m.\u001b[39mdegree()])\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'name'"
+     ]
+    }
+   ],
+   "source": [
+    "analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
+    "\n",
+    "# read all the graph gpickle files in the data/random/erdos folder. Then run the same analysis as before for these graphs\n",
+    "\n",
+    "for graph in graphs_all:\n",
+    "    G = create_random_graphs(graph, \"erods\")\n",
+    "\n",
+    "    # add the basic information to the dataframe\n",
+    "    analysis_results_erods = analysis_results_erods.append({\n",
+    "        'Graph': G.name,\n",
+    "        'Number of Nodes': G.number_of_nodes(),\n",
+    "        'Number of Edges': G.number_of_edges(),\n",
+    "        'log N': np.log(G.number_of_nodes())\n",
+    "    }, ignore_index=True)\n",
+    "\n",
+    "    # compute the average degree and add it to the dataframe\n",
+    "    avg_deg = np.mean([d for n, d in G.degree()])\n",
+    "    analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
+    "\n",
+    "    # compute the average clustering coefficient and add it to the dataframe\n",
+    "    avg_clustering = nx.average_clustering(G)\n",
+    "    analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
+    "\n",
+    "    # compute the average shortest path length and add it to the dataframe\n",
+    "    average_shortest_path_length = average_shortest_path(G)\n",
+    "    analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
+    "\n",
+    "    # compute the betweenness centrality and add it to the dataframe\n",
+    "    betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
+    "    analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
+    "\n",
+    "    # save memory\n",
+    "    del G\n",
+    "\n",
+    "analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n",
+    "analysis_results_erods\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\tNumber of edges in the original graph: 3663807\n",
+      "\tNumber of edges in the random graph: 3660219\n"
+     ]
+    },
+    {
+     "ename": "UnboundLocalError",
+     "evalue": "local variable 'G_copy' referenced before assignment",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mUnboundLocalError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 25\u001b[0m\n\u001b[1;32m     22\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Clustering Coefficient\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m avg_clustering\n\u001b[1;32m     24\u001b[0m \u001b[38;5;66;03m# compute the average shortest path length and add it to the dataframe\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m average_shortest_path_length \u001b[38;5;241m=\u001b[39m \u001b[43maverage_shortest_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     26\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Shortest Path Length\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m average_shortest_path_length\n\u001b[1;32m     28\u001b[0m \u001b[38;5;66;03m# compute the betweenness centrality and add it to the dataframe\u001b[39;00m\n",
+      "File \u001b[0;32m~/github/small-worlds/utils.py:497\u001b[0m, in \u001b[0;36maverage_shortest_path\u001b[0;34m(G, k)\u001b[0m\n\u001b[1;32m    494\u001b[0m     \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m\\t\u001b[39;00m\u001b[39mNumber of edges after removing \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m% o\u001b[39;00m\u001b[39mf nodes: \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m.\u001b[39mformat((k)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m, G_copy\u001b[39m.\u001b[39mnumber_of_edges()))\n\u001b[1;32m    496\u001b[0m tmp \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m--> 497\u001b[0m connected_components \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(nx\u001b[39m.\u001b[39mconnected_components(G_copy))\n\u001b[1;32m    498\u001b[0m \u001b[39m# remove all the connected components with less than 10 nodes\u001b[39;00m\n\u001b[1;32m    499\u001b[0m connected_components \u001b[39m=\u001b[39m [c \u001b[39mfor\u001b[39;00m c \u001b[39min\u001b[39;00m connected_components \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(c) \u001b[39m>\u001b[39m \u001b[39m10\u001b[39m]\n",
+      "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'G_copy' referenced before assignment"
+     ]
+    }
+   ],
+   "source": [
+    "# do the same with the watts strogatz graphs\n",
+    "\n",
+    "analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
+    "\n",
+    "for graph in graphs_all:\n",
+    "    G = create_random_graphs(graph, 'watts_strogatz', save=False)\n",
+    "\n",
+    "    # add the basic information to the dataframe\n",
+    "    analysis_results_ws = analysis_results_ws.append({\n",
+    "        'Graph': G.name,\n",
+    "        'Number of Nodes': G.number_of_nodes(),\n",
+    "        'Number of Edges': G.number_of_edges(),\n",
+    "        'log N': np.log(G.number_of_nodes())\n",
+    "    }, ignore_index=True)\n",
+    "\n",
+    "    # compute the average degree and add it to the dataframe\n",
+    "    avg_deg = np.mean([d for n, d in G.degree()])\n",
+    "    analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
+    "\n",
+    "    # compute the average clustering coefficient and add it to the dataframe\n",
+    "    avg_clustering = nx.average_clustering(G)\n",
+    "    analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
+    "\n",
+    "    # compute the average shortest path length and add it to the dataframe\n",
+    "    average_shortest_path_length = average_shortest_path(G)\n",
+    "    analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
+    "\n",
+    "    # compute the betweenness centrality and add it to the dataframe\n",
+    "    betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
+    "    analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
+    "\n",
+    "    # save memory\n",
+    "    del G\n",
+    "\n",
+    "analysis_results_ws.to_pickle('analysis_results_ws.pkl')\n",
+    "analysis_results_ws"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "G = nx.watts_strogatz_graph(1000, 4, 0.1)\n",
+    "adj = nx.to_scipy_sparse_array(G)\n",
+    "# print info about the graph and the matrix\n",
+    "print(\"Number of nodes: \", G.number_of_nodes())\n",
+    "print(\"Number of edges: \", G.number_of_edges())\n",
+    "print(\"Average degree: \", np.mean([d for n, d in G.degree()]))\n",
+    "print(\"Average clustering coefficient: \", nx.average_clustering(G))\n",
+    "print(\"Average shortest path length: \", nx.average_shortest_path_length(G))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import scipy.sparse as sp\n",
+    "\n",
+    "# randomly swap edges, but keep the degree of each node the same (i.e. the degree sequence is preserved)\n",
+    "def random_swap_edges(adj, nswap=1, max_tries=100):\n",
+    "    # use numpy and scipy to speed up the process\n",
+    "    adj = sp.csr_matrix(adj)\n",
+    "    n, m = adj.shape \n",
+    "    assert n == m # make sure the adjacency matrix is square\n",
+    "    adj_triu = sp.triu(adj) # only consider the upper triangular part of the adjacency matrix\n",
+    "    adj_tuple = sp.find(adj_triu) # get the indices and values of the non-zero elements\n",
+    "    adj_edges = np.array(list(zip(adj_tuple[0], adj_tuple[1]))) # get the edges\n",
+    "    adj_data = adj_tuple[2] # get the edge weights\n",
+    "    nnz = adj_edges.shape[0] # number of non-zero elements\n",
+    "    assert nnz == adj_data.shape[0] # make sure the number of edges and edge weights are the same\n",
+    "    for _ in range(nswap): # repeat nswap times\n",
+    "        # choose random edges to swap\n",
+    "        edge_idx = np.random.choice(nnz, size=2, replace=False) # choose two random edges\n",
+    "        edge1 = adj_edges[edge_idx[0]] # get the first edge\n",
+    "        edge2 = adj_edges[edge_idx[1]] # get the second edge\n",
+    "        # make sure the edges are not self-loops and not already connected\n",
+    "        if edge1[0] == edge2[0] or edge1[0] == edge2[1] or edge1[1] == edge2[0] or edge1[1] == edge2[1] or adj[edge1[0], edge2[1]] or adj[edge2[0], edge1[1]]: \n",
+    "            continue # if the edges are self-loops or already connected, try again\n",
+    "        # swap the edges\n",
+    "        adj[edge1[0], edge1[1]] = 0 \n",
+    "        adj[edge2[0], edge2[1]] = 0     \n",
+    "        adj[edge1[0], edge2[1]] = 1\n",
+    "        adj[edge2[0], edge1[1]] = 1\n",
+    "        # update adj_edges and adj_data\n",
+    "        adj_edges[edge_idx[0]] = [edge1[0], edge2[1]]\n",
+    "        adj_edges[edge_idx[1]] = [edge2[0], edge1[1]]\n",
+    "        adj_data[edge_idx[0]] = 1\n",
+    "        adj_data[edge_idx[1]] = 1\n",
+    "    return adj\n",
+    "\n",
+    "adj_swapped = random_swap_edges(adj, nswap=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a new graph from the swapped adjacency matrix\n",
+    "G_swapped = nx.from_scipy_sparse_matrix(adj_swapped)\n",
+    "# print info about the graph and the matrix\n",
+    "print(\"Number of nodes: \", G_swapped.number_of_nodes())\n",
+    "print(\"Number of edges: \", G_swapped.number_of_edges())\n",
+    "print(\"Average degree: \", np.mean([d for n, d in G_swapped.degree()]))\n",
+    "print(\"Average clustering coefficient: \", nx.average_clustering(G_swapped))\n",
+    "print(\"Average shortest path length: \", nx.average_shortest_path_length(G_swapped))"
+   ]
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3.10.8 64-bit",
   "language": "python",
   "name": "python3"
  },
@ -234,12 +348,12 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
-    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
--- a/utils.py
+++ b/utils.py
@ -430,6 +430,9 @@ def betweenness_centrality_parallel(G, processes=None, k =None) -> dict:
            print("\tNumber of nodes after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_nodes()))
            print("\tNumber of edges after removing {}% of nodes: {}" .format((k)*100, G_copy.number_of_edges()))

+    if k is None:
+        G_copy = G.copy()
+
    p = Pool(processes=processes)
    node_divisor = len(p._pool) * 4
    node_chunks = list(chunks(G_copy.nodes(), G_copy.order() // node_divisor))
@ -481,6 +484,7 @@ def average_shortest_path(G: nx.Graph, k=None) -> float:
        if k is not None and (k < 0 or k > 1):
            raise ValueError("k must be between 0 and 1")
        elif k is None:
+            G = G.copy()
            connected_components = list(nx.connected_components(G))
        else:
            G_copy = G.copy()
@ -567,7 +571,7 @@ def create_random_graphs(G: nx.Graph, model = None, save = True) -> nx.Graph:
        G_random = nx.erdos_renyi_graph(G.number_of_nodes(), nx.density(G))
        print("\tNumber of edges in the original graph: {}" .format(G.number_of_edges()))
        print("\tNumber of edges in the random graph: {}" .format(G_random.number_of_edges()))
-        G_random.name = G.name + " erdos"
+        G_random.name = G.name + " Erdos-Renyi"

        if save:
            # check if the folder exists, otherwise create it
@ -585,7 +589,7 @@ def create_random_graphs(G: nx.Graph, model = None, save = True) -> nx.Graph:
        G_random = nx.watts_strogatz_graph(G.number_of_nodes(), avg_degree, p)
        print("\tNumber of edges in the original graph: {}" .format(G.number_of_edges()))
        print("\tNumber of edges in the random graph: {}" .format(G_random.number_of_edges()))
-        G_random.name = G.name + " watts_strogatz"
+        G_random.name = G.name + " Watts-Strogatz"

        if save:
            # check if the folder exists, otherwise create it