{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"import wget\n",
"import zipfile\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"import plotly.graph_objects as go\n",
"from utils import *\n",
"from collections import Counter\n",
"from tqdm import tqdm\n",
"import time\n",
"\n",
"# ignore warnings\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# import the graphs from the saved files\n",
"G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
"G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n",
"G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n",
"G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n",
"\n",
"G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n",
"G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n",
"G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n",
"G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n",
"\n",
"# # open the dataframe object\n",
"# analysis_results = pd.read_pickle('analysis_results.pkl')\n",
"# analysis_results"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The first thing that we want to do is very simple, create a random reference for each graph"
]
},
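{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`create_random_graphs` is defined in `utils.py`. As a rough mental model (the actual utils implementation may differ in seeding, naming and saving), an Erdős-Rényi reference matching a graph's size looks like the sketch below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of an Erdős-Rényi reference with matched node count and density;\n",
"# the real create_random_graphs in utils.py may differ.\n",
"def erdos_reference_sketch(G):\n",
"    n = G.number_of_nodes()\n",
"    m = G.number_of_edges()\n",
"    p = 2 * m / (n * (n - 1))  # edge probability that matches the observed density\n",
"    R = nx.fast_gnp_random_graph(n, p)\n",
"    R.name = G.name + ' (erdos)'\n",
"    return R"
]
},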
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n",
"friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n",
"\n",
"graphs_all = checkins_graphs + friendships_graph"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Original Graphs\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for graph in graphs_all:\n",
" # add basic graph statistics\n",
" analysis_results = analysis_results.append(\n",
" {'Graph': graph.name, \n",
" 'Number of Nodes': graph.number_of_nodes(), \n",
" 'log N': np.log(graph.number_of_nodes()),\n",
" 'Number of Edges': graph.number_of_edges()}, \n",
" ignore_index=True)\n",
"\n",
" # add average degree\n",
" print(\"Computing average degree for graph: \", graph.name)\n",
" avg_deg = np.mean([d for n, d in graph.degree()])\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Degree'] = avg_deg\n",
"\n",
" # add average clustering coefficient\n",
" print(\"Computing average clustering coefficient for graph: \", graph.name)\n",
" avg_clustering = nx.average_clustering(graph)\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # add average shortest path length\n",
" print(\"Computing average shortest path length for graph: \", graph.name)\n",
" average_shortest_path_length = average_shortest_path(graph)\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # add betweenness centrality\n",
" print(\"Computing betweenness centrality for graph: \", graph.name)\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(graph, 6).values()))\n",
" analysis_results.loc[analysis_results['Graph'] == graph.name, 'betweenness centrality'] = betweenness_centrality\n",
" print()\n",
"\n",
"\n",
"analysis_results\n",
"analysis_results.to_pickle('analysis_results.pkl')"
]
},
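{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`average_shortest_path` and `betweenness_centrality_parallel` come from `utils.py`. For reference, the cell below sketches how a parallel betweenness helper of this kind is typically built, following the networkx parallel-betweenness example: split the source nodes into chunks, compute per-chunk contributions in a process pool, and sum them. The utils version may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from multiprocessing import Pool\n",
"import itertools\n",
"\n",
"def _chunks(iterable, size):\n",
"    # yield successive tuples of at most `size` items\n",
"    it = iter(iterable)\n",
"    while True:\n",
"        chunk = tuple(itertools.islice(it, size))\n",
"        if not chunk:\n",
"            return\n",
"        yield chunk\n",
"\n",
"# Sketch of a parallel betweenness helper; the real utils.betweenness_centrality_parallel may differ.\n",
"def betweenness_centrality_parallel_sketch(G, processes=4):\n",
"    with Pool(processes=processes) as pool:\n",
"        node_chunks = list(_chunks(G.nodes(), max(1, G.order() // (processes * 4))))\n",
"        num_chunks = len(node_chunks)\n",
"        # each worker computes betweenness restricted to its chunk of source nodes\n",
"        partial = pool.starmap(\n",
"            nx.betweenness_centrality_subset,\n",
"            zip([G] * num_chunks, node_chunks, [list(G)] * num_chunks, [True] * num_chunks),\n",
"        )\n",
"    # sum the per-chunk contributions into a single centrality dict\n",
"    bc = partial[0]\n",
"    for d in partial[1:]:\n",
"        for node in d:\n",
"            bc[node] += d[node]\n",
"    return bc"
]
},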
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random shit"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'name'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 6\u001b[0m G \u001b[38;5;241m=\u001b[39m create_random_graphs(graph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merods\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# add the basic information to the dataframe\u001b[39;00m\n\u001b[1;32m 9\u001b[0m analysis_results_erods \u001b[38;5;241m=\u001b[39m analysis_results_erods\u001b[38;5;241m.\u001b[39mappend({\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mG\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Nodes\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_nodes(),\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNumber of Edges\u001b[39m\u001b[38;5;124m'\u001b[39m: G\u001b[38;5;241m.\u001b[39mnumber_of_edges(),\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlog N\u001b[39m\u001b[38;5;124m'\u001b[39m: np\u001b[38;5;241m.\u001b[39mlog(G\u001b[38;5;241m.\u001b[39mnumber_of_nodes())\n\u001b[1;32m 14\u001b[0m }, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# compute the average degree and add it to the dataframe\u001b[39;00m\n\u001b[1;32m 17\u001b[0m avg_deg \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmean([d \u001b[38;5;28;01mfor\u001b[39;00m n, d \u001b[38;5;129;01min\u001b[39;00m G\u001b[38;5;241m.\u001b[39mdegree()])\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'name'"
]
}
],
"source": [
"analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"# read all the graphs gpickle files in the data/random/erdos folder. Then run the same analysis as before for this graphs\n",
"\n",
"for graph in graphs_all:\n",
" G = create_random_graphs(graph, \"erods\")\n",
"\n",
" # add the basic information to the dataframe\n",
" analysis_results_erods = analysis_results_erods.append({\n",
" 'Graph': G.name,\n",
" 'Number of Nodes': G.number_of_nodes(),\n",
" 'Number of Edges': G.number_of_edges(),\n",
" 'log N': np.log(G.number_of_nodes())\n",
" }, ignore_index=True)\n",
"\n",
" # compute the average degree and add it to the dataframe\n",
" avg_deg = np.mean([d for n, d in G.degree()])\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
"\n",
" # compute the average clustering coefficient and add it to the dataframe\n",
" avg_clustering = nx.average_clustering(G)\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # compute the average shortest path length and add it to the dataframe\n",
" average_shortest_path_length = average_shortest_path(G)\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # compute the betweenness centrality and add it to the dataframe\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
" analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
"\n",
" # save memory\n",
" del G\n",
"\n",
"analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n",
"analysis_results_erods\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\tNumber of edges in the original graph: 3663807\n",
"\tNumber of edges in the random graph: 3660219\n"
]
},
{
"ename": "UnboundLocalError",
"evalue": "local variable 'G_copy' referenced before assignment",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 25\u001b[0m\n\u001b[1;32m 22\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Clustering Coefficient\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m avg_clustering\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# compute the average shortest path length and add it to the dataframe\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m average_shortest_path_length \u001b[38;5;241m=\u001b[39m \u001b[43maverage_shortest_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mG\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m analysis_results_ws\u001b[38;5;241m.\u001b[39mloc[analysis_results_ws[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGraph\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m G\u001b[38;5;241m.\u001b[39mname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage Shortest Path Length\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m average_shortest_path_length\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# compute the betweenness centrality and add it to the dataframe\u001b[39;00m\n",
"File \u001b[0;32m~/github/small-worlds/utils.py:497\u001b[0m, in \u001b[0;36maverage_shortest_path\u001b[0;34m(G, k)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m\\t\u001b[39;00m\u001b[39mNumber of edges after removing \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m% o\u001b[39;00m\u001b[39mf nodes: \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m.\u001b[39mformat((k)\u001b[39m*\u001b[39m\u001b[39m100\u001b[39m, G_copy\u001b[39m.\u001b[39mnumber_of_edges()))\n\u001b[1;32m 496\u001b[0m tmp \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m--> 497\u001b[0m connected_components \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(nx\u001b[39m.\u001b[39mconnected_components(G_copy))\n\u001b[1;32m 498\u001b[0m \u001b[39m# remove all the connected components with less than 10 nodes\u001b[39;00m\n\u001b[1;32m 499\u001b[0m connected_components \u001b[39m=\u001b[39m [c \u001b[39mfor\u001b[39;00m c \u001b[39min\u001b[39;00m connected_components \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(c) \u001b[39m>\u001b[39m \u001b[39m10\u001b[39m]\n",
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'G_copy' referenced before assignment"
]
}
],
"source": [
"# do the same with the watts strogatz graphs\n",
"\n",
"analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
"\n",
"for graph in graphs_all:\n",
" G = create_random_graphs(graph, 'watts_strogatz', save=False)\n",
"\n",
" # add the basic information to the dataframe\n",
" analysis_results_ws = analysis_results_ws.append({\n",
" 'Graph': G.name,\n",
" 'Number of Nodes': G.number_of_nodes(),\n",
" 'Number of Edges': G.number_of_edges(),\n",
" 'log N': np.log(G.number_of_nodes())\n",
" }, ignore_index=True)\n",
"\n",
" # compute the average degree and add it to the dataframe\n",
" avg_deg = np.mean([d for n, d in G.degree()])\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Degree'] = avg_deg\n",
"\n",
" # compute the average clustering coefficient and add it to the dataframe\n",
" avg_clustering = nx.average_clustering(G)\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Clustering Coefficient'] = avg_clustering\n",
"\n",
" # compute the average shortest path length and add it to the dataframe\n",
" average_shortest_path_length = average_shortest_path(G)\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length\n",
"\n",
" # compute the betweenness centrality and add it to the dataframe\n",
" betweenness_centrality = np.mean(list(betweenness_centrality_parallel(G, 6).values()))\n",
" analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality\n",
"\n",
" # save memory\n",
" del G\n",
"\n",
"analysis_results_ws.to_pickle('analysis_results_ws.pkl')\n",
"analysis_results_ws"
]
},
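{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (assuming the three pickles above were written successfully), we can stack the three result tables and compare each real network against its two null models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# side-by-side comparison; assumes analysis_results*.pkl exist from the cells above\n",
"real = pd.read_pickle('analysis_results.pkl').assign(Model='real')\n",
"erdos = pd.read_pickle('analysis_results_erdos.pkl').assign(Model='erdos')\n",
"ws = pd.read_pickle('analysis_results_ws.pkl').assign(Model='watts_strogatz')\n",
"comparison = pd.concat([real, erdos, ws], ignore_index=True)\n",
"comparison.sort_values(['Graph', 'Model'])"
]
},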
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G = nx.watts_strogatz_graph(1000, 4, 0.1)\n",
"adj = nx.to_scipy_sparse_array(G)\n",
"# print info about the graph and the matrix\n",
"print(\"Number of nodes: \", G.number_of_nodes())\n",
"print(\"Number of edges: \", G.number_of_edges())\n",
"print(\"Average degree: \", np.mean([d for n, d in G.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import scipy.sparse as sp\n",
"\n",
"# randomly swap edges, but keep the degree of each node the same (i.e. the degree sequence is preserved)\n",
"def random_swap_edges(adj, nswap=1, max_tries=100):\n",
" # use numpy and scipy to speed up the process\n",
" adj = sp.csr_matrix(adj)\n",
" n, m = adj.shape \n",
" assert n == m # make sure the adjacency matrix is square\n",
" adj_triu = sp.triu(adj) # only consider the upper triangular part of the adjacency matrix\n",
" adj_tuple = sp.find(adj_triu) # get the indices and values of the non-zero elements\n",
" adj_edges = np.array(list(zip(adj_tuple[0], adj_tuple[1]))) # get the edges\n",
" adj_data = adj_tuple[2] # get the edge weights\n",
" nnz = adj_edges.shape[0] # number of non-zero elements\n",
" assert nnz == adj_data.shape[0] # make sure the number of edges and edge weights are the same\n",
" for _ in range(nswap): # repeat nswap times\n",
" # choose random edges to swap\n",
" edge_idx = np.random.choice(nnz, size=2, replace=False) # choose two random edges\n",
" edge1 = adj_edges[edge_idx[0]] # get the first edge\n",
" edge2 = adj_edges[edge_idx[1]] # get the second edge\n",
" # make sure the edges are not self-loops and not already connected\n",
" if edge1[0] == edge2[0] or edge1[0] == edge2[1] or edge1[1] == edge2[0] or edge1[1] == edge2[1] or adj[edge1[0], edge2[1]] or adj[edge2[0], edge1[1]]: \n",
" continue # if the edges are self-loops or already connected, try again\n",
" # swap the edges\n",
" adj[edge1[0], edge1[1]] = 0 \n",
" adj[edge2[0], edge2[1]] = 0 \n",
" adj[edge1[0], edge2[1]] = 1\n",
" adj[edge2[0], edge1[1]] = 1\n",
" # update adj_edges and adj_data\n",
" adj_edges[edge_idx[0]] = [edge1[0], edge2[1]]\n",
" adj_edges[edge_idx[1]] = [edge2[0], edge1[1]]\n",
" adj_data[edge_idx[0]] = 1\n",
" adj_data[edge_idx[1]] = 1\n",
" return adj\n",
"\n",
"adj_swapped = random_swap_edges(adj, nswap=1)"
]
},
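{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity check for the swap: the degree sequence (the row sums of the adjacency matrix) should be identical before and after rewiring."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# degree sequences before and after the swap (row sums of the adjacency matrices)\n",
"deg_before = np.asarray(adj.sum(axis=1)).ravel()\n",
"deg_after = np.asarray(adj_swapped.sum(axis=1)).ravel()\n",
"print(\"Degree sequence preserved: \", np.array_equal(deg_before, deg_after))"
]
},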
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create a new graph from the swapped adjacency matrix\n",
"G_swapped = nx.from_scipy_sparse_matrix(adj_swapped)\n",
"# print info about the graph and the matrix\n",
"print(\"Number of nodes: \", G_swapped.number_of_nodes())\n",
"print(\"Number of edges: \", G_swapped.number_of_edges())\n",
"print(\"Average degree: \", np.mean([d for n, d in G_swapped.degree()]))\n",
"print(\"Average clustering coefficient: \", nx.average_clustering(G_swapped))\n",
"print(\"Average shortest path length: \", nx.average_shortest_path_length(G_swapped))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.8 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}