various fixes. Download still broken

2 years ago · 54be6ef526
parent b429ea34e5
commit 54be6ef526
13 changed files with 15950 additions and 1084 deletions
--- a/.gitignore
+++ b/.gitignore
@ -147,3 +147,4 @@ data/
 backup/
 sources/
 testing.ipynb
 extra/
--- a/EXTRA/bob
+++ b/EXTRA/bob
--- a/EXTRA/create_checkins_graph.cpp
+++ b/EXTRA/create_checkins_graph.cpp
@ -1,122 +0,0 @@
 #include <iostream>
 #include <fstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include <mutex>
 #include <thread>
 #include <functional>
 #include <set>
 using namespace std;
 // Builds a map from UserID to the set of VenueIDs associated with that user.
 //
 // The file is read as a stream of whitespace-separated tokens taken two at a
 // time: the first token of each pair is used as the UserID and the second as
 // the VenueID. Duplicate (user, venue) pairs collapse because venues are
 // stored in a set.
 //
 // filename: path of the check-in file to read.
 // Returns the populated map; an empty map (after an error message on cerr)
 // when the file cannot be opened.
 unordered_map<string, set<string>> createDictFromFile(string filename) {
    unordered_map<string, set<string>> dict;
    ifstream file(filename);
    if (!file.is_open()) {
        cerr << "Error opening file " << filename << endl;
        return dict;
    }
    string userId, venueId;
    // Loop on the extraction itself rather than on file.good(): good() only
    // turns false AFTER a read has failed, which used to insert a stale (or,
    // for an empty file, an empty-string) pair once the input was exhausted.
    while (file >> userId >> venueId) {
        dict[userId].insert(venueId);
    }
    cout << "Dict created" << endl;
    return dict;
 }
 // void create_tsv_multi(unordered_map<string, vector<string>> dict, mutex& dict_mutex) {
 //       // Create an output stream to write the file
 //       ofstream out_file("output.tsv");
 //       // Create a mutex to protect the output file
 //       mutex out_file_mutex;
 //       // Loop over all the key-value pairs in the map
 //       for (const auto& kv1 : dict) {
 //         for (const auto& kv2 : dict) {
 //           // Check if the keys are the same
 //           if (kv1.first == kv2.first) continue;
 //           // Check if the values have elements in common
 //           vector<string> common;
 //           for (const auto& str1 : kv1.second) {
 //             for (const auto& str2 : kv2.second) {
 //               if (str1 == str2) common.push_back(str1);
 //             }
 //           }
 //           // Write the keys and the number of common elements to the output file
 //           if (!common.empty()) {
 //             // Lock the mutexes before accessing the dict and the output file
 //             lock_guard<mutex> dict_guard(dict_mutex);
 //             lock_guard<mutex> out_file_guard(out_file_mutex);
 //             out_file << kv1.first << "\t" << kv2.first << "\t" << common.size() << endl;
 //           }
 //         }
 //       }
 //     }
 // Counts the elements shared by two sets by walking both in sorted order.
 static size_t count_common_venues(const set<string>& a, const set<string>& b) {
    size_t shared = 0;
    auto ia = a.begin();
    auto ib = b.begin();
    while (ia != a.end() && ib != b.end()) {
        const int order = ia->compare(*ib);
        if (order < 0) {
            ++ia;
        } else if (order > 0) {
            ++ib;
        } else {
            ++shared;
            ++ia;
            ++ib;
        }
    }
    return shared;
 }
 // Writes one tab-separated row "keyA<TAB>keyB<TAB>count" for every unordered
 // pair of keys in dict whose sets have at least one element in common, where
 // count is the number of shared elements.
 //
 // dict:        map from key to its set of values.
 // outfilename: path of the file to create/overwrite.
 //
 // Prints an error on cerr and returns without writing anything when the
 // output file cannot be opened. A progress percentage is printed on cout
 // every 128 outer iterations.
 void create_tsv(const unordered_map<string, set<string>>& dict, string outfilename) {
    ofstream out_file(outfilename);
    if (!out_file.is_open()) {
        cerr << "Error opening file " << outfilename << endl;
        return;
    }
    unsigned long long i = 0;
    for (const auto& kv1 : dict) {
        if (!((++i) & 127)) cout << (static_cast<double>(i) * 100 / dict.size()) << "%\r" << flush;
        for (const auto& kv2 : dict) {
            // Keep only pairs with kv1.first < kv2.first: this skips a key
            // paired with itself and emits each unordered pair exactly once.
            if (kv1.first >= kv2.first) continue;
            // Only the size of the intersection is needed, so count it
            // directly instead of materialising a temporary set per pair.
            const size_t common = count_common_venues(kv1.second, kv2.second);
            if (common != 0) {
                // '\n' rather than endl: no flush per row; the stream is
                // flushed when out_file is destroyed.
                out_file << kv1.first << '\t' << kv2.first << '\t' << common << '\n';
            }
        }
    }
 }
 // Prints the command-line usage line followed by three suggested invocations
 // (brightkite, gowalla, foursquare) to standard output.
 void print_help() {
    static const char* const kSuggestedOptions =
        "Suggested options: "
        "\n\t./main data/brightkite/brightkite_checkins.txt data/brightkite/brightkite_checkins_graph.tsv "
        "\n\t./main data/gowalla/gowalla_checkins.txt data/gowalla/gowalla_checkins_graph.tsv "
        "\n\t./main data/foursquare/foursquare_checkins.txt data/foursquare/foursquare_checkins_graph.tsv";
    cout << "Usage: ./main IN_FILE_PATH OUT_FILE_PATH" << endl
         << kSuggestedOptions << endl;
 }
 // int main() {
 //     unordered_map<string, set<string>> dict = createDictFromFile("data/foursquare/foursquare_checkins_TKY.txt");
 //     create_tsv(dict);
 // }
 // Entry point: ./main IN_FILE_PATH OUT_FILE_PATH
 //
 // Reads the check-in file given as first argument, builds the user -> venues
 // map and writes the pairwise shared-venue table to the second argument.
 //
 // Returns 0 after a successful run or when help was explicitly requested
 // with -h/--help, and 1 when the number of arguments is wrong so that
 // scripts can detect the misuse.
 int main(int argc, const char* argv[]) {
    // Honour -h/--help wherever it appears, not only when exactly two
    // arguments were supplied.
    for (int i = 1; i < argc; ++i) {
        const string arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            print_help();
            return 0;
        }
    }
    if (argc != 3) {
        print_help();
        return 1;
    }
    const string in_file = argv[1];
    const string out_file = argv[2];
    const unordered_map<string, set<string>> dict = createDictFromFile(in_file);
    create_tsv(dict, out_file);
    return 0;
 }
--- a/EXTRA/small_world.cpp
+++ b/EXTRA/small_world.cpp
@ -1,73 +0,0 @@
 #include <iostream>
 #include <fstream>
 #include <string>
 #include <vector>
 #include <utility>
 #include <boost/graph/adjacency_list.hpp>
 #include <boost/graph/small_world_generator.hpp>
 #include <boost/random/linear_congruential.hpp>
 using namespace std;
 using namespace boost;
 // Undirected adjacency-list graph with vector storage and no vertex/edge properties.
 using Graph = adjacency_list<vecS, vecS, undirectedS, no_property, no_property>;
 // Small-world edge generator instantiated over Graph with a minstd_rand engine.
 using SWGen = small_world_iterator<minstd_rand, Graph>;
 // Reads a tab-separated integer edge list, builds a Graph from it, pulls
 // edges from a small-world generator, writes them to "lattice_reference.tsv"
 // in the current directory and returns them.
 //
 // edge_list_file: path of the input file, one "u<TAB>v" pair per line.
 // niter:          forwarded to the SWGen constructor.
 // connectivity:   currently unused by this function.
 //
 // Lines that do not parse as two non-negative integers are skipped. When the
 // input file cannot be opened an error is printed on cerr and an empty
 // vector is returned without writing the output file.
 vector<pair<int, int>> lattice_reference(const string& edge_list_file, int niter, bool connectivity) {
    vector<pair<int, int>> edges;
    int num_nodes = 0;
    ifstream in(edge_list_file);
    if (!in.is_open()) {
        cerr << "Error opening file " << edge_list_file << endl;
        return {};
    }
    string line;
    while (getline(in, line)) {
        int u = 0;
        int v = 0;
        // sscanf reports how many fields it converted; using u/v after a
        // failed conversion would read uninitialised values, and negative
        // ids cannot be used as vertex indices.
        if (sscanf(line.c_str(), "%d\t%d", &u, &v) != 2 || u < 0 || v < 0) continue;
        edges.emplace_back(u, v);
        num_nodes = max(num_nodes, max(u, v));
    }
    // Vertex ids are used as indices, so the graph needs max id + 1 vertices.
    Graph g(edges.begin(), edges.end(), num_nodes + 1);
    // NOTE(review): gen is never passed to the generator below, and Boost
    // documents small_world_iterator's constructor as taking
    // (gen, n, k, probability[, allow_self_loops]); the (g, niter) call does
    // not obviously match that signature - confirm this builds as intended.
    minstd_rand gen;
    SWGen sw_gen(g, niter);
    // NOTE(review): the generator is advanced num_nodes times without being
    // compared against an end iterator - confirm that is safe.
    vector<pair<int, int>> lattice_edges;
    for (int i = 0; i < num_nodes; ++i) {
        auto [u, v] = *sw_gen;
        lattice_edges.emplace_back(u, v);
        ++sw_gen;
    }
    // Dump the generated edges as a two-column TSV.
    ofstream out("lattice_reference.tsv");
    for (const auto& [u, v] : lattice_edges) {
        out << u << "\t" << v << '\n';
    }
    return lattice_edges;
 }
 // Entry point: expects exactly three arguments (edge list path, niter,
 // connectivity), converts the last two with atoi and hands everything to
 // lattice_reference. Prints the usage on cerr and returns 1 otherwise.
 int main(int argc, char* argv[]) {
    const bool has_expected_args = (argc == 4);
    if (!has_expected_args) {
        cerr << "Usage: " << argv[0] << " <edge_list_file> <niter> <connectivity>" << endl;
        return 1;
    }
    const string input_path(argv[1]);
    const int iterations = atoi(argv[2]);
    const bool connectivity_flag = atoi(argv[3]) != 0;
    lattice_reference(input_path, iterations, connectivity_flag);
    return 0;
 }
--- a/analysis_results.pkl
+++ b/analysis_results.pkl
--- a/analysis_results_erods.pkl
+++ b/analysis_results_erods.pkl
--- a/analysis_results_ws.pkl
+++ b/analysis_results_ws.pkl
--- a/foursquareIT_friends.html
+++ b/foursquareIT_friends.html
--- a/main.ipynb
+++ b/main.ipynb
--- a/omega_sampled_server.py
+++ b/omega_sampled_server.py
@ -0,0 +1,62 @@
 #! /usr/bin/python3
 import networkx as nx
 from utils import *
 import warnings
 import time
 import random
 import argparse
 warnings.filterwarnings("ignore")
 def random_sample(graph, k):
    """Return a connected subgraph built from a random fraction of the nodes.

    ``int(k * number_of_nodes)`` nodes are drawn with ``random.sample`` and the
    subgraph induced by them is taken. If that subgraph is not connected, the
    subgraph induced by its largest connected component is returned instead;
    otherwise the sampled subgraph itself is returned.

    Args:
        graph: graph object providing ``nodes()`` and ``subgraph()``.
        k: fraction of nodes to sample.

    Returns:
        The connected subgraph described above.
    """
    nodes = list(graph.nodes())
    n = int(k*len(nodes))
    nodes_sample = random.sample(nodes, n)
    G = graph.subgraph(nodes_sample)
    # Start from the sample itself: without this, G_connected was unbound
    # (UnboundLocalError) whenever the sample was already connected.
    G_connected = G
    if not nx.is_connected(G):
        print("Graph is not connected. Taking the largest connected component")
        connected = max(nx.connected_components(G), key=len)
        G_connected = graph.subgraph(connected)
    print(nx.is_connected(G_connected))
    # These counts describe the raw sample, before any component extraction.
    print("Number of nodes in the sampled graph: ", G.number_of_nodes())
    print("Number of edges in the sampled graph: ", G.number_of_edges())
    return G_connected
 if __name__ == "__main__":
    # Command line: graph k [niter] [nrand]
    # "graph" is passed to create_graph_from_checkins; the help text lists
    # "foursquare", "gowalla" and "brightkite" as the options.
    parser = argparse.ArgumentParser()
    parser.add_argument("graph", help="Name of the graph to be used. Options are 'foursquare', 'gowalla' and 'brightkite'")
    parser.add_argument("k", help="Percentage of nodes to be sampled. Needs to be a float between 0 and 1")
    # nargs="?" makes these two positionals optional; as plain positionals they
    # were mandatory, so the documented default of 5 could never apply.
    parser.add_argument("niter", nargs="?", default=None, help="Number of rewiring per edge. Needs to be an integer. Default is 5")
    parser.add_argument("nrand", nargs="?", default=None, help="Number of random graphs. Needs to be an integer. Default is 5")
    args = parser.parse_args()
    # if no input is given for niter and nrand, set them to default values
    if args.niter is None:
        print("No input for niter. Setting it to default value: 5")
        args.niter = 5
    if args.nrand is None:
        print("No input for nrand. Setting it to default value: 5")
        args.nrand = 5
    # create the graph from the check-ins of the dataset named on the command line
    G = create_graph_from_checkins(str(args.graph))
    G.name = str(args.graph) + " Checkins Graph"
    # sample the graph
    G_sample = random_sample(G, float(args.k))
    # compute omega on the sample and time the computation
    start = time.time()
    print("\nComputing omega for graph: ", G.name)
    omega = nx.omega(G_sample, niter = int(args.niter), nrand = int(args.nrand))
    end = time.time()
    print("Omega coefficient for graph {}: {}".format(G.name, omega))
    print("Time taken: ", round(end-start,2))
--- a/project.pdf
+++ b/project.pdf
--- a/testing.ipynb
+++ b/testing.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -16,372 +16,106 @@
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import plotly.graph_objects as go\n",
-    "from utils import *\n",
+    "# from utils import *\n",
    "from collections import Counter\n",
    "from tqdm import tqdm\n",
    "import time\n",
    "import geopandas as gpd\n",
    "import gdown # for downloading files from google drive\n",
    "\n",
    "# ignore warnings\n",
    "import warnings\n",
    "import sys\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Graph</th>\n",
       "      <th>Number of Nodes</th>\n",
       "      <th>Number of Edges</th>\n",
       "      <th>Average Degree</th>\n",
       "      <th>Average Clustering Coefficient</th>\n",
       "      <th>log N</th>\n",
       "      <th>Average Shortest Path Length</th>\n",
       "      <th>betweenness centrality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brightkite Checkins Graph</td>\n",
       "      <td>7191</td>\n",
       "      <td>3663807</td>\n",
       "      <td>1018.997914</td>\n",
       "      <td>0.702854</td>\n",
       "      <td>8.880586</td>\n",
       "      <td>2.411011</td>\n",
       "      <td>0.00022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Gowalla Checkins Graph</td>\n",
       "      <td>10702</td>\n",
       "      <td>303104</td>\n",
       "      <td>56.644366</td>\n",
       "      <td>0.505597</td>\n",
       "      <td>9.278186</td>\n",
       "      <td>5.222903</td>\n",
       "      <td>0.000301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Foursquare EU Checkins Graph</td>\n",
       "      <td>20282</td>\n",
       "      <td>7430376</td>\n",
       "      <td>732.706439</td>\n",
       "      <td>0.597097</td>\n",
       "      <td>9.917489</td>\n",
       "      <td>2.2843</td>\n",
       "      <td>0.000089</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Foursquare IT Checkins Graph</td>\n",
       "      <td>3730</td>\n",
       "      <td>629749</td>\n",
       "      <td>337.667024</td>\n",
       "      <td>0.683565</td>\n",
       "      <td>8.224164</td>\n",
       "      <td>2.185477</td>\n",
       "      <td>0.000428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Brightkite Friendship Graph</td>\n",
       "      <td>5928</td>\n",
       "      <td>34673</td>\n",
       "      <td>11.698043</td>\n",
       "      <td>0.219749</td>\n",
       "      <td>8.687442</td>\n",
       "      <td>5.052162</td>\n",
       "      <td>0.000448</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>(Filtered) Gowalla Friendship Graph</td>\n",
       "      <td>8396</td>\n",
       "      <td>29122</td>\n",
       "      <td>6.937113</td>\n",
       "      <td>0.217544</td>\n",
       "      <td>9.035511</td>\n",
       "      <td>4.558532</td>\n",
       "      <td>0.000357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Foursquare IT Friendship Graph</td>\n",
       "      <td>2073</td>\n",
       "      <td>6217</td>\n",
       "      <td>5.99807</td>\n",
       "      <td>0.148489</td>\n",
       "      <td>7.636752</td>\n",
       "      <td>19.530752</td>\n",
       "      <td>0.000879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Foursquare EU Friendship Graph</td>\n",
       "      <td>16491</td>\n",
       "      <td>59419</td>\n",
       "      <td>7.206234</td>\n",
       "      <td>0.167946</td>\n",
       "      <td>9.710570</td>\n",
       "      <td>23.713864</td>\n",
       "      <td>0.000272</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 Graph Number of Nodes Number of Edges  \\\n",
       "0            Brightkite Checkins Graph            7191         3663807   \n",
       "1               Gowalla Checkins Graph           10702          303104   \n",
       "2         Foursquare EU Checkins Graph           20282         7430376   \n",
       "3         Foursquare IT Checkins Graph            3730          629749   \n",
       "4          Brightkite Friendship Graph            5928           34673   \n",
       "5  (Filtered) Gowalla Friendship Graph            8396           29122   \n",
       "6       Foursquare IT Friendship Graph            2073            6217   \n",
       "7       Foursquare EU Friendship Graph           16491           59419   \n",
       "\n",
       "  Average Degree Average Clustering Coefficient     log N  \\\n",
       "0    1018.997914                       0.702854  8.880586   \n",
       "1      56.644366                       0.505597  9.278186   \n",
       "2     732.706439                       0.597097  9.917489   \n",
       "3     337.667024                       0.683565  8.224164   \n",
       "4      11.698043                       0.219749  8.687442   \n",
       "5       6.937113                       0.217544  9.035511   \n",
       "6        5.99807                       0.148489  7.636752   \n",
       "7       7.206234                       0.167946  9.710570   \n",
       "\n",
       "  Average Shortest Path Length betweenness centrality  \n",
       "0                     2.411011                0.00022  \n",
       "1                     5.222903               0.000301  \n",
       "2                       2.2843               0.000089  \n",
       "3                     2.185477               0.000428  \n",
       "4                     5.052162               0.000448  \n",
       "5                     4.558532               0.000357  \n",
       "6                    19.530752               0.000879  \n",
       "7                    23.713864               0.000272  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import the graphs from the saved files\n",
    "G_brighkite_checkins = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_checkins_graph.gpickle'))\n",
    "G_gowalla_checkins = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_checkins_graph.gpickle'))\n",
    "G_foursquareEU_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_checkins_graph.gpickle'))\n",
    "G_foursquareIT_checkins = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_checkins_graph.gpickle'))\n",
    "\n",
    "G_brighkite_friends = nx.read_gpickle(os.path.join('data', 'brightkite', 'brightkite_friendships_graph.gpickle'))\n",
    "G_gowalla_friends = nx.read_gpickle(os.path.join('data', 'gowalla', 'gowalla_friendships_graph.gpickle'))\n",
    "G_foursquareEU_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareEU_friendships_graph.gpickle'))\n",
    "G_foursquareIT_friends = nx.read_gpickle(os.path.join('data', 'foursquare', 'foursquareIT_friendships_graph.gpickle'))\n",
    "\n",
    "# open the dataframe object\n",
    "analysis_results = pd.read_pickle('analysis_results.pkl')\n",
    "analysis_results"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The first thing we want to do is simple: create a random reference for each graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
+    "def download_dataTMPsets():\n",
-    "\n",
+    "\n",
-    "checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n",
+    "    dict = {\n",
-    "friendships_graph = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n",
+    "        \"brightkite\": [\"https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz\"], \n",
-    "\n",
+    "        \"gowalla\": [\"https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz\", \"https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz\"], \n",
-    "graphs_all = checkins_graphs + friendships_graph"
+    "        \"foursquare\": \"https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing\"}\n",
-   ]
+    "\n",
-  },
+    "    if not os.path.exists(\"dataTMP\"):\n",
-  {
+    "        os.mkdir(\"dataTMP\")\n",
-   "cell_type": "markdown",
+    "        print(\"Created dataTMP folder\")\n",
-   "metadata": {},
+    "\n",
-   "source": [
+    "    for folder in dict.keys():\n",
-    "# Random shit"
+    "        if not os.path.exists(os.path.join(\"dataTMP\", folder)):\n",
    "            os.mkdir(os.path.join(\"dataTMP\", folder))\n",
    "            print(\"Created {} folder\".format(folder))\n",
    "\n",
    "    for folder in dict.keys():\n",
    "        for url in dict[folder]:\n",
    "            if folder == \"foursquare\":\n",
    "                if not os.path.exists(os.path.join(\"dataTMP\", folder, \"foursquare_full.zip\")):\n",
    "                    output = os.path.join(\"dataTMP\", folder, \"foursquare_full.zip\")\n",
    "                    gdown.download(url, output, quiet=False, fuzzy=True)\n",
    "                else :\n",
    "                    print(\"{} already downloaded\".format(url))\n",
    "            else:\n",
    "                if not os.path.exists(os.path.join(\"dataTMP\", folder, url.split(\"/\")[-1])):\n",
    "                    print(\"Downloading {}...\".format(url))\n",
    "                    wget.download(url, os.path.join(\"dataTMP\", folder))\n",
    "                else :\n",
    "                    print(\"{} already downloaded\".format(url))\n",
    "\n",
    "    for folder in dict.keys():\n",
    "        for file in os.listdir(os.path.join(\"dataTMP\", folder)):\n",
    "            if file.endswith(\".gz\"):\n",
    "                print(\"Unzipping {}...\".format(file))\n",
    "                os.system(\"gunzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n",
    "            elif file.endswith(\".zip\"):\n",
    "                print(\"Unzipping {}...\".format(file))\n",
    "                os.system(\"unzip {}\".format(os.path.join(\"dataTMP\", folder, file)))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# analysis_results_erods = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
    "\n",
    "# analysis_results_ws = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n",
    "\n",
    "# for graph in graphs_all:\n",
    "#     print(\"\\nCreating random graph for graph: \", graph.name)\n",
    "#     G_erd = create_random_graphs(graph, model='erdos', save=False)\n",
    "#     G_ws = create_random_graphs(graph, model='watts_strogatz', save=False)\n",
    "    \n",
    "#     # add the basic information to the dataframe\n",
    "#     analysis_results_erods = analysis_results_erods.append({\n",
    "#         'Graph': G_erd.name,\n",
    "#         'Number of Nodes': G_erd.number_of_nodes(),\n",
    "#         'Number of Edges': G_erd.number_of_edges(),\n",
    "#         'log N': np.log(G_erd.number_of_nodes())\n",
    "#     }, ignore_index=True)\n",
    "\n",
    "#     # add the basic information to the dataframe\n",
    "#     analysis_results_ws = analysis_results_ws.append({\n",
    "#         'Graph': G_ws.name,\n",
    "#         'Number of Nodes': G_ws.number_of_nodes(),\n",
    "#         'Number of Edges': G_ws.number_of_edges(),\n",
    "#         'log N': np.log(G_ws.number_of_nodes())\n",
    "#     }, ignore_index=True)\n",
    "\n",
    "#     # compute the average degree and add it to the dataframes\n",
    "#     avg_deg_erd = np.mean([d for n, d in G_erd.degree()])\n",
    "#     avg_deg_ws = np.mean([d for n, d in G_ws.degree()])\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G_erd.name, 'Average Degree'] = avg_deg_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G_ws.name, 'Average Degree'] = avg_deg_ws\n",
    "\n",
    "#     # compute the average clustering coefficient and add it to the dataframes\n",
    "#     avg_clustering_erd = average_clustering_coefficient(G_erd, k = 0.9)\n",
    "#     avg_clustering_ws = average_clustering_coefficient(G_ws, k = 0.9)\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G_erd.name, 'Average Clustering Coefficient'] = avg_clustering_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G_ws.name, 'Average Clustering Coefficient'] = avg_clustering_ws\n",
    "\n",
    "#     # compute the average shortest path length and add it to the dataframes\n",
    "#     average_shortest_path_length_erd = average_shortest_path(G_erd, k = 0.9)\n",
    "#     average_shortest_path_length_ws = average_shortest_path(G_ws, k = 0.9)\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'Average Shortest Path Length'] = average_shortest_path_length_ws\n",
    "\n",
    "#     # compute the betweenness centrality and add it to the dataframes\n",
    "#     betweenness_centrality_erd = np.mean(list(betweenness_centrality_parallel(G_erd, 4, k = 0.9).values()))\n",
    "#     betweenness_centrality_ws = np.mean(list(betweenness_centrality_parallel(G_ws, 4, k = 0.9).values()))\n",
    "#     analysis_results_erods.loc[analysis_results_erods['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality_erd\n",
    "#     analysis_results_ws.loc[analysis_results_ws['Graph'] == G.name, 'betweenness centrality'] = betweenness_centrality_ws\n",
    "\n",
    "#     # save memory\n",
    "#     del G_erd, G_ws\n",
    "\n",
    "# analysis_results_erods.to_pickle('analysis_results_erods.pkl')\n",
    "# analysis_results_ws.to_pickle('analysis_results_ws.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Small Worldness\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have already computed the average clustering coefficient and the average shortest path length for our networks. We can save a lot of time by skipping these computations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def omega(G, C_og, L_og, niter, nrand):\n",
    "    randMetrics = {\"C\": [], \"L\": []}\n",
    "\n",
    "    # Calculate initial average clustering coefficient which potentially will\n",
    "    # get replaced by higher clustering coefficients from generated lattice\n",
    "    # reference graphs\n",
    "    Cl = C_og\n",
    "\n",
    "    niter_lattice_reference = niter\n",
    "    niter_random_reference = niter * 2\n",
    "\n",
    "    for _ in range(nrand):\n",
    "        \n",
    "        # Generate random graph\n",
    "        Gr = nx.random_reference(G, niter=niter_random_reference, seed=42)\n",
    "        randMetrics[\"L\"].append(nx.average_shortest_path_length(Gr))\n",
    "\n",
    "        # Generate lattice graph\n",
    "        Gl = nx.lattice_reference(G, niter=niter_lattice_reference, seed=42)\n",
    "\n",
    "        # Replace old clustering coefficient, if clustering is higher in\n",
    "        # generated lattice reference\n",
    "        Cl_temp = nx.average_clustering(Gl)\n",
    "        if Cl_temp > Cl:\n",
    "            Cl = Cl_temp\n",
    "\n",
    "    C = C_og\n",
    "    L = L_og\n",
    "    Lr = np.mean(randMetrics[\"L\"])\n",
    "\n",
    "    omega = (Lr / L) - (C / Cl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Brightkite Checkins Graph\n"
+      "Created dataTMP folder\n",
      "Created brightkite folder\n",
      "Created gowalla folder\n",
      "Created foursquare folder\n",
      "Downloading https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz...\n",
      "Downloading https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz...\n",
      "Downloading https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz...\n",
      "Downloading https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz...\n"
     ]
    },
    {
     "ename": "MissingSchema",
     "evalue": "Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mMissingSchema\u001b[0m                             Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m download_dataTMPsets()\n",
      "Cell \u001b[0;32mIn[8], line 22\u001b[0m, in \u001b[0;36mdownload_dataTMPsets\u001b[0;34m()\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mexists(os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39m\"\u001b[39m\u001b[39mdataTMP\u001b[39m\u001b[39m\"\u001b[39m, folder, \u001b[39m\"\u001b[39m\u001b[39mfoursquare_full.zip\u001b[39m\u001b[39m\"\u001b[39m)):\n\u001b[1;32m     21\u001b[0m     output \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39m\"\u001b[39m\u001b[39mdataTMP\u001b[39m\u001b[39m\"\u001b[39m, folder, \u001b[39m\"\u001b[39m\u001b[39mfoursquare_full.zip\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m---> 22\u001b[0m     gdown\u001b[39m.\u001b[39;49mdownload(url, output, quiet\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, fuzzy\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m     23\u001b[0m \u001b[39melse\u001b[39;00m :\n\u001b[1;32m     24\u001b[0m     \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m already downloaded\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(url))\n",
      "File \u001b[0;32m/usr/lib/python3.10/site-packages/gdown/download.py:158\u001b[0m, in \u001b[0;36mdownload\u001b[0;34m(url, output, quiet, proxy, speed, use_cookies, verify, id, fuzzy, resume)\u001b[0m\n\u001b[1;32m    156\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m    157\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m         res \u001b[39m=\u001b[39m sess\u001b[39m.\u001b[39;49mget(url, headers\u001b[39m=\u001b[39;49mheaders, stream\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, verify\u001b[39m=\u001b[39;49mverify)\n\u001b[1;32m    159\u001b[0m     \u001b[39mexcept\u001b[39;00m requests\u001b[39m.\u001b[39mexceptions\u001b[39m.\u001b[39mProxyError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    160\u001b[0m         \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAn error has occurred using proxy:\u001b[39m\u001b[39m\"\u001b[39m, proxy, file\u001b[39m=\u001b[39msys\u001b[39m.\u001b[39mstderr)\n",
      "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:600\u001b[0m, in \u001b[0;36mSession.get\u001b[0;34m(self, url, **kwargs)\u001b[0m\n\u001b[1;32m    592\u001b[0m \u001b[39m\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\"\"Sends a GET request. Returns :class:`Response` object.\u001b[39;00m\n\u001b[1;32m    593\u001b[0m \n\u001b[1;32m    594\u001b[0m \u001b[39m:param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m    595\u001b[0m \u001b[39m:param \\*\\*kwargs: Optional arguments that ``request`` takes.\u001b[39;00m\n\u001b[1;32m    596\u001b[0m \u001b[39m:rtype: requests.Response\u001b[39;00m\n\u001b[1;32m    597\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m    599\u001b[0m kwargs\u001b[39m.\u001b[39msetdefault(\u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m--> 600\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrequest(\u001b[39m\"\u001b[39;49m\u001b[39mGET\u001b[39;49m\u001b[39m\"\u001b[39;49m, url, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:573\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m    560\u001b[0m \u001b[39m# Create the Request.\u001b[39;00m\n\u001b[1;32m    561\u001b[0m req \u001b[39m=\u001b[39m Request(\n\u001b[1;32m    562\u001b[0m     method\u001b[39m=\u001b[39mmethod\u001b[39m.\u001b[39mupper(),\n\u001b[1;32m    563\u001b[0m     url\u001b[39m=\u001b[39murl,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    571\u001b[0m     hooks\u001b[39m=\u001b[39mhooks,\n\u001b[1;32m    572\u001b[0m )\n\u001b[0;32m--> 573\u001b[0m prep \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprepare_request(req)\n\u001b[1;32m    575\u001b[0m proxies \u001b[39m=\u001b[39m proxies \u001b[39mor\u001b[39;00m {}\n\u001b[1;32m    577\u001b[0m settings \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmerge_environment_settings(\n\u001b[1;32m    578\u001b[0m     prep\u001b[39m.\u001b[39murl, proxies, stream, verify, cert\n\u001b[1;32m    579\u001b[0m )\n",
      "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/sessions.py:484\u001b[0m, in \u001b[0;36mSession.prepare_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    481\u001b[0m     auth \u001b[39m=\u001b[39m get_netrc_auth(request\u001b[39m.\u001b[39murl)\n\u001b[1;32m    483\u001b[0m p \u001b[39m=\u001b[39m PreparedRequest()\n\u001b[0;32m--> 484\u001b[0m p\u001b[39m.\u001b[39;49mprepare(\n\u001b[1;32m    485\u001b[0m     method\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mmethod\u001b[39m.\u001b[39;49mupper(),\n\u001b[1;32m    486\u001b[0m     url\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49murl,\n\u001b[1;32m    487\u001b[0m     files\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mfiles,\n\u001b[1;32m    488\u001b[0m     data\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mdata,\n\u001b[1;32m    489\u001b[0m     json\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mjson,\n\u001b[1;32m    490\u001b[0m     headers\u001b[39m=\u001b[39;49mmerge_setting(\n\u001b[1;32m    491\u001b[0m         request\u001b[39m.\u001b[39;49mheaders, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mheaders, dict_class\u001b[39m=\u001b[39;49mCaseInsensitiveDict\n\u001b[1;32m    492\u001b[0m     ),\n\u001b[1;32m    493\u001b[0m     params\u001b[39m=\u001b[39;49mmerge_setting(request\u001b[39m.\u001b[39;49mparams, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mparams),\n\u001b[1;32m    494\u001b[0m     auth\u001b[39m=\u001b[39;49mmerge_setting(auth, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mauth),\n\u001b[1;32m    495\u001b[0m     cookies\u001b[39m=\u001b[39;49mmerged_cookies,\n\u001b[1;32m    496\u001b[0m     hooks\u001b[39m=\u001b[39;49mmerge_hooks(request\u001b[39m.\u001b[39;49mhooks, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhooks),\n\u001b[1;32m    497\u001b[0m )\n\u001b[1;32m    498\u001b[0m \u001b[39mreturn\u001b[39;00m p\n",
      "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/models.py:368\u001b[0m, in \u001b[0;36mPreparedRequest.prepare\u001b[0;34m(self, method, url, headers, files, data, params, auth, cookies, hooks, json)\u001b[0m\n\u001b[1;32m    365\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Prepares the entire request with the given parameters.\"\"\"\u001b[39;00m\n\u001b[1;32m    367\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_method(method)\n\u001b[0;32m--> 368\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprepare_url(url, params)\n\u001b[1;32m    369\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_headers(headers)\n\u001b[1;32m    370\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_cookies(cookies)\n",
      "File \u001b[0;32m/usr/lib/python3.10/site-packages/requests/models.py:439\u001b[0m, in \u001b[0;36mPreparedRequest.prepare_url\u001b[0;34m(self, url, params)\u001b[0m\n\u001b[1;32m    436\u001b[0m     \u001b[39mraise\u001b[39;00m InvalidURL(\u001b[39m*\u001b[39me\u001b[39m.\u001b[39margs)\n\u001b[1;32m    438\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m scheme:\n\u001b[0;32m--> 439\u001b[0m     \u001b[39mraise\u001b[39;00m MissingSchema(\n\u001b[1;32m    440\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mInvalid URL \u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m!r}\u001b[39;00m\u001b[39m: No scheme supplied. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    441\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mPerhaps you meant http://\u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m}\u001b[39;00m\u001b[39m?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    442\u001b[0m     )\n\u001b[1;32m    444\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m host:\n\u001b[1;32m    445\u001b[0m     \u001b[39mraise\u001b[39;00m InvalidURL(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mInvalid URL \u001b[39m\u001b[39m{\u001b[39;00murl\u001b[39m!r}\u001b[39;00m\u001b[39m: No host supplied\u001b[39m\u001b[39m\"\u001b[39m)\n",
      "\u001b[0;31mMissingSchema\u001b[0m: Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?"
     ]
    }
   ],
   "source": [
-    "analysis_results = pd.read_pickle('analysis_results.pkl')\n",
+    "download_dataTMPsets()"
    "\n",
    "omega_results = pd.DataFrame(columns=['Graph', 'omega'])\n",
    "\n",
    "for G in checkins_graphs:\n",
    "    print(G.name)\n",
    "    C_og = analysis_results.loc[analysis_results['Graph'] == G.name, 'Average Clustering Coefficient'].values[0]\n",
    "    L_og = analysis_results.loc[analysis_results['Graph'] == G.name, 'Average Shortest Path Length'].values[0]\n",
    "\n",
    "    omega = omega(G, C_og, L_og, 2, 3)\n",
    "    \n",
    "    omega_results = omega_results.append({\n",
    "        'Graph': G.name,\n",
    "        'omega': omega\n",
    "    }, ignore_index=True)"
   ]
  },
  {
@ -390,17 +124,99 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "for G in friendships_graphs:\n",
    "    print(G.name)\n",
    "    C_og = analysis_results.loc[analysis_results['Graph'] == G.name, 'Average Clustering Coefficient'].values[0]\n",
    "    L_og = analysis_results.loc[analysis_results['Graph'] == G.name, 'Average Shortest Path Length'].values[0]\n",
    "\n",
-    "    omega = omega(G, C_og, L_og, 2, 3)\n",
+    "def download_dataTMPsets():\n",
-    "    \n",
+    "\n",
-    "    omega_results = omega_results.append({\n",
+    "    urls = [\n",
-    "        'Graph': G.name,\n",
+    "        [\"https://snap.stanford.edu/dataTMP/loc-brightkite_edges.txt.gz\", \"https://snap.stanford.edu/dataTMP/loc-brightkite_totalCheckins.txt.gz\"],\n",
-    "        'omega': omega\n",
+    "        [\"https://snap.stanford.edu/dataTMP/loc-gowalla_edges.txt.gz\", \"https://snap.stanford.edu/dataTMP/loc-gowalla_totalCheckins.txt.gz\"],\n",
-    "    }, ignore_index=True)"
+    "        [\"https://drive.google.com/file/d/1PNk3zY8NjLcDiAbzjABzY5FiPAFHq6T8/view?usp=sharing\"]\n",
    "    ]\n",
    "\n",
    "    folders = [\"brightkite\", \"gowalla\", \"foursquare\"]\n",
    "\n",
    "    if not os.path.exists(\"dataTMP\"):\n",
    "        os.mkdir(\"dataTMP\")\n",
    "\n",
    "    for folder in folders:\n",
    "        if not os.path.exists(os.path.join(\"dataTMP\", folder)):\n",
    "            os.mkdir(os.path.join(\"dataTMP\", folder))\n",
    "\n",
    "    # Download every URL into its respective folder. The last one is a Google Drive link, so it has to be fetched with gdown. If the file is already downloaded, skip the download.\n",
    "\n",
    "    for i in range(len(urls)):\n",
    "        for url in urls[i]:\n",
    "            if not os.path.exists(os.path.join(\"dataTMP\", folders[i], url.split(\"/\")[-1])):\n",
    "                if i == 2:\n",
    "                    output = os.path.join(\"dataTMP\", folders[i], \"something.zip\")\n",
    "                    gdown.download(url, output, quiet=False, fuzzy=True)\n",
    "                else:\n",
    "                    wget.download(url, os.path.join(\"dataTMP\", folders[i]))\n",
    "\n",
    "download_dataTMPsets()\n",
    "    # # unzip all the files in the 3 folders. Then remove the .gz or .zip files\n",
    "\n",
    "    # for folder in folders:\n",
    "        # for file in os.listdir(os.path.join(\"dataTMP\", folder)):\n",
    "            # print(folder, file)\n",
    "            # if file.endswith(\".gz\"):\n",
    "                # os.system(\"gunzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n",
    "            # elif file.endswith(\".zip\"):\n",
    "                # os.system(\"unzip {}\".format(os.path.join(\"dataTMP\", folder, file)))\n",
    "                # os.remove(os.path.join(\"dataTMP\", folder, file))\n",
    "\n",
    "    # # take all the .txt files from dataTMP/foursquare/dataTMPset_WWW2019 and move them to dataTMP/foursquare\n",
    "\n",
    "    # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\")):\n",
    "        # if file.endswith(\".txt\"):\n",
    "            # os.rename(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\", file), os.path.join(\"dataTMP\", \"foursquare\", file))\n",
    "\n",
    "    # # remove the dataTMPset_WWW2019 folder; note that it is not empty\n",
    "    # # os.rmdir(os.path.join(\"dataTMP\", \"foursquare\", \"dataTMPset_WWW2019\"))\n",
    "\n",
    "    # for file in [\"dataTMPset_WWW_friendship_old.txt\", \"dataTMPset_WWW_readme.txt\", \"raw_Checkins_anonymized.txt\", \"raw_POIs.txt\"]:\n",
    "        # os.remove(os.path.join(\"dataTMP\", \"foursquare\", file))\n",
    "\n",
    "    # # Now we want to clean our dataTMP and rename the files.\n",
    "\n",
    "    # for file in os.listdir(os.path.join(\"dataTMP\", \"brightkite\")):\n",
    "        # if file.endswith(\"_edges.txt\"):\n",
    "            # os.rename(os.path.join(\"dataTMP\", \"brightkite\", file), os.path.join(\"dataTMP\", \"brightkite\", \"brightkite_friends_edges.txt\"))\n",
    "\n",
    "    # for file in os.listdir(os.path.join(\"dataTMP\", \"gowalla\")):\n",
    "        # if file.endswith(\"_edges.txt\"):\n",
    "            # os.rename(os.path.join(\"dataTMP\", \"gowalla\", file), os.path.join(\"dataTMP\", \"gowalla\", \"gowalla_friends_edges.txt\"))\n",
    "\n",
    "    # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\")):\n",
    "        # if file.endswith(\"dataTMPset_WWW_friendship_new.txt\"):\n",
    "            # os.rename(os.path.join(\"dataTMP\", \"foursquare\", file), os.path.join(\"dataTMP\", \"foursquare\", \"foursquare_friends_edges.txt\"))\n",
    "\n",
    "    # # Now, from the _totalCheckins.txt files, we want to keep only the first and last columns, which are the user ID and the venue ID. We also want to remove the header of the file.\n",
    "\n",
    "    # for file in os.listdir(os.path.join(\"dataTMP\", \"brightkite\")):\n",
    "        # if file.endswith(\"_totalCheckins.txt\"):\n",
    "            # df = pd.read_csv(os.path.join(\"dataTMP\", \"brightkite\", file), sep=\"\\t\", header=None, names=[\"user_id\", \"check-in time\", \"latitude\", \"longitude\", \"venue_id\"])\n",
    "            # df[\"check-in time\"] = pd.to_datetime(df[\"check-in time\"])\n",
    "            # df = df[df[\"check-in time\"].dt.year == 2010]\n",
    "            # df = df.drop([\"check-in time\", \"latitude\", \"longitude\"], axis=1)\n",
    "            # df.to_csv(os.path.join(\"dataTMP\", \"brightkite\", \"brightkite_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n",
    "            # os.remove(os.path.join(\"dataTMP\", \"brightkite\", file))\n",
    "\n",
    "    # for file in os.listdir(os.path.join(\"dataTMP\", \"gowalla\")):\n",
    "        # if file.endswith(\"_totalCheckins.txt\"):\n",
    "            # df = pd.read_csv(os.path.join(\"dataTMP\", \"gowalla\", file), sep=\"\\t\", header=None, names=[\"user_id\", \"check-in time\", \"latitude\", \"longitude\", \"venue_id\"])\n",
    "            # df[\"check-in time\"] = pd.to_datetime(df[\"check-in time\"])\n",
    "            # df = df[df[\"check-in time\"].dt.year == 2010]\n",
    "            # df = df.drop([\"check-in time\", \"latitude\", \"longitude\"], axis=1)\n",
    "            # df.to_csv(os.path.join(\"dataTMP\", \"gowalla\", \"gowalla_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n",
    "            # os.remove(os.path.join(\"dataTMP\", \"gowalla\", file))\n",
    "\n",
    "    # for file in os.listdir(os.path.join(\"dataTMP\", \"foursquare\")):\n",
    "        # if file.endswith(\"dataTMPset_WWW_Checkins_anonymized.txt\"):\n",
    "            # df = pd.read_csv(os.path.join(\"dataTMP\", \"foursquare\", file), sep=\"\\t\", header=None)\n",
    "            # df = df[[0, 1]]\n",
    "            # df.to_csv(os.path.join(\"dataTMP\", \"foursquare\", \"foursquare_checkins.txt\"), sep=\"\\t\", header=False, index=False, errors=\"ignore\", encoding=\"utf-8\")\n",
    "            # os.remove(os.path.join(\"dataTMP\", \"foursquare\", file))\n"
   ]
  }
 ],
--- a/utils.py
+++ b/utils.py
@ -126,7 +126,7 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq
    Parameters
    ----------
-    `dataset` : Literal['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT']
+    `dataset` : Literal['brightkite', 'gowalla', 'foursquare']
        The dataset to use.
    `create_file` : bool, optional
        If True, the graph is saved in a file, by default True
@ -142,84 +142,37 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq
    """
-    if dataset not in ['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT']:
+    if dataset not in ['brightkite', 'gowalla', 'foursquare']:
-        raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquareEU, foursquareUS, foursquareIT")
+        raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquare")
    if dataset in ['brightkite', 'gowalla']:
        file = os.path.join("data", dataset, dataset + "_checkins.txt")
-        print("\nCreating the graph for the dataset {}...".format(dataset))
+    file = os.path.join("data", dataset, dataset + "_checkins.txt")
-        df = pd.read_csv(file, sep="\t", header=None, names=["user_id", "venue_id"])
+    print("\nCreating the graph for the dataset {}...".format(dataset))
-        G = nx.Graph()
+    df = pd.read_csv(file, sep="\t", header=None, names=["user_id", "venue_id"], engine='pyarrow')
        venues_users = df.groupby("venue_id")["user_id"].apply(set)
-        for users in tqdm.tqdm(venues_users):
+    G = nx.Graph()
-            for user1, user2 in combinations(users, 2):
+    venues_users = df.groupby("venue_id")["user_id"].apply(set)
                G.add_edge(user1, user2)
-        # path to the file where we want to save the graph
+    for users in tqdm.tqdm(venues_users):
-        edges_path = os.path.join("data", dataset , dataset + "_checkins_edges.tsv")
+        for user1, user2 in combinations(users, 2):
            G.add_edge(user1, user2)
-        print("Done! The graph has {} edges".format(G.number_of_edges()), " and {} nodes".format(G.number_of_nodes()))
+    # path to the file where we want to save the graph
    edges_path = os.path.join("data", dataset , dataset + "_checkins_edges.tsv")
-        # delete from memory the dataframe
+    print("Done! The graph has {} edges".format(G.number_of_edges()), " and {} nodes".format(G.number_of_nodes()))
        del df
-        if create_file:
+    # delete from memory the dataframe
-            # save the graph in a file
+    del df
            nx.write_edgelist(G, edges_path, data=True, delimiter="\t", encoding="utf-8")
-        return G
+    if create_file:
        # save the graph in a file
        nx.write_edgelist(G, edges_path, data=True, delimiter="\t", encoding="utf-8")
-    else:
+    return G
        # path to the checkins file and the POIS file
        path_checkins = os.path.join("data", "foursquare", "foursquare_checkins.txt")
        path_POIS = os.path.join("data", "foursquare", "raw_POIs.txt")
        # dataframe with the checkins, we need only the user_id and the venue_id
        df_all = pd.read_csv(path_checkins, sep="\t", header=None, names=['user_id', 'venue_id', 'time', 'offset'])
        df_all = df_all[['user_id', 'venue_id']]
        # dataframe with the POIS, we need only the venue_id and the country code
        df_POIS = pd.read_csv(path_POIS, sep='\t', header=None, names=['venue_id', 'lat', 'lon', 'category', 'country code'])
        df_POIS = df_POIS[['venue_id', 'country code']]
        if dataset == "foursquareIT":
            venues_array = df_POIS[df_POIS['country code'] == 'IT']['venue_id'].values
        elif dataset == "foursquareEU":
            # list of the countries in the EU
            EU_countries = ['AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK']
            venues_array = df_POIS[df_POIS['country code'].isin(EU_countries)]['venue_id'].values
        print("\nCreating the graph for the dataset {}...".format(dataset))
        # we create a dataframe with the checkins in the corresponding country
        df_country = df_all[df_all['venue_id'].isin(venues_array)]
        G = nx.Graph()
        venues_users = df_country.groupby("venue_id")["user_id"].apply(set)
        for users in tqdm.tqdm(venues_users):
            for user1, user2 in combinations(users, 2):
                G.add_edge(user1, user2)
        # path to the file where we want to save the graph
        edges_path = os.path.join("data", "foursquare", dataset + "_checkins_edges.tsv")
        print("Done! The graph has {} edges".format(G.number_of_edges()), " and {} nodes".format(G.number_of_nodes()))
        # delete from memory the dataframes
        del df_all, df_POIS, df_country
        if create_file:
            # save the graph in a file
            nx.write_edgelist(G, edges_path, data=True, delimiter="\t", encoding="utf-8")
        return G
 # ------------------------------------------------------------------------#
@ -244,56 +197,28 @@ def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursqua
    Since we are taking sub-samples of each check-in dataset, we are also taking sub-samples of the friendship graph. A user is included in the friendship graph if they have at least one check-in in the sub-sample.
    """
-    if dataset not in ["brightkite", "gowalla", "foursquareEU", "foursquareIT"]:
+    if dataset not in ["brightkite", "gowalla", "foursquare"]:
        raise ValueError("The dataset must be brightkite, gowalla or foursquare")
    if dataset in ["foursquareEU", "foursquareIT"]:
        file = os.path.join("data", "foursquare", "foursquare_friends_edges.txt")
        # dataframe with the edges of the graph (friends)
        df_friends_all = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
        # set of the unique users in the graph (friends)
        unique_friends = set(df_friends_all["node1"].unique()).union(set(df_friends_all["node2"].unique()))
        # dataframe with the edges of the graph (checkins)
        df_checkins = pd.read_csv(os.path.join("data", "foursquare", dataset + "_checkins_edges.tsv"), sep="\t", header=None, names=["node1", "node2"])
        unique_checkins = set(df_checkins["node1"].unique()).union(set(df_checkins["node2"].unique()))
        # take the intersection of the two sets
        unique_users = unique_friends.intersection(unique_checkins)
-        # create a dataframe with the edges of the graph
+    file = os.path.join("data", dataset, dataset + "_friends_edges.txt")
        df = df_friends_all[df_friends_all["node1"].isin(unique_users) & df_friends_all["node2"].isin(unique_users)]
-        # create a tsv file with the edges of the graph that ends with _filtered.tsv
+    df_friends_all = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"], engine='pyarrow')
-        df.to_csv(os.path.join("data", "foursquare", dataset + "_friends_edges_filtered.tsv"), sep="\t", header=False, index=False)
+    unique_friends = set(df_friends_all["node1"].unique()).union(set(df_friends_all["node2"].unique()))
-        # create the graph
+    df_checkins = pd.read_csv(os.path.join("data", dataset, dataset + "_checkins_edges.tsv"), sep="\t", header=None, names=["node1", "node2"])
-        G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
+    unique_checkins = set(df_checkins["node1"].unique()).union(set(df_checkins["node2"].unique()))
        del df_friends_all, df_checkins, df
-        return G
+    unique_users = unique_friends.intersection(unique_checkins)
-    elif dataset in ["brightkite", "gowalla"]:
+    df = df_friends_all[df_friends_all["node1"].isin(unique_users) & df_friends_all["node2"].isin(unique_users)]
        file = os.path.join("data", dataset, dataset + "_friends_edges.txt")
-        df_friends_all = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
+    df.to_csv(os.path.join("data", dataset, dataset + "_friends_edges_filtered.tsv"), sep="\t", header=False, index=False)
        unique_friends = set(df_friends_all["node1"].unique()).union(set(df_friends_all["node2"].unique()))
-        df_checkins = pd.read_csv(os.path.join("data", dataset, dataset + "_checkins_edges.tsv"), sep="\t", header=None, names=["node1", "node2"])
+    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
-        unique_checkins = set(df_checkins["node1"].unique()).union(set(df_checkins["node2"].unique()))
+    del df_friends_all, df_checkins, df
-        unique_users = unique_friends.intersection(unique_checkins)
+    return G
        df = df_friends_all[df_friends_all["node1"].isin(unique_users) & df_friends_all["node2"].isin(unique_users)]
        df.to_csv(os.path.join("data", dataset, dataset + "_friends_edges_filtered.tsv"), sep="\t", header=False, index=False)
        G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
        del df_friends_all, df_checkins, df
        return G
 # ------------------------------------------------------------------------#
@ -484,7 +409,7 @@ def average_shortest_path(G: nx.Graph, k=None) -> float:
        if k is not None and (k < 0 or k > 1):
            raise ValueError("k must be between 0 and 1")
        elif k is None:
-            G = G.copy()
+            G_copy = G.copy()
            connected_components = list(nx.connected_components(G))
        else:
            G_copy = G.copy()