// It receives the file name as a string and returns a dictionary with the keys being the UserID and the values being a vector of the VenueIDs associated with that UserID.
- Why do I use os.path.join and not "/"? Because it's more portable: it works on every OS, while "/" works only on Linux and Mac. If you want to use it on Windows, you have to change all the "/" to "\". With os.path.join you don't have to worry about it and, as always, f*** Microsoft.
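  For example, a minimal illustration:

      import os
      path = os.path.join("data", "brightkite")  # "data/brightkite" on Linux/Mac, "data\brightkite" on Windows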
# Download every url in urls[0] into the brightkite folder, every url in urls[1] into the gowalla folder, and every url in urls[2] into the foursquare folder. At each iteration, check whether the file already exists: if it does, skip the download and print a message; if not, download the file and print a message.
for i in range(len(urls)):
    for url in urls[i]:
# check if there are .txt files inside the folder; if so, skip the download
print("The {} dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder".format(folders[i]))
break
# check if there are .gz files inside the folder; if so, skip the download
print("The {} dataset is already downloaded as .gz file, if you want to download again the .gz file with this function, delete the .gz files in the folder".format(folders[i]))
break
# if there are no .txt or .gz files, download the file
# if there are no .txt files inside the brightkite folder, unzip the .gz files
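# A minimal sketch of the extraction step (illustrative, not the module's exact code; gz_path is a hypothetical variable holding the .gz file path):
#
#     import gzip, shutil
#     with gzip.open(gz_path, "rb") as fin, open(gz_path[:-3], "wb") as fout:
#         shutil.copyfileobj(fin, fout)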
# Now we want to clean our data. For both brightkite and gowalla, we want to rename the _edges files as "brightkite_friends_edges.txt" and "gowalla_friends_edges.txt"
# Now, from the _totalCheckins.txt files, we want to keep only the first and last columns, which are the user ID and the venue ID. We also want to remove the header of the file. Use pandas to do that. Then rename the files as "brightkite_checkins_edges.txt" and "gowalla_checkins_edges.txt"
# For foursquare we want to keep only the first and second columns, which are the user ID and the venue ID. We also want to remove the header of the file. Use pandas to do that, for both the _NYC.txt and _TKY.txt files. Then rename the files as "foursquare_checkins_edges_NYC.txt" and "foursquare_checkins_edges_TKY.txt"
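# A minimal sketch of this step (illustrative, not the module's exact code):
#
#     import pandas as pd
#
#     df = pd.read_csv("data/brightkite/loc-brightkite_totalCheckins.txt", sep="\t")
#     df = df.iloc[:, [0, -1]]  # keep only the first and last columns: user ID and venue ID
#     df.to_csv("data/brightkite/brightkite_checkins_edges.txt", sep="\t", header=False, index=False)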
This function takes in input a tsv file with two columns; each line in the file is an edge. The function returns an undirected networkx graph object. It uses pandas to read the file since it's faster than the standard python open() function. If we don't want to use pandas, the following code works as well:
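    (a sketch, assuming `file` is the path passed to the function and networkx is imported as `nx`)

        G = nx.Graph()
        with open(file, "r") as f:
            for line in f:
                u, v = line.strip().split("\t")
                G.add_edge(u, v)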
"We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n",
"We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n",
"\n",
"\n",
"```\n",
"```\n",
"data/\n",
"├── brightkite\n",
"├── brightkite\n",
"│ ├── loc-brightkite_edges.txt.gz\n",
"│ ├── brightkite_checkins.txt\n",
"│ ├── loc-brightkite_totalCheckins.txt.gz\n",
"│ └── brightkite_friends_edges.txt\n",
"├── foursquare\n",
"├── foursquare\n",
"│ ├── loc-gowalla_edges.txt.gz\n",
"│ ├── foursquare_checkins_NYC.txt\n",
"│ ├── loc-gowalla_totalCheckins.txt.gz\n",
"│ ├── foursquare_checkins_TKY.txt\n",
"└── gowalla\n",
"└── gowalla\n",
" ├── dataset_ubicomp2013_checkins.txt\n",
" ├── gowalla_checkins.txt\n",
" ├── dataset_ubicomp2013_tags.txt\n",
" └── gowalla_friends_edges.txt\n",
" └── dataset_ubicomp2013_tips.txt\n",
"```\n",
"```\n",
"\n",
"\n",
"If any of the datasets is already downloaded, it will not be downloaded again. For futher details about the function below, please refer to the `utils` module."
"If any of the datasets is already downloaded, it will not be downloaded again. For further details about the function below, please refer to the `utils` module.\n",
"\n",
"> NOTE: the Stanford servers tends to be slow, so it may take a while to download the datasets. It's gonna take about 2 to 3 minutes to download all the datasets."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 2,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"The brightkite dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
"The gowalla dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
"The foursquare dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n"
]
}
],
"source": [
"source": [
"download_datasets()"
"download_datasets()"
]
]
},
{
"attachments": {},
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {},
"source": [
"source": [
"\n",
"\n",
"## Brightkite\n",
"## Brightkite\n",
"\n",
"\n",
"[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. The network was originally directed but the authors of the dataset have constructed a network with undirected edges when there is a friendship in both ways. They also have also collected a total of `4491143` checking of these users over the period of Apr. 2008 - Oct. 2010.\n",
"[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. We will work with two different datasets:\n",
"\n",
"\n",
"Here is an example of check-in information"
"- `data/brightkite/brightkite_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids\n",
"- `data/brightkite/brightkite_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list, in the next section we will see how to convert it into a graph."
"Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. As for Brightkite, we will work with two different datasets:\n",
"\n",
"\n",
"Brightkite_df.head()"
"- `data/gowalla/gowalla_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids\n",
"- `data/gowalla/gowalla_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list, in the next section we will see how to convert it into a graph."
]
]
},
},
{
{
"attachments": {},
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {},
"source": [
"source": [
"## Gowalla\n",
"## Foursquare\n",
"\n",
"[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 10 months) check-in data in New York city and Tokyo collected from Foursquare from 12 April 2012 to 16 February 2013. It contains two files in tsv format. Each file contains 2 columns, which are:\n",
"\n",
"\n",
"Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. The authors have collected a total of `6442890` check-ins of these users over the period of Feb. 2009 - Oct. 2010.\n",
"1. User ID (anonymized)\n",
"2. Venue ID (Foursquare)\n",
"\n",
"\n",
"Here is an example of check-in information"
"In this case, we don't have any information about the friendship network, so we will only work with the checkins."
]
]
},
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"We are asked to construct the networks for the three datasets as undirected graphs $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicate that two individuals visited the same location at least once.\n",
"\n",
"And this is where the fun begins! The check-ins files of the three datasets are not in the form of a graph edge list, so we need to manipulate them. But those datasets are huge! Let's have a look at the number of lines of each file.\n",
"\n",
"We would like to build a graph starting from an edge list. So the basic idea is to create a dictionary where the keys are the unique users and the values are the locations that they visited. Then, we can iterate over the dictionary and create the edges.\n",
"\n",
"But, even if we avoid repetitions, the time complexity will be $O(n^2)$, where $n$ is the number of users. And since $n$ is in the order of millions, doing this in Python, where we have to build nested for loops, is a no-go. We need to find a faster way to do this.\n",
"\n",
"The `utils` module nevertheless provides a function that does exactly this, but I do not recommend using it unless you have countless hours of spare time. It's called `create_checkins_graph_SLOW` and it takes a dataset name as input and returns a networkx graph object.\n",
"\n",
"SCRIVERE QUALCOSA RIGUARDO LA FUNZIONE IN C++\n",
"\n",
"\n",
"del Brightkite_df\n",
"The function will output a new .tsv file in the form of an edge list, in the `data` folder. Since the C++ program needs to be compiled, I have already created the edge lists for the four datasets, so you can skip this step if you want.\n",
"del Gowalla_df\n",
"\n",
"del foursquare_NYC_df"
"Once that we have our edge list, we can build the graph using the function `checkins_graph_from_edges` from the `utils` module. It takes as input the name of the dataset and returns a networkx graph object. The options are\n",
"We are asked to construct the networks for the three datasets as un undirected grah $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicates that two individuals visited the same location at least once.\n",
"Now that we have our graphs, let's have a look at some basic information about them"
"\n",
"We can use the fucntion create_graph from the `utils` module to create the networks. It takes as input the path to an edge list file and returns a networkx graph object. For further details about the function below, please refer to the `utils` module."
]
]
},
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of nodes added to the graph brightkite: 51406\n"
]
}
],
"source": [
" print('Number of nodes: ', G.number_of_nodes())\n",
" print('Number of edges: ', G.number_of_edges())\n",
" print()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see, the foursquare dataset has a very small number of nodes. Even though it has 227428 check-ins, the unique users (the nodes) are only 1083. The Tokyo dataset is about 2 times bigger, with 537703 check-ins and 2294 nodes. Since we are in the same order of magnitude, we will focus on the New York dataset, in the style of a classic Hollywood movie about alien invasions."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Friendship network\n",
"\n",
"If we want to build the friendship network, fortunately for the gowalla and brightkite datasets we have the edge list, so we can just use the `read_edgelist` function from networkx. For the foursquare dataset, we don't have any information about the friendship of the users, so we will just create a graph with the checkins.\n",
"\n",
"To build the friendship network of the first two datasets, we can use the `create_friends_graph` function from the `utils` module. It takes a dataset name as input and returns a networkx graph object. The implementation is pretty straightforward, we just use the `from_pandas_edgelist` function from networkx."
"As we can see, the foursquare dataset has a very small number of nodes. Even tho it has 227428 check-ins, the unique users (the nodes) are only 1083. The Tokyo dataset is about 2 times bigger, with 537703 check-ins and 2294 nodes. Since we are in the same order of magnitude, we will focus on the New York dataset, in the style of a classic Hollywood movie about aliens invasions."
"Now that we have our graphs, let's have a look at some basic information about them"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Brightkite Friendship Graph\n",
"Number of nodes: 58228\n",
"Number of edges: 214078\n",
"\n",
"Gowalla Friendship Graph\n",
"Number of nodes: 196591\n",
"Number of edges: 950327\n",
"\n"
]
}
],
"source": [
"source": [
"# Analysis of the structure of the networks"
"for G in [G_brighkite_friends, G_gowalla_friends]:\n",
" print(G.name)\n",
" print('Number of nodes: ', G.number_of_nodes())\n",
" print('Number of edges: ', G.number_of_edges())\n",